diff --git a/Assembler/KeplerAs/Install.sh b/Assembler/KeplerAs/Install.sh
new file mode 100755
index 0000000..57c8d24
--- /dev/null
+++ b/Assembler/KeplerAs/Install.sh
@@ -0,0 +1,3 @@
+perl Makefile.PL &&   # generate the Makefile; stop here if configuration fails
+make &&               # build into ./blib; never install a failed build
+sudo make install     # install via the generated Makefile's install target (run as root)
diff --git a/Assembler/KeplerAs/Install_locally.sh b/Assembler/KeplerAs/Install_locally.sh
new file mode 100755
index 0000000..79be922
--- /dev/null
+++ b/Assembler/KeplerAs/Install_locally.sh
@@ -0,0 +1,6 @@
+perl Makefile.PL &&   # generate the Makefile; skip the build if configuration fails
+make                  # build into ./blib only; nothing is installed
+
+# To use the build in place, add the following exports to ~/.bashrc (adjust the paths to your checkout), then run: source ~/.bashrc
+#export PERL5LIB=/home/xiuxia/PPoPP2017_artifact/KeplerAs/blib/lib/:$PERL5LIB
+#export PATH=/home/xiuxia/PPoPP2017_artifact/KeplerAs/blib/script:$PATH
diff --git a/Assembler/KeplerAs/LICENSE b/Assembler/KeplerAs/LICENSE
new file mode 100644
index 0000000..2c9314c
--- /dev/null
+++ b/Assembler/KeplerAs/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Scott Gray
+Copyright (c) 2015~2016 Xiuxia Zhang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Assembler/KeplerAs/MYMETA.json b/Assembler/KeplerAs/MYMETA.json
new file mode 100644
index 0000000..cb97ff4
--- /dev/null
+++ b/Assembler/KeplerAs/MYMETA.json
@@ -0,0 +1,42 @@
+{
+   "abstract" : "Assembler for NVIDIA Maxwell architecture",
+   "author" : [
+      "Xiuxia Zhang <zhangxiuxia1@gmail.com>"
+   ],
+   "dynamic_config" : 0,
+   "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001",
+   "license" : [
+      "mit"
+   ],
+   "meta-spec" : {
+      "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
+      "version" : "2"
+   },
+   "name" : "KeplerAs-KeplerAs",
+   "no_index" : {
+      "directory" : [
+         "t",
+         "inc"
+      ]
+   },
+   "prereqs" : {
+      "build" : {
+         "requires" : {
+            "ExtUtils::MakeMaker" : "0"
+         }
+      },
+      "configure" : {
+         "requires" : {
+            "ExtUtils::MakeMaker" : "0"
+         }
+      },
+      "runtime" : {
+         "requires" : {
+            "Carp" : "1.29",
+            "Data::Dumper" : "2.145"
+         }
+      }
+   },
+   "release_status" : "stable",
+   "version" : "1.06"
+}
diff --git a/Assembler/KeplerAs/MYMETA.yml b/Assembler/KeplerAs/MYMETA.yml
new file mode 100644
index 0000000..7a0496d
--- /dev/null
+++ b/Assembler/KeplerAs/MYMETA.yml
@@ -0,0 +1,23 @@
+---
+abstract: 'Assembler for NVIDIA Maxwell architecture'
+author:
+  - 'Xiuxia Zhang <zhangxiuxia1@gmail.com>'
+build_requires:
+  ExtUtils::MakeMaker: '0'
+configure_requires:
+  ExtUtils::MakeMaker: '0'
+dynamic_config: 0
+generated_by: 'ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001'
+license: mit
+meta-spec:
+  url: http://module-build.sourceforge.net/META-spec-v1.4.html
+  version: '1.4'
+name: KeplerAs-KeplerAs
+no_index:
+  directory:
+    - t
+    - inc
+requires:
+  Carp: '1.29'
+  Data::Dumper: '2.145'
+version: '1.06'
diff --git a/Assembler/KeplerAs/Makefile b/Assembler/KeplerAs/Makefile
new file mode 100644
index 0000000..f6dc8a8
--- /dev/null
+++ b/Assembler/KeplerAs/Makefile
@@ -0,0 +1,878 @@
+# This Makefile is for the KeplerAs::KeplerAs extension to perl.
+#
+# It was generated automatically by MakeMaker version
+# 7.0401 (Revision: 70401) from the contents of
+# Makefile.PL. Don't edit this file, edit Makefile.PL instead.
+#
+#       ANY CHANGES MADE HERE WILL BE LOST!
+#
+#   MakeMaker ARGV: ()
+#
+
+#   MakeMaker Parameters:
+
+#     ABSTRACT_FROM => q[lib/KeplerAs/KeplerAs.pm]
+#     AUTHOR => [q[Xiuxia Zhang <zhangxiuxia1@gmail.com>]]
+#     BUILD_REQUIRES => {  }
+#     CONFIGURE_REQUIRES => {  }
+#     EXE_FILES => [q[bin/KeplerAs.pl]]
+#     LICENSE => q[MIT]
+#     NAME => q[KeplerAs::KeplerAs]
+#     PREREQ_PM => { Carp=>q[1.29], Data::Dumper=>q[2.145] }
+#     TEST_REQUIRES => {  }
+#     VERSION_FROM => q[lib/KeplerAs/KeplerAs.pm]
+
+# --- MakeMaker post_initialize section:
+
+
+# --- MakeMaker const_config section:
+
+# These definitions are from config.sh (via /usr/lib/x86_64-linux-gnu/perl/5.22/Config.pm).
+# They may have been overridden via Makefile.PL or on the command line.
+AR = ar
+CC = x86_64-linux-gnu-gcc
+CCCDLFLAGS = -fPIC
+CCDLFLAGS = -Wl,-E
+DLEXT = so
+DLSRC = dl_dlopen.xs
+EXE_EXT = 
+FULL_AR = /usr/bin/ar
+LD = x86_64-linux-gnu-gcc
+LDDLFLAGS = -shared -L/usr/local/lib -fstack-protector-strong
+LDFLAGS =  -fstack-protector-strong -L/usr/local/lib
+LIBC = libc-2.21.so
+LIB_EXT = .a
+OBJ_EXT = .o
+OSNAME = linux
+OSVERS = 3.16.0
+RANLIB = :
+SITELIBEXP = /usr/local/share/perl/5.22.1
+SITEARCHEXP = /usr/local/lib/x86_64-linux-gnu/perl/5.22.1
+SO = so
+VENDORARCHEXP = /usr/lib/x86_64-linux-gnu/perl5/5.22
+VENDORLIBEXP = /usr/share/perl5
+
+
+# --- MakeMaker constants section:
+AR_STATIC_ARGS = cr
+DIRFILESEP = /
+DFSEP = $(DIRFILESEP)
+NAME = KeplerAs::KeplerAs
+NAME_SYM = KeplerAs_KeplerAs
+VERSION = 1.06
+VERSION_MACRO = VERSION
+VERSION_SYM = 1_06
+DEFINE_VERSION = -D$(VERSION_MACRO)=\"$(VERSION)\"
+XS_VERSION = 1.06
+XS_VERSION_MACRO = XS_VERSION
+XS_DEFINE_VERSION = -D$(XS_VERSION_MACRO)=\"$(XS_VERSION)\"
+INST_ARCHLIB = blib/arch
+INST_SCRIPT = blib/script
+INST_BIN = blib/bin
+INST_LIB = blib/lib
+INST_MAN1DIR = blib/man1
+INST_MAN3DIR = blib/man3
+MAN1EXT = 1p
+MAN3EXT = 3pm
+INSTALLDIRS = site
+DESTDIR = 
+PREFIX = $(SITEPREFIX)
+PERLPREFIX = /usr
+SITEPREFIX = /usr/local
+VENDORPREFIX = /usr
+INSTALLPRIVLIB = /usr/share/perl/5.22
+DESTINSTALLPRIVLIB = $(DESTDIR)$(INSTALLPRIVLIB)
+INSTALLSITELIB = /usr/local/share/perl/5.22.1
+DESTINSTALLSITELIB = $(DESTDIR)$(INSTALLSITELIB)
+INSTALLVENDORLIB = /usr/share/perl5
+DESTINSTALLVENDORLIB = $(DESTDIR)$(INSTALLVENDORLIB)
+INSTALLARCHLIB = /usr/lib/x86_64-linux-gnu/perl/5.22
+DESTINSTALLARCHLIB = $(DESTDIR)$(INSTALLARCHLIB)
+INSTALLSITEARCH = /usr/local/lib/x86_64-linux-gnu/perl/5.22.1
+DESTINSTALLSITEARCH = $(DESTDIR)$(INSTALLSITEARCH)
+INSTALLVENDORARCH = /usr/lib/x86_64-linux-gnu/perl5/5.22
+DESTINSTALLVENDORARCH = $(DESTDIR)$(INSTALLVENDORARCH)
+INSTALLBIN = /usr/bin
+DESTINSTALLBIN = $(DESTDIR)$(INSTALLBIN)
+INSTALLSITEBIN = /usr/local/bin
+DESTINSTALLSITEBIN = $(DESTDIR)$(INSTALLSITEBIN)
+INSTALLVENDORBIN = /usr/bin
+DESTINSTALLVENDORBIN = $(DESTDIR)$(INSTALLVENDORBIN)
+INSTALLSCRIPT = /usr/bin
+DESTINSTALLSCRIPT = $(DESTDIR)$(INSTALLSCRIPT)
+INSTALLSITESCRIPT = /usr/local/bin
+DESTINSTALLSITESCRIPT = $(DESTDIR)$(INSTALLSITESCRIPT)
+INSTALLVENDORSCRIPT = /usr/bin
+DESTINSTALLVENDORSCRIPT = $(DESTDIR)$(INSTALLVENDORSCRIPT)
+INSTALLMAN1DIR = /usr/share/man/man1
+DESTINSTALLMAN1DIR = $(DESTDIR)$(INSTALLMAN1DIR)
+INSTALLSITEMAN1DIR = /usr/local/man/man1
+DESTINSTALLSITEMAN1DIR = $(DESTDIR)$(INSTALLSITEMAN1DIR)
+INSTALLVENDORMAN1DIR = /usr/share/man/man1
+DESTINSTALLVENDORMAN1DIR = $(DESTDIR)$(INSTALLVENDORMAN1DIR)
+INSTALLMAN3DIR = /usr/share/man/man3
+DESTINSTALLMAN3DIR = $(DESTDIR)$(INSTALLMAN3DIR)
+INSTALLSITEMAN3DIR = /usr/local/man/man3
+DESTINSTALLSITEMAN3DIR = $(DESTDIR)$(INSTALLSITEMAN3DIR)
+INSTALLVENDORMAN3DIR = /usr/share/man/man3
+DESTINSTALLVENDORMAN3DIR = $(DESTDIR)$(INSTALLVENDORMAN3DIR)
+PERL_LIB = /usr/share/perl/5.22
+PERL_ARCHLIB = /usr/lib/x86_64-linux-gnu/perl/5.22
+PERL_ARCHLIBDEP = /usr/lib/x86_64-linux-gnu/perl/5.22
+LIBPERL_A = libperl.a
+FIRST_MAKEFILE = Makefile
+MAKEFILE_OLD = Makefile.old
+MAKE_APERL_FILE = Makefile.aperl
+PERLMAINCC = $(CC)
+PERL_INC = /usr/lib/x86_64-linux-gnu/perl/5.22/CORE
+PERL_INCDEP = /usr/lib/x86_64-linux-gnu/perl/5.22/CORE
+PERL = "/usr/bin/perl"
+FULLPERL = "/usr/bin/perl"
+ABSPERL = $(PERL)
+PERLRUN = $(PERL)
+FULLPERLRUN = $(FULLPERL)
+ABSPERLRUN = $(ABSPERL)
+PERLRUNINST = $(PERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
+FULLPERLRUNINST = $(FULLPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
+ABSPERLRUNINST = $(ABSPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
+PERL_CORE = 0
+PERM_DIR = 755
+PERM_RW = 644
+PERM_RWX = 755
+
+MAKEMAKER   = /usr/share/perl/5.22/ExtUtils/MakeMaker.pm
+MM_VERSION  = 7.0401
+MM_REVISION = 70401
+
+# FULLEXT = Pathname for extension directory (eg Foo/Bar/Oracle).
+# BASEEXT = Basename part of FULLEXT. May be just equal FULLEXT. (eg Oracle)
+# PARENT_NAME = NAME without BASEEXT and no trailing :: (eg Foo::Bar)
+# DLBASE  = Basename part of dynamic library. May be just equal BASEEXT.
+MAKE = make
+FULLEXT = KeplerAs/KeplerAs
+BASEEXT = KeplerAs
+PARENT_NAME = KeplerAs
+DLBASE = $(BASEEXT)
+VERSION_FROM = lib/KeplerAs/KeplerAs.pm
+OBJECT = 
+LDFROM = $(OBJECT)
+LINKTYPE = dynamic
+BOOTDEP = 
+
+# Handy lists of source code files:
+XS_FILES = 
+C_FILES  = 
+O_FILES  = 
+H_FILES  = 
+MAN1PODS = 
+MAN3PODS = lib/KeplerAs/KeplerAs.pm
+
+# Where is the Config information that we are using/depend on
+CONFIGDEP = $(PERL_ARCHLIBDEP)$(DFSEP)Config.pm $(PERL_INCDEP)$(DFSEP)config.h
+
+# Where to build things
+INST_LIBDIR      = $(INST_LIB)/KeplerAs
+INST_ARCHLIBDIR  = $(INST_ARCHLIB)/KeplerAs
+
+INST_AUTODIR     = $(INST_LIB)/auto/$(FULLEXT)
+INST_ARCHAUTODIR = $(INST_ARCHLIB)/auto/$(FULLEXT)
+
+INST_STATIC      = 
+INST_DYNAMIC     = 
+INST_BOOT        = 
+
+# Extra linker info
+EXPORT_LIST        = 
+PERL_ARCHIVE       = 
+PERL_ARCHIVEDEP    = 
+PERL_ARCHIVE_AFTER = 
+
+
+TO_INST_PM = lib/KeplerAs/Cubin.pm \
+	lib/KeplerAs/KeplerAs.pm \
+	lib/KeplerAs/KeplerAsGrammar.pm
+
+PM_TO_BLIB = lib/KeplerAs/Cubin.pm \
+	blib/lib/KeplerAs/Cubin.pm \
+	lib/KeplerAs/KeplerAs.pm \
+	blib/lib/KeplerAs/KeplerAs.pm \
+	lib/KeplerAs/KeplerAsGrammar.pm \
+	blib/lib/KeplerAs/KeplerAsGrammar.pm
+
+
+# --- MakeMaker platform_constants section:
+MM_Unix_VERSION = 7.0401
+PERL_MALLOC_DEF = -DPERL_EXTMALLOC_DEF -Dmalloc=Perl_malloc -Dfree=Perl_mfree -Drealloc=Perl_realloc -Dcalloc=Perl_calloc
+
+
+# --- MakeMaker tool_autosplit section:
+# Usage: $(AUTOSPLITFILE) FileToSplit AutoDirToSplitInto
+AUTOSPLITFILE = $(ABSPERLRUN)  -e 'use AutoSplit;  autosplit($$$$ARGV[0], $$$$ARGV[1], 0, 1, 1)' --
+
+
+
+# --- MakeMaker tool_xsubpp section:
+
+
+# --- MakeMaker tools_other section:
+SHELL = /bin/sh
+CHMOD = chmod
+CP = cp
+MV = mv
+NOOP = $(TRUE)
+NOECHO = @
+RM_F = rm -f
+RM_RF = rm -rf
+TEST_F = test -f
+TOUCH = touch
+UMASK_NULL = umask 0
+DEV_NULL = > /dev/null 2>&1
+MKPATH = $(ABSPERLRUN) -MExtUtils::Command -e 'mkpath' --
+EQUALIZE_TIMESTAMP = $(ABSPERLRUN) -MExtUtils::Command -e 'eqtime' --
+FALSE = false
+TRUE = true
+ECHO = echo
+ECHO_N = echo -n
+UNINST = 0
+VERBINST = 0
+MOD_INSTALL = $(ABSPERLRUN) -MExtUtils::Install -e 'install([ from_to => {@ARGV}, verbose => '\''$(VERBINST)'\'', uninstall_shadows => '\''$(UNINST)'\'', dir_mode => '\''$(PERM_DIR)'\'' ]);' --
+DOC_INSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'perllocal_install' --
+UNINSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'uninstall' --
+WARN_IF_OLD_PACKLIST = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'warn_if_old_packlist' --
+MACROSTART = 
+MACROEND = 
+USEMAKEFILE = -f
+FIXIN = $(ABSPERLRUN) -MExtUtils::MY -e 'MY->fixin(shift)' --
+CP_NONEMPTY = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'cp_nonempty' --
+
+
+# --- MakeMaker makemakerdflt section:
+makemakerdflt : all
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker dist section:
+TAR = tar
+TARFLAGS = cvf
+ZIP = zip
+ZIPFLAGS = -r
+COMPRESS = gzip --best
+SUFFIX = .gz
+SHAR = shar
+PREOP = $(NOECHO) $(NOOP)
+POSTOP = $(NOECHO) $(NOOP)
+TO_UNIX = $(NOECHO) $(NOOP)
+CI = ci -u
+RCS_LABEL = rcs -Nv$(VERSION_SYM): -q
+DIST_CP = best
+DIST_DEFAULT = tardist
+DISTNAME = KeplerAs-KeplerAs
+DISTVNAME = KeplerAs-KeplerAs-1.06
+
+
+# --- MakeMaker macro section:
+
+
+# --- MakeMaker depend section:
+
+
+# --- MakeMaker cflags section:
+
+
+# --- MakeMaker const_loadlibs section:
+
+
+# --- MakeMaker const_cccmd section:
+
+
+# --- MakeMaker post_constants section:
+
+
+# --- MakeMaker pasthru section:
+
+PASTHRU = LIBPERL_A="$(LIBPERL_A)"\
+	LINKTYPE="$(LINKTYPE)"\
+	LD="$(LD)"\
+	PREFIX="$(PREFIX)"
+
+
+# --- MakeMaker special_targets section:
+.SUFFIXES : .xs .c .C .cpp .i .s .cxx .cc $(OBJ_EXT)
+
+.PHONY: all config static dynamic test linkext manifest blibdirs clean realclean disttest distdir
+
+
+
+# --- MakeMaker c_o section:
+
+
+# --- MakeMaker xs_c section:
+
+
+# --- MakeMaker xs_o section:
+
+
+# --- MakeMaker top_targets section:
+all :: pure_all manifypods
+	$(NOECHO) $(NOOP)
+
+
+pure_all :: config pm_to_blib subdirs linkext
+	$(NOECHO) $(NOOP)
+
+subdirs :: $(MYEXTLIB)
+	$(NOECHO) $(NOOP)
+
+config :: $(FIRST_MAKEFILE) blibdirs
+	$(NOECHO) $(NOOP)
+
+help :
+	perldoc ExtUtils::MakeMaker
+
+
+# --- MakeMaker blibdirs section:
+blibdirs : $(INST_LIBDIR)$(DFSEP).exists $(INST_ARCHLIB)$(DFSEP).exists $(INST_AUTODIR)$(DFSEP).exists $(INST_ARCHAUTODIR)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists $(INST_SCRIPT)$(DFSEP).exists $(INST_MAN1DIR)$(DFSEP).exists $(INST_MAN3DIR)$(DFSEP).exists
+	$(NOECHO) $(NOOP)
+
+# Backwards compat with 6.18 through 6.25
+blibdirs.ts : blibdirs
+	$(NOECHO) $(NOOP)
+
+$(INST_LIBDIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_LIBDIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_LIBDIR)
+	$(NOECHO) $(TOUCH) $(INST_LIBDIR)$(DFSEP).exists
+
+$(INST_ARCHLIB)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_ARCHLIB)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHLIB)
+	$(NOECHO) $(TOUCH) $(INST_ARCHLIB)$(DFSEP).exists
+
+$(INST_AUTODIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_AUTODIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_AUTODIR)
+	$(NOECHO) $(TOUCH) $(INST_AUTODIR)$(DFSEP).exists
+
+$(INST_ARCHAUTODIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_ARCHAUTODIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHAUTODIR)
+	$(NOECHO) $(TOUCH) $(INST_ARCHAUTODIR)$(DFSEP).exists
+
+$(INST_BIN)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_BIN)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_BIN)
+	$(NOECHO) $(TOUCH) $(INST_BIN)$(DFSEP).exists
+
+$(INST_SCRIPT)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_SCRIPT)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_SCRIPT)
+	$(NOECHO) $(TOUCH) $(INST_SCRIPT)$(DFSEP).exists
+
+$(INST_MAN1DIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_MAN1DIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN1DIR)
+	$(NOECHO) $(TOUCH) $(INST_MAN1DIR)$(DFSEP).exists
+
+$(INST_MAN3DIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_MAN3DIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN3DIR)
+	$(NOECHO) $(TOUCH) $(INST_MAN3DIR)$(DFSEP).exists
+
+
+
+# --- MakeMaker linkext section:
+
+linkext :: $(LINKTYPE)
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker dlsyms section:
+
+
+# --- MakeMaker dynamic_bs section:
+
+BOOTSTRAP =
+
+
+# --- MakeMaker dynamic section:
+
+dynamic :: $(FIRST_MAKEFILE) $(BOOTSTRAP) $(INST_DYNAMIC)
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker dynamic_lib section:
+
+
+# --- MakeMaker static section:
+
+## $(INST_PM) has been moved to the all: target.
+## It remains here for awhile to allow for old usage: "make static"
+static :: $(FIRST_MAKEFILE) $(INST_STATIC)
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker static_lib section:
+
+
+# --- MakeMaker manifypods section:
+
+POD2MAN_EXE = $(PERLRUN) "-MExtUtils::Command::MM" -e pod2man "--"
+POD2MAN = $(POD2MAN_EXE)
+
+
+manifypods : pure_all  \
+	lib/KeplerAs/KeplerAs.pm
+	$(NOECHO) $(POD2MAN) --section=$(MAN3EXT) --perm_rw=$(PERM_RW) -u \
+	  lib/KeplerAs/KeplerAs.pm $(INST_MAN3DIR)/KeplerAs::KeplerAs.$(MAN3EXT) 
+
+
+
+
+# --- MakeMaker processPL section:
+
+
+# --- MakeMaker installbin section:
+
+EXE_FILES = bin/KeplerAs.pl
+
+pure_all :: $(INST_SCRIPT)/KeplerAs.pl
+	$(NOECHO) $(NOOP)
+
+realclean ::
+	$(RM_F) \
+	  $(INST_SCRIPT)/KeplerAs.pl 
+
+$(INST_SCRIPT)/KeplerAs.pl : bin/KeplerAs.pl $(FIRST_MAKEFILE) $(INST_SCRIPT)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists
+	$(NOECHO) $(RM_F) $(INST_SCRIPT)/KeplerAs.pl
+	$(CP) bin/KeplerAs.pl $(INST_SCRIPT)/KeplerAs.pl
+	$(FIXIN) $(INST_SCRIPT)/KeplerAs.pl
+	-$(NOECHO) $(CHMOD) $(PERM_RWX) $(INST_SCRIPT)/KeplerAs.pl
+
+
+
+# --- MakeMaker subdirs section:
+
+# none
+
+# --- MakeMaker clean_subdirs section:
+clean_subdirs :
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker clean section:
+
+# Delete temporary files but do not touch installed files. We don't delete
+# the Makefile here so a later make realclean still has a makefile to use.
+
+clean :: clean_subdirs
+	- $(RM_F) \
+	  $(BASEEXT).bso $(BASEEXT).def \
+	  $(BASEEXT).exp $(BASEEXT).x \
+	  $(BOOTSTRAP) $(INST_ARCHAUTODIR)/extralibs.all \
+	  $(INST_ARCHAUTODIR)/extralibs.ld $(MAKE_APERL_FILE) \
+	  *$(LIB_EXT) *$(OBJ_EXT) \
+	  *perl.core MYMETA.json \
+	  MYMETA.yml blibdirs.ts \
+	  core core.*perl.*.? \
+	  core.[0-9] core.[0-9][0-9] \
+	  core.[0-9][0-9][0-9] core.[0-9][0-9][0-9][0-9] \
+	  core.[0-9][0-9][0-9][0-9][0-9] lib$(BASEEXT).def \
+	  mon.out perl \
+	  perl$(EXE_EXT) perl.exe \
+	  perlmain.c pm_to_blib \
+	  pm_to_blib.ts so_locations \
+	  tmon.out 
+	- $(RM_RF) \
+	  blib 
+	  $(NOECHO) $(RM_F) $(MAKEFILE_OLD)
+	- $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) $(DEV_NULL)
+
+
+# --- MakeMaker realclean_subdirs section:
+realclean_subdirs :
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker realclean section:
+# Delete temporary files (via clean) and also delete dist files
+realclean purge ::  clean realclean_subdirs
+	- $(RM_F) \
+	  $(MAKEFILE_OLD) $(FIRST_MAKEFILE) 
+	- $(RM_RF) \
+	  $(DISTVNAME) 
+
+
+# --- MakeMaker metafile section:
+metafile : create_distdir
+	$(NOECHO) $(ECHO) Generating META.yml
+	$(NOECHO) $(ECHO) '---' > META_new.yml
+	$(NOECHO) $(ECHO) 'abstract: '\''Assembler for NVIDIA Maxwell architecture'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'author:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  - '\''Xiuxia Zhang <zhangxiuxia1@gmail.com>'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'build_requires:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  ExtUtils::MakeMaker: '\''0'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'configure_requires:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  ExtUtils::MakeMaker: '\''0'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'dynamic_config: 1' >> META_new.yml
+	$(NOECHO) $(ECHO) 'generated_by: '\''ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'license: mit' >> META_new.yml
+	$(NOECHO) $(ECHO) 'meta-spec:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  url: http://module-build.sourceforge.net/META-spec-v1.4.html' >> META_new.yml
+	$(NOECHO) $(ECHO) '  version: '\''1.4'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'name: KeplerAs-KeplerAs' >> META_new.yml
+	$(NOECHO) $(ECHO) 'no_index:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  directory:' >> META_new.yml
+	$(NOECHO) $(ECHO) '    - t' >> META_new.yml
+	$(NOECHO) $(ECHO) '    - inc' >> META_new.yml
+	$(NOECHO) $(ECHO) 'requires:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  Carp: '\''1.29'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) '  Data::Dumper: '\''2.145'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'version: '\''1.06'\''' >> META_new.yml
+	-$(NOECHO) $(MV) META_new.yml $(DISTVNAME)/META.yml
+	$(NOECHO) $(ECHO) Generating META.json
+	$(NOECHO) $(ECHO) '{' > META_new.json
+	$(NOECHO) $(ECHO) '   "abstract" : "Assembler for NVIDIA Maxwell architecture",' >> META_new.json
+	$(NOECHO) $(ECHO) '   "author" : [' >> META_new.json
+	$(NOECHO) $(ECHO) '      "Xiuxia Zhang <zhangxiuxia1@gmail.com>"' >> META_new.json
+	$(NOECHO) $(ECHO) '   ],' >> META_new.json
+	$(NOECHO) $(ECHO) '   "dynamic_config" : 1,' >> META_new.json
+	$(NOECHO) $(ECHO) '   "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001",' >> META_new.json
+	$(NOECHO) $(ECHO) '   "license" : [' >> META_new.json
+	$(NOECHO) $(ECHO) '      "mit"' >> META_new.json
+	$(NOECHO) $(ECHO) '   ],' >> META_new.json
+	$(NOECHO) $(ECHO) '   "meta-spec" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '      "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",' >> META_new.json
+	$(NOECHO) $(ECHO) '      "version" : "2"' >> META_new.json
+	$(NOECHO) $(ECHO) '   },' >> META_new.json
+	$(NOECHO) $(ECHO) '   "name" : "KeplerAs-KeplerAs",' >> META_new.json
+	$(NOECHO) $(ECHO) '   "no_index" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '      "directory" : [' >> META_new.json
+	$(NOECHO) $(ECHO) '         "t",' >> META_new.json
+	$(NOECHO) $(ECHO) '         "inc"' >> META_new.json
+	$(NOECHO) $(ECHO) '      ]' >> META_new.json
+	$(NOECHO) $(ECHO) '   },' >> META_new.json
+	$(NOECHO) $(ECHO) '   "prereqs" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '      "build" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '         "requires" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '            "ExtUtils::MakeMaker" : "0"' >> META_new.json
+	$(NOECHO) $(ECHO) '         }' >> META_new.json
+	$(NOECHO) $(ECHO) '      },' >> META_new.json
+	$(NOECHO) $(ECHO) '      "configure" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '         "requires" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '            "ExtUtils::MakeMaker" : "0"' >> META_new.json
+	$(NOECHO) $(ECHO) '         }' >> META_new.json
+	$(NOECHO) $(ECHO) '      },' >> META_new.json
+	$(NOECHO) $(ECHO) '      "runtime" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '         "requires" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '            "Carp" : "1.29",' >> META_new.json
+	$(NOECHO) $(ECHO) '            "Data::Dumper" : "2.145"' >> META_new.json
+	$(NOECHO) $(ECHO) '         }' >> META_new.json
+	$(NOECHO) $(ECHO) '      }' >> META_new.json
+	$(NOECHO) $(ECHO) '   },' >> META_new.json
+	$(NOECHO) $(ECHO) '   "release_status" : "stable",' >> META_new.json
+	$(NOECHO) $(ECHO) '   "version" : "1.06"' >> META_new.json
+	$(NOECHO) $(ECHO) '}' >> META_new.json
+	-$(NOECHO) $(MV) META_new.json $(DISTVNAME)/META.json
+
+
+# --- MakeMaker signature section:
+signature :
+	cpansign -s
+
+
+# --- MakeMaker dist_basics section:
+distclean :: realclean distcheck
+	$(NOECHO) $(NOOP)
+
+distcheck :
+	$(PERLRUN) "-MExtUtils::Manifest=fullcheck" -e fullcheck
+
+skipcheck :
+	$(PERLRUN) "-MExtUtils::Manifest=skipcheck" -e skipcheck
+
+manifest :
+	$(PERLRUN) "-MExtUtils::Manifest=mkmanifest" -e mkmanifest
+
+veryclean : realclean
+	$(RM_F) *~ */*~ *.orig */*.orig *.bak */*.bak *.old */*.old
+
+
+
+# --- MakeMaker dist_core section:
+
+dist : $(DIST_DEFAULT) $(FIRST_MAKEFILE)
+	$(NOECHO) $(ABSPERLRUN) -l -e 'print '\''Warning: Makefile possibly out of date with $(VERSION_FROM)'\''' \
+	  -e '    if -e '\''$(VERSION_FROM)'\'' and -M '\''$(VERSION_FROM)'\'' < -M '\''$(FIRST_MAKEFILE)'\'';' --
+
+tardist : $(DISTVNAME).tar$(SUFFIX)
+	$(NOECHO) $(NOOP)
+
+uutardist : $(DISTVNAME).tar$(SUFFIX)
+	uuencode $(DISTVNAME).tar$(SUFFIX) $(DISTVNAME).tar$(SUFFIX) > $(DISTVNAME).tar$(SUFFIX)_uu
+	$(NOECHO) $(ECHO) 'Created $(DISTVNAME).tar$(SUFFIX)_uu'
+
+$(DISTVNAME).tar$(SUFFIX) : distdir
+	$(PREOP)
+	$(TO_UNIX)
+	$(TAR) $(TARFLAGS) $(DISTVNAME).tar $(DISTVNAME)
+	$(RM_RF) $(DISTVNAME)
+	$(COMPRESS) $(DISTVNAME).tar
+	$(NOECHO) $(ECHO) 'Created $(DISTVNAME).tar$(SUFFIX)'
+	$(POSTOP)
+
+zipdist : $(DISTVNAME).zip
+	$(NOECHO) $(NOOP)
+
+$(DISTVNAME).zip : distdir
+	$(PREOP)
+	$(ZIP) $(ZIPFLAGS) $(DISTVNAME).zip $(DISTVNAME)
+	$(RM_RF) $(DISTVNAME)
+	$(NOECHO) $(ECHO) 'Created $(DISTVNAME).zip'
+	$(POSTOP)
+
+shdist : distdir
+	$(PREOP)
+	$(SHAR) $(DISTVNAME) > $(DISTVNAME).shar
+	$(RM_RF) $(DISTVNAME)
+	$(NOECHO) $(ECHO) 'Created $(DISTVNAME).shar'
+	$(POSTOP)
+
+
+# --- MakeMaker distdir section:
+create_distdir :
+	$(RM_RF) $(DISTVNAME)
+	$(PERLRUN) "-MExtUtils::Manifest=manicopy,maniread" \
+		-e "manicopy(maniread(),'$(DISTVNAME)', '$(DIST_CP)');"
+
+distdir : create_distdir distmeta 
+	$(NOECHO) $(NOOP)
+
+
+
+# --- MakeMaker dist_test section:
+disttest : distdir
+	cd $(DISTVNAME) && $(ABSPERLRUN) Makefile.PL 
+	cd $(DISTVNAME) && $(MAKE) $(PASTHRU)
+	cd $(DISTVNAME) && $(MAKE) test $(PASTHRU)
+
+
+
+# --- MakeMaker dist_ci section:
+
+ci :
+	$(PERLRUN) "-MExtUtils::Manifest=maniread" \
+	  -e "@all = keys %{ maniread() };" \
+	  -e "print(qq{Executing $(CI) @all\n}); system(qq{$(CI) @all});" \
+	  -e "print(qq{Executing $(RCS_LABEL) ...\n}); system(qq{$(RCS_LABEL) @all});"
+
+
+# --- MakeMaker distmeta section:
+distmeta : create_distdir metafile
+	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'exit unless -e q{META.yml};' \
+	  -e 'eval { maniadd({q{META.yml} => q{Module YAML meta-data (added by MakeMaker)}}) }' \
+	  -e '    or print "Could not add META.yml to MANIFEST: $$$${'\''@'\''}\n"' --
+	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'exit unless -f q{META.json};' \
+	  -e 'eval { maniadd({q{META.json} => q{Module JSON meta-data (added by MakeMaker)}}) }' \
+	  -e '    or print "Could not add META.json to MANIFEST: $$$${'\''@'\''}\n"' --
+
+
+
+# --- MakeMaker distsignature section:
+distsignature : create_distdir
+	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{SIGNATURE} => q{Public-key signature (added by MakeMaker)}}) }' \
+	  -e '    or print "Could not add SIGNATURE to MANIFEST: $$$${'\''@'\''}\n"' --
+	$(NOECHO) cd $(DISTVNAME) && $(TOUCH) SIGNATURE
+	cd $(DISTVNAME) && cpansign -s
+
+
+
+# --- MakeMaker install section:
+
+install :: pure_install doc_install
+	$(NOECHO) $(NOOP)
+
+install_perl :: pure_perl_install doc_perl_install
+	$(NOECHO) $(NOOP)
+
+install_site :: pure_site_install doc_site_install
+	$(NOECHO) $(NOOP)
+
+install_vendor :: pure_vendor_install doc_vendor_install
+	$(NOECHO) $(NOOP)
+
+pure_install :: pure_$(INSTALLDIRS)_install
+	$(NOECHO) $(NOOP)
+
+doc_install :: doc_$(INSTALLDIRS)_install
+	$(NOECHO) $(NOOP)
+
+pure__install : pure_site_install
+	$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site
+
+doc__install : doc_site_install
+	$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site
+
+pure_perl_install :: all
+	$(NOECHO) umask 022; $(MOD_INSTALL) \
+		"$(INST_LIB)" "$(DESTINSTALLPRIVLIB)" \
+		"$(INST_ARCHLIB)" "$(DESTINSTALLARCHLIB)" \
+		"$(INST_BIN)" "$(DESTINSTALLBIN)" \
+		"$(INST_SCRIPT)" "$(DESTINSTALLSCRIPT)" \
+		"$(INST_MAN1DIR)" "$(DESTINSTALLMAN1DIR)" \
+		"$(INST_MAN3DIR)" "$(DESTINSTALLMAN3DIR)"
+	$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
+		"$(SITEARCHEXP)/auto/$(FULLEXT)"
+
+
+pure_site_install :: all
+	$(NOECHO) umask 02; $(MOD_INSTALL) \
+		read "$(SITEARCHEXP)/auto/$(FULLEXT)/.packlist" \
+		write "$(DESTINSTALLSITEARCH)/auto/$(FULLEXT)/.packlist" \
+		"$(INST_LIB)" "$(DESTINSTALLSITELIB)" \
+		"$(INST_ARCHLIB)" "$(DESTINSTALLSITEARCH)" \
+		"$(INST_BIN)" "$(DESTINSTALLSITEBIN)" \
+		"$(INST_SCRIPT)" "$(DESTINSTALLSITESCRIPT)" \
+		"$(INST_MAN1DIR)" "$(DESTINSTALLSITEMAN1DIR)" \
+		"$(INST_MAN3DIR)" "$(DESTINSTALLSITEMAN3DIR)"
+	$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
+		"$(PERL_ARCHLIB)/auto/$(FULLEXT)"
+
+pure_vendor_install :: all
+	$(NOECHO) umask 022; $(MOD_INSTALL) \
+		"$(INST_LIB)" "$(DESTINSTALLVENDORLIB)" \
+		"$(INST_ARCHLIB)" "$(DESTINSTALLVENDORARCH)" \
+		"$(INST_BIN)" "$(DESTINSTALLVENDORBIN)" \
+		"$(INST_SCRIPT)" "$(DESTINSTALLVENDORSCRIPT)" \
+		"$(INST_MAN1DIR)" "$(DESTINSTALLVENDORMAN1DIR)" \
+		"$(INST_MAN3DIR)" "$(DESTINSTALLVENDORMAN3DIR)"
+
+
+doc_perl_install :: all
+
+doc_site_install :: all
+	$(NOECHO) $(ECHO) Appending installation info to "$(DESTINSTALLSITEARCH)/perllocal.pod"
+	-$(NOECHO) umask 02; $(MKPATH) "$(DESTINSTALLSITEARCH)"
+	-$(NOECHO) umask 02; $(DOC_INSTALL) \
+		"Module" "$(NAME)" \
+		"installed into" $(INSTALLSITELIB) \
+		LINKTYPE "$(LINKTYPE)" \
+		VERSION "$(VERSION)" \
+		EXE_FILES "$(EXE_FILES)" \
+		>> "$(DESTINSTALLSITEARCH)/perllocal.pod"
+
+doc_vendor_install :: all
+
+
+uninstall :: uninstall_from_$(INSTALLDIRS)dirs
+	$(NOECHO) $(NOOP)
+
+uninstall_from_perldirs ::
+
+uninstall_from_sitedirs ::
+	$(NOECHO) $(UNINSTALL) "$(SITEARCHEXP)/auto/$(FULLEXT)/.packlist"
+
+uninstall_from_vendordirs ::
+
+
+# --- MakeMaker force section:
+# Phony target to force checking subdirectories.
+FORCE :
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker perldepend section:
+
+
+# --- MakeMaker makefile section:
+# We take a very conservative approach here, but it's worth it.
+# We move Makefile to Makefile.old here to avoid gnu make looping.
+$(FIRST_MAKEFILE) : Makefile.PL $(CONFIGDEP)
+	$(NOECHO) $(ECHO) "Makefile out-of-date with respect to $?"
+	$(NOECHO) $(ECHO) "Cleaning current config before rebuilding Makefile..."
+	-$(NOECHO) $(RM_F) $(MAKEFILE_OLD)
+	-$(NOECHO) $(MV)   $(FIRST_MAKEFILE) $(MAKEFILE_OLD)
+	- $(MAKE) $(USEMAKEFILE) $(MAKEFILE_OLD) clean $(DEV_NULL)
+	$(PERLRUN) Makefile.PL 
+	$(NOECHO) $(ECHO) "==> Your Makefile has been rebuilt. <=="
+	$(NOECHO) $(ECHO) "==> Please rerun the $(MAKE) command.  <=="
+	$(FALSE)
+
+
+
+# --- MakeMaker staticmake section:
+
+# --- MakeMaker makeaperl section ---
+MAP_TARGET    = perl
+FULLPERL      = "/usr/bin/perl"
+
+$(MAP_TARGET) :: static $(MAKE_APERL_FILE)
+	$(MAKE) $(USEMAKEFILE) $(MAKE_APERL_FILE) $@
+
+$(MAKE_APERL_FILE) : $(FIRST_MAKEFILE) pm_to_blib
+	$(NOECHO) $(ECHO) Writing \"$(MAKE_APERL_FILE)\" for this $(MAP_TARGET)
+	$(NOECHO) $(PERLRUNINST) \
+		Makefile.PL DIR="" \
+		MAKEFILE=$(MAKE_APERL_FILE) LINKTYPE=static \
+		MAKEAPERL=1 NORECURS=1 CCCDLFLAGS=
+
+
+# --- MakeMaker test section:
+
+TEST_VERBOSE=0
+TEST_TYPE=test_$(LINKTYPE)
+TEST_FILE = test.pl
+TEST_FILES = 
+TESTDB_SW = -d
+
+testdb :: testdb_$(LINKTYPE)
+
+test :: $(TEST_TYPE) subdirs-test
+
+subdirs-test ::
+	$(NOECHO) $(NOOP)
+
+	$(NOECHO) $(ECHO) 'No tests defined for $(NAME) extension.'
+
+test_dynamic :: pure_all
+
+testdb_dynamic :: pure_all
+	PERL_DL_NONLAZY=1 $(FULLPERLRUN) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)
+
+test_ : test_dynamic
+
+test_static :: test_dynamic
+testdb_static :: testdb_dynamic
+
+
+# --- MakeMaker ppd section:
+# Creates a PPD (Perl Package Description) for a binary distribution.
+ppd :
+	$(NOECHO) $(ECHO) '<SOFTPKG NAME="$(DISTNAME)" VERSION="$(VERSION)">' > $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    <ABSTRACT>Assembler for NVIDIA Maxwell architecture</ABSTRACT>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    <AUTHOR>Xiuxia Zhang &lt;zhangxiuxia1@gmail.com&gt;</AUTHOR>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    <IMPLEMENTATION>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <REQUIRE NAME="Carp::" VERSION="1.29" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <REQUIRE NAME="Data::Dumper" VERSION="2.145" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <ARCHITECTURE NAME="x86_64-linux-gnu-thread-multi-5.22" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <CODEBASE HREF="" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    </IMPLEMENTATION>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '</SOFTPKG>' >> $(DISTNAME).ppd
+
+
+# --- MakeMaker pm_to_blib section:
+
+pm_to_blib : $(FIRST_MAKEFILE) $(TO_INST_PM)
+	$(NOECHO) $(ABSPERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', q[$(PM_FILTER)], '\''$(PERM_DIR)'\'')' -- \
+	  lib/KeplerAs/Cubin.pm blib/lib/KeplerAs/Cubin.pm \
+	  lib/KeplerAs/KeplerAs.pm blib/lib/KeplerAs/KeplerAs.pm \
+	  lib/KeplerAs/KeplerAsGrammar.pm blib/lib/KeplerAs/KeplerAsGrammar.pm 
+	$(NOECHO) $(TOUCH) pm_to_blib
+
+
+# --- MakeMaker selfdocument section:
+
+
+# --- MakeMaker postamble section:
+
+
+# End.
diff --git a/Assembler/KeplerAs/Makefile.PL b/Assembler/KeplerAs/Makefile.PL
new file mode 100644
index 0000000..4f71756
--- /dev/null
+++ b/Assembler/KeplerAs/Makefile.PL
@@ -0,0 +1,14 @@
+require 5.10.0;
+use strict;
+use warnings;
+use ExtUtils::MakeMaker;
+
+# Build description for the KeplerAs distribution.
+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
+# the contents of the Makefile that is written.
+WriteMakefile(
+    NAME              => 'KeplerAs::KeplerAs',
+    VERSION_FROM      => 'lib/KeplerAs/KeplerAs.pm', # finds $VERSION
+    EXE_FILES         => ['bin/KeplerAs.pl'],
+    # Module names and versions are quoted so they are unambiguous strings.
+    PREREQ_PM         => { 'Carp' => '1.29', 'Data::Dumper' => '2.145' },
+    LICENSE           => 'MIT',
+    # Perl >= 5.10 is already required above, so the former "$] >= 5.005"
+    # guard around the next two keys was always true and has been dropped.
+    ABSTRACT_FROM     => 'lib/KeplerAs/KeplerAs.pm', # retrieve abstract from module
+    AUTHOR            => 'Xiuxia Zhang <zhangxiuxia1@gmail.com>',
+);
diff --git a/Assembler/KeplerAs/README.md b/Assembler/KeplerAs/README.md
new file mode 100644
index 0000000..7b4a8ee
--- /dev/null
+++ b/Assembler/KeplerAs/README.md
@@ -0,0 +1,7 @@
+## Kepler GPU assembler: KeplerAs
+
+KeplerAs is based on MaxAs (an assembler for Maxwell and Pascal GPUs).
+Kepler uses completely different ISA encodings from those of Maxwell GPUs.
+We use the ISA encoding information cracked by our solver.
+
+Install.sh is a script that installs the software.
diff --git a/Assembler/KeplerAs/bin/KeplerAs.pl b/Assembler/KeplerAs/bin/KeplerAs.pl
new file mode 100755
index 0000000..268cc85
--- /dev/null
+++ b/Assembler/KeplerAs/bin/KeplerAs.pl
@@ -0,0 +1,275 @@
+#!/usr/bin/perl
+use strict;
+use KeplerAs::Cubin;
+use KeplerAs::KeplerAs;
+use Data::Dumper;
+use File::Spec;
+
+require 5.10.0;
+
+$Data::Dumper::Sortkeys = 1;
+
+my $mode = shift;
+
+# --list / -l: print a one-line summary of the cubin, then one line per kernel
+# and one line per symbol, each group sorted by name.
+if ($mode =~ /^\-?\-l/i)
+{
+    my $cubinFile = shift or usage();
+    my $cubin     = KeplerAs::Cubin->new($cubinFile);
+
+    # Query everything up front, in the order the report uses it.
+    my ($arch, $class, $asize) = ($cubin->arch, $cubin->class, $cubin->address_size);
+    my $kernels = $cubin->listKernels;
+    my $symbols = $cubin->listSymbols;
+
+    printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $cubinFile, $arch, $class, $asize;
+
+    # Kernel section fields, in the order the format string expects them.
+    my @kernelFields = qw(Linkage ParamCnt size RegCnt SharedSize BarCnt);
+    foreach my $name (sort keys %$kernels)
+    {
+        my $info = $kernels->{$name};
+        printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n",
+            $name, @{$info}{@kernelFields};
+    }
+
+    printf "Symbol: %s\n", $_ foreach sort keys %$symbols;
+}
+# --test / -t: feed a sass listing (or the cuobjdump output for a cubin) to
+# KeplerAs::KeplerAs::Test and exit 1 if it returns true, 0 otherwise.
+elsif ($mode =~ /^\-?\-t/i)
+{
+    # Optional flags, recognised only in this order: --reg, then --all.
+    # A ternary replaces "my $x = shift if COND", whose behaviour perlsyn
+    # documents as undefined.
+    my $reg  = (defined $ARGV[0] && $ARGV[0] =~ /^\-?\-r/i) ? shift(@ARGV) : undef;
+    my $all  = (defined $ARGV[0] && $ARGV[0] =~ /^\-?\-a/i) ? shift(@ARGV) : undef;
+    my $file = shift or usage();
+    my $fh;
+    if (-T $file)
+    {
+        # Text file: read it directly.  The explicit '<' mode stops a file
+        # name from being interpreted as an open mode or a pipe.
+        open $fh, '<', $file or die "$file: $!";
+    }
+    else
+    {
+        # Anything else is loaded as a cubin and disassembled with cuobjdump.
+        my $cubin = KeplerAs::Cubin->new($file);
+        my $arch  = $cubin->arch;
+
+        my $cmd = "cuobjdump -arch sm_$arch -sass $file";
+        open $fh, '-|', $cmd or die "$cmd: $!";
+
+        # The first output line is consumed here; it is only inspected for a
+        # cuobjdump failure message.
+        my $first = <$fh>;
+        if ($first =~ /cuobjdump fatal/)
+        {
+            print $first;
+            exit(1);
+        }
+    }
+    exit(KeplerAs::KeplerAs::Test($fh, $reg, $all) ? 1 : 0);
+}
+# --extract / -e: disassemble one kernel of a cubin with cuobjdump and write
+# it, preceded by a '#' comment header, to asm_file or to STDOUT.
+elsif ($mode =~ /^\-?\-e/i)
+{
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    my $cubinFile = shift or usage();
+    my $asmFile   = shift;
+    my $cubin     = KeplerAs::Cubin->new($cubinFile);
+    my $arch      = $cubin->arch;
+    my $kernels   = $cubin->listKernels;
+
+    # Without --kernel, fall back to the alphabetically first kernel.
+    $kernelName ||= (sort keys %$kernels)[0];
+
+    my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName";
+
+    # The error message now reports the command that was actually run
+    # (it used to hard-code sm_35 regardless of $arch).
+    my $cmd = "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile";
+    open my $in, '-|', $cmd or die "$cmd: $!";
+
+    # The first output line is consumed here and only checked for a failure.
+    my $first = <$in>;
+    if ($first =~ /cuobjdump fatal/)
+    {
+        print $first;
+        exit(1);
+    }
+    my $out;
+    if ($asmFile)
+    {
+        # Explicit '>' mode: the file name is never parsed as part of the mode.
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+
+    # Comment header describing the kernel.
+    print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n";
+
+    print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt));
+
+    print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n";
+
+    print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params};
+
+    print $out "#\n# Instructions:\n\n";
+
+    KeplerAs::KeplerAs::Extract($in, $out, $kernel->{Params});
+
+    close $out if $asmFile;
+    close $in;
+}
+# --sass / -s: run KeplerAs::KeplerAs::Extract over an existing sass text file
+# (with an empty parameter list) and write the result to asm_file or STDOUT.
+elsif ($mode =~ /^\-?\-s/i)
+{
+    my $sassFile  = shift or usage();
+    my $asmFile   = shift;
+
+    # Three-argument opens: neither path can be interpreted as a mode or pipe.
+    open my $in, '<', $sassFile or die "$sassFile: $!";
+
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+
+    KeplerAs::KeplerAs::Extract($in, $out, []);
+
+    close $out if $asmFile;
+    close $in;
+}
+# --insert / -i: assemble an asm file and write the resulting kernel into a
+# cubin (in place, or into new_cubin_file when given).
+# Options are recognised only in this order: -w, -k <name>, -n, -D<name> <value>...
+elsif ($mode =~ /^\-?\-i/i)
+{
+    # -w: passed to Assemble as its $nowarn argument.
+    my $nowarn;
+    if ($ARGV[0] =~ /^\-?\-w/i)
+    {
+        $nowarn = shift;
+    }
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    # A ternary replaces "my $x = shift if COND", whose behaviour perlsyn
+    # documents as undefined.
+    my $noReuse = (defined $ARGV[0] && $ARGV[0] =~ /^\-?\-n/i) ? shift(@ARGV) : undef;
+
+    # -D<name> <value>: define $KeplerAs::KeplerAs::CODE::<name>.
+    # The variable is assigned through a symbolic reference instead of a string
+    # eval, so a value containing quotes or Perl code is stored verbatim rather
+    # than being executed or silently dropped.
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift;
+        $value = '' unless defined $value;
+        no strict 'refs';
+        ${"KeplerAs::KeplerAs::CODE::$name"} = $value;
+    }
+
+    my $asmFile   = shift or usage();
+    my $cubinFile = shift or usage();
+    my $newCubin  = shift || $cubinFile;
+
+    # Slurp the whole asm file; $/ is only undefined inside the do block.
+    open my $fh, '<', $asmFile or die "$asmFile: $!";
+    my $file = do { local $/; <$fh> };
+    close $fh;
+
+    # Includes are resolved relative to the asm file's volume and directory.
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    # Without -k, take the kernel name from a "# Kernel: <name>" line at the
+    # very start of the asm file.
+    ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName;
+    die "asm file missing kernel name or is badly formatted" unless $kernelName;
+
+    my $kernel = KeplerAs::KeplerAs::Assemble($file, $include, !$noReuse, $nowarn);
+
+    my $cubin  = KeplerAs::Cubin->new($cubinFile);
+    $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName";
+
+    $cubin->modifyKernel(%$kernel);
+
+    $cubin->write($newCubin);
+
+    # The literal percent sign is written as %% and the kernel name is passed
+    # as an argument, so neither can be misread as a conversion.
+    printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n",
+        $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)};
+
+}
+# --pre / -p: run KeplerAs::KeplerAs::Preprocess on an asm file and print the
+# result to new_asm_file or STDOUT.
+# Options are recognised only in this order: -D<name> <value>..., then -d.
+elsif ($mode =~ /^\-?\-p/i)
+{
+    # -D<name> <value>: define $KeplerAs::KeplerAs::CODE::<name>.
+    # Assigned through a symbolic reference instead of a string eval, so the
+    # value is stored verbatim and never executed as code.
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift;
+        $value = '' unless defined $value;
+        no strict 'refs';
+        ${"KeplerAs::KeplerAs::CODE::$name"} = $value;
+    }
+    # A ternary replaces "my $x = shift if COND", whose behaviour perlsyn
+    # documents as undefined.
+    my $debug     = (defined $ARGV[0] && $ARGV[0] =~ /^\-?\-d/i) ? shift(@ARGV) : undef;
+    my $asmFile   = shift or usage();
+    my $asmFile2  = shift;
+
+    die "source and destination probably shouldn't be the same file\n"
+        if defined $asmFile2 && $asmFile eq $asmFile2;
+
+    open my $fh, '<', $asmFile or die "$asmFile: $!";
+    # NOTE: $/ stays undefined until the end of this branch, which includes
+    # the Preprocess call below (unchanged from the original behaviour).
+    local $/;
+    my $file = <$fh>;
+    close $fh;
+
+    # Includes are resolved relative to the asm file's volume and directory.
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    my $out;
+    if ($asmFile2)
+    {
+        open $out, '>', $asmFile2 or die "$asmFile2: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+    print {$out} KeplerAs::KeplerAs::Preprocess($file, $include, $debug);
+
+    # Only close (and check) a real output file; STDOUT is left open.
+    if ($asmFile2)
+    {
+        close $out or die "$asmFile2: $!";
+    }
+}
+# --version / -v: print the module version.
+elsif ($mode =~ /^\-?\-v/i)
+{
+    print $KeplerAs::KeplerAs::VERSION, "\n";
+}
+# Anything else: echo the unrecognised mode, then show the usage text
+# (usage() exits with status 1).
+else
+{
+    print $mode, "\n";
+    usage();
+}
+
+exit(0);
+
+
+
+# Print the command-line help to STDOUT and terminate with exit status 1.
+# Takes no arguments and never returns.
+sub usage
+{
+    print <<EOF;
+Usage:
+
+  List kernels and symbols:
+
+    KeplerAs.pl --list|-l <cubin_file>
+
+  Test a cubin or sass file to see if the assembler can reproduce all of the contained opcodes.
+  Also useful for extending the missing grammar rules.  Defaults to only showing failures without --all.
+  With the --reg flag it will show register bank conflicts not hidden by reuse flags.
+
+    KeplerAs.pl --test|-t [--reg|-r] [--all|-a] <cubin_file | cuobjdump_sass_file>
+
+  Extract a single kernel into an asm file from a cubin.
+  Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin.
+
+    KeplerAs.pl --extract|-e [--kernel|-k kernel_name] <cubin_file> [asm_file]
+
+  Run the same extraction on an existing cuobjdump sass text file instead of a cubin.
+
+    KeplerAs.pl --sass|-s <sass_file> [asm_file]
+
+  Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes.
+  Include the debug flag to print out detailed scheduler info.
+  Each -D<name> <value> pair defines a variable for the CODE sections; -D options must come first.
+
+    KeplerAs.pl --pre|-p [-D<name> <value> ...] [--debug|-d] <asm_file> [new_asm_file]
+
+  Insert the kernel asm back into the cubin.  Overwrite existing or create new cubin.
+  Optionally you can skip register reuse flag auto insertion.  This allows you to observe
+  performance without any reuse or you can use it to set the flags manually in your sass.
+  With -w, warnings about registers used without initialization are suppressed.
+  Options must be given in the order shown.
+
+    KeplerAs.pl --insert|-i [-w] [--kernel|-k kernel_name] [--noreuse|-n] [-D<name> <value> ...] <asm_file> <cubin_file> [new_cubin_file]
+
+  Display version information and exit:
+
+    KeplerAs.pl --version|-v
+
+EOF
+    exit(1);
+}
+
+__END__
diff --git a/Assembler/KeplerAs/blib/arch/.exists b/Assembler/KeplerAs/blib/arch/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/KeplerAs/blib/arch/auto/KeplerAs/KeplerAs/.exists b/Assembler/KeplerAs/blib/arch/auto/KeplerAs/KeplerAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/KeplerAs/blib/bin/.exists b/Assembler/KeplerAs/blib/bin/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/KeplerAs/blib/lib/KeplerAs/.exists b/Assembler/KeplerAs/blib/lib/KeplerAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/KeplerAs/blib/lib/KeplerAs/Cubin.pm b/Assembler/KeplerAs/blib/lib/KeplerAs/Cubin.pm
new file mode 100644
index 0000000..867342d
--- /dev/null
+++ b/Assembler/KeplerAs/blib/lib/KeplerAs/Cubin.pm
@@ -0,0 +1,604 @@
+package KeplerAs::Cubin;
+
+use strict;
+use Data::Dumper;
+
+# ELF structure layouts, one table per structure and per ELF class (32/64-bit).
+# Every table is a flat list of (pack template code, field name) pairs in
+# on-disk order.  The code further down separates the two by string length
+# (entries of at most 3 characters are template codes, longer ones are field
+# names), so every field name must be at least 4 characters long.
+
+# ELF file header, 32-bit class.
+my @Elf32_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    L   entry
+    L   phOffset
+    L   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+# ELF file header, 64-bit class (entry and the two table offsets are 64-bit).
+my @Elf64_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    Q   entry
+    Q   phOffset
+    Q   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+# Program header, 32-bit class.
+my @Elf32_PrgHdr = qw(
+    L   type
+    L   offset
+    L   vaddr
+    L   paddr
+    L   fileSize
+    L   memSize
+    L   flags
+    L   align
+);
+# Program header, 64-bit class (note that flags moves up to second position).
+my @Elf64_PrgHdr = qw(
+    L   type
+    L   flags
+    Q   offset
+    Q   vaddr
+    Q   paddr
+    Q   fileSize
+    Q   memSize
+    Q   align
+);
+# Section header, 32-bit class.
+my @Elf32_SecHdr = qw(
+    L   name
+    L   type
+    L   flags
+    L   addr
+    L   offset
+    L   size
+    L   link
+    L   info
+    L   align
+    L   entSize
+);
+# Section header, 64-bit class.
+my @Elf64_SecHdr = qw(
+    L   name
+    L   type
+    Q   flags
+    Q   addr
+    Q   offset
+    Q   size
+    L   link
+    L   info
+    Q   align
+    Q   entSize
+);
+# Symbol table entry, 32-bit class.
+my @Elf32_SymEnt = qw(
+    L   name
+    L   value
+    L   size
+    C   info
+    C   other
+    S   shIndx
+);
+# Symbol table entry, 64-bit class (field order differs from the 32-bit one).
+my @Elf64_SymEnt = qw(
+    L   name
+    C   info
+    C   other
+    S   shIndx
+    Q   value
+    Q   size
+);
+# Linkage names, indexed by the upper four bits of a symbol's info byte.
+my @symBind = qw(LOCAL GLOBAL WEAK);
+
+# Lookup tables indexed by the ELF fileClass value (1 => 32-bit layouts,
+# 2 => 64-bit layouts).  The *T arrays hold the concatenated pack/unpack
+# template of a structure, the *C arrays the matching list of field names.
+my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC);
+
+# Entries of at most 3 characters are template codes; longer ones are names.
+my $templateOf = sub { return join '', grep { length($_) <= 3 } @_ };
+my $fieldsOf   = sub { return [ grep { length($_) > 3 } @_ ] };
+
+$elfHdrT[1] = $templateOf->(@Elf32_Hdr);
+$elfHdrT[2] = $templateOf->(@Elf64_Hdr);
+$prgHdrT[1] = $templateOf->(@Elf32_PrgHdr);
+$prgHdrT[2] = $templateOf->(@Elf64_PrgHdr);
+$secHdrT[1] = $templateOf->(@Elf32_SecHdr);
+$secHdrT[2] = $templateOf->(@Elf64_SecHdr);
+$symHdrT[1] = $templateOf->(@Elf32_SymEnt);
+$symHdrT[2] = $templateOf->(@Elf64_SymEnt);
+
+$elfHdrC[1] = $fieldsOf->(@Elf32_Hdr);
+$elfHdrC[2] = $fieldsOf->(@Elf64_Hdr);
+$prgHdrC[1] = $fieldsOf->(@Elf32_PrgHdr);
+$prgHdrC[2] = $fieldsOf->(@Elf64_PrgHdr);
+$secHdrC[1] = $fieldsOf->(@Elf32_SecHdr);
+$secHdrC[2] = $fieldsOf->(@Elf64_SecHdr);
+$symHdrC[1] = $fieldsOf->(@Elf32_SymEnt);
+# Kept as a plain, true-valued assignment: this is the last statement the
+# module executes and the file has no trailing "1;".
+$symHdrC[2] = $fieldsOf->(@Elf64_SymEnt);
+
+# Constructor: parse the cubin (ELF) file $file into a KeplerAs::Cubin object.
+#
+#   $package - class to bless the object into
+#   $file    - path of the cubin to read
+#
+# Returns the blessed object.  It holds the decoded ELF header (elfHdr), the
+# program and section headers (prgHdrs / secHdrs; every section is also stored
+# under its own name), and the Kernels and Symbols maps built from .symtab.
+# Dies if the file cannot be opened or if a non-empty symbol table declares a
+# zero entry size.
+sub new
+{
+    my ($package, $file) = @_;
+
+    my $cubin = bless { fileName => $file }, $package;
+
+    # Explicit read mode: the path is never interpreted as a mode or a pipe.
+    open my $fh, '<', $file or die "$file: $!";
+    binmode($fh);
+
+    # Decode with the 32-bit header layout first.  fileClass comes before any
+    # class-dependent field in both layouts, so it is valid either way.
+    my $data;
+    read $fh, $data, 0x34;
+    my $elfHdr = $cubin->{elfHdr} = {};
+    @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data;
+
+    my $class = $elfHdr->{fileClass};
+
+    if ($class == 2)
+    {
+        # 64-bit class: rewind and decode again with the wider layout.
+        seek $fh, 0, 0;
+        read $fh, $data, 0x46;
+        @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data;
+
+        $cubin->{Class} = 64;
+    }
+    else
+    {
+        $cubin->{Class} = 32;
+    }
+
+    # The architecture is fixed to sm_35 here rather than read from the file,
+    # so the check below cannot fire as written.
+    $cubin->{Arch} = "35";
+    die "Cubin not in sm_35. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} != 35;
+    $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32;
+
+    # Program headers.
+    seek $fh, $elfHdr->{phOffset}, 0;
+    foreach (1 .. $elfHdr->{phNum})
+    {
+        read $fh, $data, $elfHdr->{phEntSize};
+
+        my %prgHdr = (Indx => $_ - 1);
+        @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data;
+        push @{$cubin->{prgHdrs}}, \%prgHdr;
+    }
+
+    # Section headers.
+    seek $fh, $elfHdr->{shOffset}, 0;
+    foreach (1 .. $elfHdr->{shNum})
+    {
+        read $fh, $data, $elfHdr->{shEntSize};
+
+        my %secHdr = (Indx => $_ - 1);
+        @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data;
+        push @{$cubin->{secHdrs}}, \%secHdr;
+    }
+
+    # Section contents, kept as a hex string in {Data}.
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $data = '';
+        # No bytes are read for empty sections or for type 8 (SHT_NOBITS in
+        # the ELF specification).
+        if ($secHdr->{size} && $secHdr->{type} != 8)
+        {
+            seek $fh, $secHdr->{offset}, 0;
+            read $fh, $data, $secHdr->{size};
+        }
+        if ($secHdr->{type} == 3) # STRTAB
+        {
+            # Map each string's byte offset within the table to the string.
+            my $strTab = $secHdr->{StrTab} = {};
+            my $indx   = 0;
+            foreach my $str (split "\0", $data)
+            {
+                $strTab->{$indx} = $str;
+                $indx += 1 + length($str);
+            }
+        }
+        if ($secHdr->{type} == 2) # SYMTAB
+        {
+            my $entSize = $secHdr->{entSize};
+
+            # With a zero entry size the loop below would never advance.
+            die "$file: symbol table with zero entry size\n" if $secHdr->{size} && !$entSize;
+
+            for (my $offset = 0; $offset < $secHdr->{size}; $offset += $entSize)
+            {
+                my $symEnt = {};
+                @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $entSize);
+
+                push @{$secHdr->{SymTab}}, $symEnt;
+            }
+        }
+        $secHdr->{Data} = unpack 'H*', $data;
+    }
+    close $fh;
+
+    # Resolve section names and make each section reachable by name.
+    my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab};
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $secHdr->{Name} = $shStrTab->{$secHdr->{name}};
+        $cubin->{$secHdr->{Name}} = $secHdr;
+    }
+
+    my $strTab = $cubin->{'.strtab'}{StrTab};
+    foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}})
+    {
+        $symEnt->{Name} = $strTab->{$symEnt->{name}};
+
+        my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}];
+        $secHdr->{SymbolEnt} = $symEnt;
+
+        # Symbol type 2 (low nibble of info; STT_FUNC in the ELF
+        # specification) marks a kernel: its section becomes the kernel record.
+        if (($symEnt->{info} & 0x0f) == 0x02)
+        {
+            my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr;
+
+            $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4];
+
+            # Kernel code as a list of 64-bit words.
+            $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ];
+
+            # Barrier and register counts are bit fields of the section header.
+            $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20;
+
+            $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24;
+
+            my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"};
+            $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0;
+
+            $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"};
+
+            my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"};
+            if ($paramSec)
+            {
+                # The info section is processed as a list of 32-bit words.
+                my @data = unpack "L*", pack "H*", $paramSec->{Data};
+
+                $paramSec->{ParamData} = \@data;
+                $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ];
+
+                # Locate the 0x00080a04 record; the low 16 bits of its third
+                # word are the base added to every parameter offset below.
+                my $idx = 0;
+                $idx++ while $idx < @data && $data[$idx] != 0x00080a04;
+
+                my $first = 0;
+                if ($idx < @data)
+                {
+                    $first = $data[$idx+2] & 0xFFFF;
+                    $idx  += 4;
+                }
+
+                # One 4-word 0x000c1704 record per kernel parameter.
+                my @params;
+                while ($idx < @data && $data[$idx] == 0x000c1704)
+                {
+                    my $ord    = $data[$idx+2] & 0xFFFF;
+                    my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16);
+                    my $psize  = $data[$idx+3] >> 18;
+                    my $align  = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0;
+                    unshift @params, "$ord:$offset:$psize:$align";
+                    $idx += 4;
+                }
+
+                # Never step past the end of the data.  Without this, a missing
+                # marker or a truncated record put undefined words into
+                # StaticParams, which modifyKernel would later pack as zeros.
+                $idx = @data if $idx > @data;
+
+                # Everything up to here is written back unchanged by modifyKernel.
+                my @staticParams = @data[0 .. ($idx-1)];
+
+
+                my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize);
+
+                # Attribute codes whose payload is a list of 32-bit words.
+                my %wordList = (
+                    0x1d04 => \@ctaidOffsets,
+                    0x1c04 => \@exitOffsets,
+                    0x1004 => \@reqntid,
+                    0x0504 => \@maxntid,
+                    0x1e04 => \@stackSize,
+                );
+
+                # Remaining attributes: low 16 bits are the code, high 16 bits
+                # the payload size in bytes (or the value itself for 0x1b03).
+                while ($idx < @data)
+                {
+                    my $code = $data[$idx] & 0xffff;
+                    my $size = $data[$idx] >> 16;
+                    $idx++;
+
+                    if (my $list = $wordList{$code})
+                    {
+                        while ($size > 0)
+                        {
+                            push @$list, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    elsif ($code == 0x1b03)
+                    {
+                        $maxregCount = $size;
+                    }
+                    elsif ($code == 0x0401)
+                    {
+                        $ctaidzUsed = 1;
+                    }
+                    else
+                    {
+                        printf "Unknown Code 0x%02x (size:%d)\n", $code, $size;
+                    }
+                }
+                $kernelSec->{Params}   = \@params;
+                $kernelSec->{ParamCnt} = scalar @params;
+
+                $paramSec->{StaticParams} = \@staticParams;
+                $paramSec->{MAXREG_COUNT} = $maxregCount;
+                $paramSec->{ExitOffsets}  = \@exitOffsets;
+                $paramSec->{CTAIDOffsets} = \@ctaidOffsets;
+                $paramSec->{CTAIDZUsed}   = $ctaidzUsed;
+                $paramSec->{REQNTID}      = \@reqntid;
+                $paramSec->{MAXNTID}      = \@maxntid;
+                $paramSec->{STACKSIZE}    = \@stackSize;
+            }
+        }
+        # Other symbols whose info byte has bit 0x10 set are listed as Symbols.
+        elsif (($symEnt->{info} & 0x10) == 0x10)
+        {
+            $cubin->{Symbols}{$symEnt->{Name}} = $symEnt;
+        }
+    }
+
+
+    return $cubin;
+}
+# Accessor: the object's Class field (new() stores 32 or 64 there).
+sub class
+{
+    my ($cubin) = @_;
+    return $cubin->{Class};
+}
+# Accessor: the object's Arch field (the number that follows "sm_").
+sub arch
+{
+    my ($cubin) = @_;
+    return $cubin->{Arch};
+}
+# Accessor: the object's AddressSize field (new() stores 32 or 64 there).
+sub address_size
+{
+    my ($cubin) = @_;
+    return $cubin->{AddressSize};
+}
+# Accessor: hash ref mapping kernel name to its kernel section record
+# (undef when the object has no Kernels entry).
+sub listKernels
+{
+    my ($cubin) = @_;
+    return $cubin->{Kernels};
+}
+# Accessor: hash ref mapping symbol name to its symbol table entry
+# (undef when the object has no Symbols entry).
+sub listSymbols
+{
+    my ($cubin) = @_;
+    return $cubin->{Symbols};
+}
+# Look up a single kernel section record by name.
+#
+#   $kernel - kernel name
+#
+# Returns the record stored under that name in Kernels, or undef if absent.
+sub getKernel
+{
+    my $cubin = shift;
+    my $name  = shift;
+    return $cubin->{Kernels}{$name};
+}
+
+# Apply a freshly assembled kernel to the in-memory sections of this cubin.
+#
+# Named parameters (%params):
+#   Kernel       - kernel section record (what getKernel returns)
+#   RegCnt       - new register count (at most 255)
+#   BarCnt       - new barrier count (at most 16)
+#   ExitOffsets  - array ref of exit offsets
+#   CTAIDOffsets - array ref of CTAID offsets
+#   CTAIDZUsed   - true when CTAID.Z is used
+#   KernelData   - array ref of the 64-bit words making up the kernel code
+#
+# Updates the kernel section and its ParamSec in place and prints one
+# "Modified ..." line for every property that changed.  Dies when a limit is
+# exceeded or the code size is not a multiple of 64 bytes.
+sub modifyKernel
+{
+    my ($cubin, %params) = @_;
+
+    my ($kernelSec, $newReg, $newBar, $exitOffsets, $ctaidOffsets, $ctaidzUsed, $newData) =
+        @params{qw(Kernel RegCnt BarCnt ExitOffsets CTAIDOffsets CTAIDZUsed KernelData)};
+    my $newSize = 8 * @$newData;
+
+    die "255 register max" if $newReg > 255;
+    die "new kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63;
+    die "16 is max barrier count" if $newBar > 16;
+
+    my $paramSec   = $kernelSec->{ParamSec};
+    my $kernelName = $kernelSec->{SymbolEnt}{Name};
+
+    # Replace the code, both as a word list and as the hex string used on write.
+    $kernelSec->{KernelData} = $newData;
+    $kernelSec->{Data}       = unpack "H*", pack "Q*", @$newData;
+
+    # The register count lives in the top byte of the section's info field.
+    unless ($newReg == $kernelSec->{RegCnt})
+    {
+        print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n";
+        $kernelSec->{RegCnt} = $newReg;
+        $kernelSec->{info}   = ($kernelSec->{info} & ~0xff000000) | ($newReg << 24);
+    }
+    # The barrier count lives in bits 20-24 of the section's flags field.
+    unless ($newBar == $kernelSec->{BarCnt})
+    {
+        print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n";
+        $kernelSec->{BarCnt} = $newBar;
+        $kernelSec->{flags}  = ($kernelSec->{flags} & ~0x01f00000) | ($newBar << 20);
+    }
+
+    # Rebuild the info section: the untouched leading words first, then one
+    # record per attribute.
+    my @paramData = @{$paramSec->{StaticParams}};
+
+    # Comma-separated hex rendering of an offset list, used for change reports.
+    my $hexList = sub { return join ',', map { sprintf '%04x', $_ } @{$_[0]} };
+
+    # Append "<byte size << 16 | code>" followed by the words; empty lists are skipped.
+    my $emit = sub
+    {
+        my ($code, $words) = @_;
+        return unless @$words;
+        push @paramData, (scalar(@$words) << 18) | $code, @$words;
+    };
+
+    my $maxregCount = $paramSec->{MAXREG_COUNT};
+    push @paramData, ($maxregCount << 16) | 0x1b03 if defined $maxregCount;
+
+    my $newCTAIDs = $hexList->($ctaidOffsets);
+    my $oldCTAIDs = $hexList->($paramSec->{CTAIDOffsets});
+    print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n"
+        if $newCTAIDs ne $oldCTAIDs;
+    $emit->(0x1d04, $ctaidOffsets);
+
+    my $newExits = $hexList->($exitOffsets);
+    my $oldExits = $hexList->($paramSec->{ExitOffsets});
+    print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n"
+        if $newExits ne $oldExits;
+    $emit->(0x1c04, $exitOffsets);
+
+    print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n"
+        if $ctaidzUsed != $paramSec->{CTAIDZUsed};
+    push @paramData, 0x0401 if $ctaidzUsed;
+
+    # These three are carried over from the original section unchanged.
+    $emit->(0x1004, $paramSec->{REQNTID});
+    $emit->(0x0504, $paramSec->{MAXNTID});
+    $emit->(0x1e04, $paramSec->{STACKSIZE});
+
+    $paramSec->{Data} = unpack "H*", pack "L*", @paramData;
+
+    # Propagate size changes (info section first, then the code section).
+    my $newParamSize = 4 * @paramData;
+    unless ($newParamSize == $paramSec->{size})
+    {
+        print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n";
+        $cubin->updateSize($paramSec, $newParamSize);
+    }
+
+    unless ($newSize == $kernelSec->{size})
+    {
+        print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n";
+        $cubin->updateSize($kernelSec, $newSize, 1);
+    }
+}
+
+# Record a new size for one section and re-derive every file offset.
+#
+#   $sec           - section header record whose size changes
+#   $newSize       - its new size in bytes
+#   $updatePrgSize - when true, type 1 program headers that contain the
+#                    section have fileSize and memSize adjusted by the change
+#
+# Sections are laid out again one after another from the end of the ELF
+# header, honouring each section's alignment; the section header table and
+# the program header table follow, and program header offsets are remapped.
+sub updateSize
+{
+    my ($cubin, $sec, $newSize, $updatePrgSize) = @_;
+
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+    my $delta  = $newSize - $sec->{size};
+
+    $sec->{size} = $newSize;
+
+    # A section that belongs to a symbol also changes that symbol's size, so
+    # the symbol table data is re-packed from its entries.
+    if (my $symEnt = $sec->{SymbolEnt})
+    {
+        $symEnt->{size} = $newSize;
+
+        my $symtab = $cubin->{'.symtab'};
+        $symtab->{Data} = join '',
+            map { unpack "H*", pack $symHdrT[$class], @{$_}{@{$symHdrC[$class]}} }
+            @{$symtab->{SymTab}};
+    }
+
+    # Lay the sections out again.  %movedTo maps each old offset to the new one.
+    my $cursor = $elfHdr->{ehSize};
+    my %movedTo;
+
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        my $align = $secHdr->{align};
+        next if $align == 0;
+
+        # Round the cursor up to the section's alignment.
+        my $misfit = $cursor % $align;
+        $cursor += $align - $misfit if $misfit > 0;
+
+        $movedTo{$secHdr->{offset}} = $cursor;
+        $secHdr->{offset}           = $cursor;
+
+        # Type 8 sections occupy no space in the file.
+        $cursor += $secHdr->{size} unless $secHdr->{type} == 8;
+    }
+
+    # The section header table follows the last section and the program header
+    # table follows it at the same distance as before.
+    my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset};
+    my ($newShOffset, $newPhOffset) = ($cursor, $cursor + $shSize);
+
+    $movedTo{$elfHdr->{shOffset}} = $newShOffset;
+    $movedTo{$elfHdr->{phOffset}} = $newPhOffset;
+
+    $elfHdr->{shOffset} = $newShOffset;
+    $elfHdr->{phOffset} = $newPhOffset;
+
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        $prgHdr->{offset} = $movedTo{$prgHdr->{offset}};
+
+        next unless $updatePrgSize && $prgHdr->{type} == 1;
+
+        # Grow or shrink only the segment that covers the resized section.
+        my $segStart = $prgHdr->{offset};
+        my $segEnd   = $segStart + $prgHdr->{fileSize} + $delta;
+        next if $sec->{offset} < $segStart || $sec->{offset} >= $segEnd;
+
+        $prgHdr->{fileSize} += $delta;
+        $prgHdr->{memSize}  += $delta;
+    }
+}
+
+# Serialise the cubin to $file.
+#
+#   $file - path of the file to create or overwrite
+#
+# Writes the ELF header, the data of every non-empty section (zero-padded up
+# to each section's alignment), then all section headers and all program
+# headers.  Dies if the file cannot be opened or the final close fails.
+sub write
+{
+    my ($cubin, $file) = @_;
+
+    # Explicit '>' mode: the file name is never parsed as part of the mode.
+    open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
+    binmode($fh);
+
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+
+    print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};
+    my $pos = $elfHdr->{ehSize};
+
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        # Empty sections and type 8 sections contribute no bytes.
+        next if $secHdr->{size} == 0 || $secHdr->{type} == 8;
+
+        # Zero-fill up to the section's alignment boundary.
+        my $pad = $pos % $secHdr->{align};
+        if ($pad > 0)
+        {
+            $pad = $secHdr->{align} - $pad;
+            print $fh "\0" x $pad;
+            $pos += $pad;
+        }
+
+        print $fh pack 'H*', $secHdr->{Data};
+        $pos += $secHdr->{size};
+    }
+
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
+    }
+
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
+    }
+
+    # Buffered write errors only surface at close, so check it.
+    close $fh or die "Error: could not finish writing $file: $!";
+}
+
+__END__
+
diff --git a/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAs.pm b/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAs.pm
new file mode 100644
index 0000000..34dfbcd
--- /dev/null
+++ b/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAs.pm
@@ -0,0 +1,1196 @@
+package KeplerAs::KeplerAs;
+
+require 5.10.0;
+
+use strict;
+use Data::Dumper;
+use KeplerAs::KeplerAsGrammar;
+use File::Spec;
+use Carp;
+
+our $VERSION = '1.06';
+
+# Opcodes whose label operand Assemble encodes as a byte offset relative to
+# the instruction after the jump.
+my %relOffset  = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT);
+
+# Opcodes whose label operand Assemble encodes as an absolute byte offset.
+my %absOffset  = map { $_ => 1 } qw(JCAL);
+
+# All label-taking opcodes: Assemble records them for label resolution and
+# clears its register reuse tracking when it meets one.
+my %jumpOp     = (%relOffset, %absOffset);
+
+# Opcodes for which Assemble does not invalidate tracked reuse entries for the
+# registers returned by getVecRegisters.
+my %noDest     = map { $_ => 1 } qw(ST STG STS STL RED);
+
+# Operand slot name => bit that Assemble ORs into an instruction's reuse flags.
+my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4);
+
+sub Assemble
+{
+    my ($file, $include, $doReuse, $nowarn) = @_;
+
+    my $regMap = {};
+    $file = Preprocess($file, $include, 0, $regMap);
+    my $vectors = delete $regMap->{__vectors};
+    my $regBank = delete $regMap->{__regbank};
+
+    my $regCnt = 0;
+    my $barCnt = 0;
+
+    my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse);
+
+    push @instructs, $ctrl = {};
+
+    foreach my $line (split "\n", $file)
+    {
+        $lineNum++;
+
+        next unless preProcessLine($line);
+
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+
+            push @branches, @instructs+0 if exists $jumpOp{$inst->{op}};
+
+            push @{$ctrl->{ctrl}}, $inst->{ctrl};
+
+            $inst->{ctrl} = $ctrl;
+
+            push @instructs, $inst;
+            push @instructs, $ctrl = {} if ((@instructs & 7) == 0);
+        }
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            $labels{$1} = @instructs+0;
+        }
+        else
+        {
+            die "badly formed line at $lineNum: $line\n";
+        }
+    }
+    push @{$ctrl->{ctrl}}, 0x00;
+    push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' };
+    while (@instructs & 7)
+    {
+        push @instructs, $ctrl = {} if ((@instructs & 7) == 0);
+        push @{$ctrl->{ctrl}}, 0x00;
+        push @instructs, { op => 'NOP', inst => 'NOP;' };
+    }
+
+    foreach my $i (@branches)
+    {
+        if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1})
+            { die "instruction has invalid label: $instructs[$i]{inst}"; }
+
+        $instructs[$i]{jump} = $labels{$1};
+
+        if (exists $relOffset{$instructs[$i]{op}})
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; }
+        else
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; }
+    }
+
+    foreach my $i (0 .. $#instructs)
+    {
+        next unless $i & 7;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            if ($doReuse)
+            {
+                my @r0 = getVecRegisters($vectors, $capData);
+
+
+                if (@r0 && !exists $noDest{$op})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        if (my $reuse = $reuse{$slot})
+                        {
+                            delete $reuse->{$_} foreach @r0;
+                        }
+                    }
+                }
+                %reuse = () if exists $jumpOp{$op};
+
+                if ($gram->{type}{reuse})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        next unless exists $capData->{$slot};
+
+                        my $r = $capData->{$slot};
+                        next if $r eq 'RZ';
+                        next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction
+
+                        my $reuse = $reuse{$slot} ||= {};
+
+                        if (my $p = $reuse->{$r})
+                        {
+                            $instructs[$p]{ctrl}{reuse}[($p & 7) - 1] |= $reuseSlots{$slot};
+
+                        }
+                        elsif (keys %$reuse > 2)
+                        {
+                            my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0];
+                            delete $reuse->{$oldest};
+                        }
+                        $reuse->{$r} = $i;
+                    }
+                }
+            }
+            elsif ($gram->{type}{reuse})
+            {
+                $ctrl->{reuse}[($i & 7) - 1] = genReuseCode($capData);
+            }
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    foreach my $r (sort keys %$regBank)
+    {
+        my $bank  = $regBank->{$r};
+        my $avail = $regMap->{$r};
+        foreach my $pos (0 .. $#$avail)
+        {
+            if ($bank == ($avail->[$pos] & 7))
+            {
+                $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+                last;
+            }
+        }
+    }
+
+    my (%liveTime, %pairedBanks, %reuseHistory);
+    foreach my $i (0 .. $#instructs)
+    {
+        next unless $i & 7;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            my $capData   = parseInstruct($inst, $gram) or next;
+            my $reuseType = $gram->{type}{reuse};
+
+            my (%addReuse, %delReuse);
+            foreach my $slot (qw(r8 r20 r39))
+            {
+                my $r = $capData->{$slot} or next;
+                next if $r eq 'RZ';
+
+                my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r};
+
+                if (my $liveTime = $liveTime{$liveR})
+                {
+                    push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                }
+                else
+                {
+                    warn "register used without initialization ($r): $inst\n" unless $nowarn;
+                    push @{$liveTime{$liveR}}, [$i,$i];
+                }
+
+                my $slotHist  = $reuseHistory{$slot} ||= {};
+                my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0;
+
+
+                if (!$selfReuse && ref $regMap->{$r})
+                {
+                    foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39))
+                    {
+                        my $r2 = $capData->{$slot2};
+                        next if $r2 eq 'RZ' || $r2 eq $r;
+
+                        my $slotHist2 = $reuseHistory{$slot2} ||= {};
+
+
+                        if (!$reuseType || !exists $slotHist2->{$r2})
+                        {
+                            if (ref $regMap->{$r2})
+                            {
+                                push @{$pairedBanks{$r}{pairs}}, $r2;
+                                $pairedBanks{$r}{banks} ||= [];
+                            }
+                            else
+                            {
+                                my $bank = substr($regMap->{$r2},1) & 7;
+
+                                $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++;
+                                $pairedBanks{$r}{pairs} ||= [];
+                            }
+                            $pairedBanks{$r}{useCnt}++;
+                        }
+                    }
+                }
+                if ($reuseType)
+                {
+                    if ($ctrl->{reuse}[($i & 7) - 1] & $reuseSlots{$slot})
+                        { $addReuse{$slot} = $r; }
+                    else
+                        { $delReuse{$slot} = $r; }
+                }
+            }
+            $reuseHistory{$_}{$addReuse{$_}} = 1    foreach keys %addReuse;
+            delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse;
+
+            foreach my $r0 (getVecRegisters($vectors, $capData))
+            {
+                my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0};
+
+                if (exists $noDest{$op})
+                {
+                    if (my $liveTime = $liveTime{$liveR})
+                    {
+                        push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                    }
+                    else
+                    {
+                        warn "register used without initialization ($r0): $inst\n" unless $nowarn;
+                        push @{$liveTime{$liveR}}, [$i,$i];
+                    }
+                }
+                elsif (my $liveTime = $liveTime{$liveR})
+                {
+                    if ($i > $liveTime->[$#$liveTime][1])
+                    {
+                        push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                    }
+                }
+                else
+                {
+                    push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                }
+            }
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    foreach my $r (sort {
+                    $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} ||
+                    $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} ||
+                    $a cmp $b
+                  } keys %pairedBanks)
+    {
+        my $banks = $pairedBanks{$r}{banks};
+        my $avail = $regMap->{$r};
+
+
+        BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..7))
+        {
+            foreach my $pos (0 .. $#$avail)
+            {
+                if ($bank == ($avail->[$pos] & 7))
+                {
+                    $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+
+                    $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}};
+                    last BANK;
+                }
+            }
+        }
+    }
+    foreach my $r (sort keys %$regMap)
+    {
+        if (ref($regMap->{$r}) eq 'ARRAY')
+        {
+            $regMap->{$r} = 'R' . shift @{$regMap->{$r}};
+        }
+    }
+
+    foreach my $i (0 .. $#instructs)
+    {
+        next unless $i & 7;
+
+        $instructs[$i]{orig} = $instructs[$i]{inst};
+        $instructs[$i]{inst} =~ s/(?<!\.)\b(\w+)\b(?!\[)/ exists($regMap->{$1}) ? $regMap->{$1} : $1 /ge;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            foreach my $r (qw(r0 r8 r20 r39))
+            {
+                next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ';
+
+                my $val = substr $capData->{$r}, 1;
+
+                my @r0 = getVecRegisters($vectors, $capData);
+                my @r8 = getAddrVecRegisters($vectors, $capData);
+
+                my $regInc = $r eq 'r0' ? scalar(@r0) || 1 : 1;
+                my $regInc = $r eq 'r8' ? scalar(@r8) || 1 : 1;
+
+                if ($val + $regInc > $regCnt)
+                {
+                    $regCnt = $val + $regInc;
+                }
+            }
+            if ($op eq 'BAR')
+            {
+                if (exists $capData->{i8w4})
+                {
+                    $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt;
+                }
+                elsif (exists $capData->{r8})
+                {
+                    $barCnt = 16;
+                }
+            }
+            my ($code, $reuse) = genCode($op, $gram, $capData);
+            $instructs[$i]{code} = $code;
+
+            if ($gram->{type}{reuse})
+                { $instructs[$i]{caps} = $capData; }
+            else
+                { $ctrl->{reuse}[($i & 7) - 1] = $reuse; }
+
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed);
+    foreach my $i (0 .. $#instructs)
+    {
+        if ($i & 7)
+        {
+            push @codes, $instructs[$i]{code};
+            my $code_dec= $instructs[$i]{code};
+            my $code_hex = sprintf("0x%x", $code_dec);
+
+            if ($instructs[$i]{caps})
+            {
+                registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 7) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn);
+            }
+            if ($instructs[$i]{inst} =~ m'EXIT')
+            {
+                push @exitOffsets, (scalar(@codes)-1)*8;
+            }
+            elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)')
+            {
+                push @ctaidOffsets, (scalar(@codes)-1)*8;
+                $ctaidzUsed = 1 if $1 eq 'Z';
+            }
+        }
+        else
+        {
+            my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)};
+            push @codes,
+                ($ctrl->[0] <<  2) | ($ctrl->[1] << 10) | ($ctrl->[2] << 18) | # ctrl codes
+                ($ctrl->[3] << 26) | ($ctrl->[4] << 34) | ($ctrl->[5] << 42) |
+                ($ctrl->[6] << 50) | (0x0800000000000000);  # reuse codes
+        }
+    }
+
+    return {
+        RegCnt       => $regCnt,
+        BarCnt       => $barCnt,
+        ExitOffsets  => \@exitOffsets,
+        CTAIDOffsets => \@ctaidOffsets,
+        CTAIDZUsed   => $ctaidzUsed,
+        ConflictCnt  => $reuseHistory{conflicts},
+        ReuseCnt     => $reuseHistory{reuse},
+        ReuseTot     => $reuseHistory{total},
+        ReusePct     => ($reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0),
+        KernelData   => \@codes,
+    };
+}
+
+# Test: re-encode every instruction of an annotated SASS dump and compare the
+# result with the op code / reuse flags recorded in the dump itself.
+#
+#   $fh             - open read handle on the dump
+#   $printConflicts - when true, the instruction text is handed to
+#                     registerHealth so that bank conflicts are printed
+#   $all            - when true, passing instructions are listed too
+#                     (failures are always listed)
+#
+# Prints a per-op table of graded instructions followed by a summary line and
+# returns the number of failed instructions.
+sub Test
+{
+    my ($fh, $printConflicts, $all) = @_;
+
+    my @instructs;
+    my %reuseHistory;
+    my ($pass, $fail) = (0,0);
+
+    while (my $line = <$fh>)
+    {
+        my (@ctrl, @reuse);
+
+        # only control lines start a group; everything else is skipped
+        next unless processSassCtrlLine($line, \@ctrl, \@reuse);
+
+        # one instruction line is consumed per reuse entry of the control line
+        foreach my $fileReuse (@reuse)
+        {
+            $line = <$fh>;
+
+            my $inst = processSassLine($line) or next;
+
+            $inst->{reuse} = $fileReuse;
+            my $fileCode = $inst->{code};
+
+            # ops listed in %relOffset: rewrite the first hex literal as a
+            # 24-bit offset relative to the following instruction (num + 8)
+            if (exists $relOffset{$inst->{op}})
+            {
+                $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e;
+            }
+
+            # the first grammar rule that parses the instruction is graded
+            my $match = 0;
+            foreach my $gram (@{$grammar{$inst->{op}}})
+            {
+                my $capData = parseInstruct($inst->{inst}, $gram) or next;
+                my @caps;
+
+                my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps);
+
+                registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? $inst->{inst} : '') if $gram->{type}{reuse};
+
+                $inst->{caps}      = join ', ', sort @caps;
+                $inst->{codeDiff}  = $fileCode  ^ $code;
+                $inst->{reuseDiff} = $fileReuse ^ $reuse;
+
+                if ($code == $fileCode && $reuse == $fileReuse)
+                {
+                    $inst->{grade} = 'PASS';
+                    push @instructs, $inst if $all;
+                    $pass++;
+                }
+                else
+                {
+                    $inst->{grade} = 'FAIL';
+                    push @instructs, $inst;
+                    $fail++;
+                }
+                $match = 1;
+                last;
+            }
+            # no rule parsed the instruction: report the whole code as the diff
+            unless ($match)
+            {
+                $inst->{grade}     = 'FAIL';
+                $inst->{codeDiff}  = $fileCode;
+                $inst->{reuseDiff} = $fileReuse;
+                push @instructs, $inst;
+                $fail++;
+            }
+        }
+    }
+    # widest instruction text per op, used to size the report column
+    my %maxLen;
+    foreach (@instructs)
+    {
+        $maxLen{$_->{op}} = length($_->{ins}) if length($_->{ins}) > $maxLen{$_->{op}};
+    }
+    my ($lastOp, $template);
+    foreach my $inst (sort {
+        $a->{op}        cmp $b->{op}        ||
+        $a->{codeDiff}  <=> $b->{codeDiff}  ||
+        $a->{reuseDiff} <=> $b->{reuseDiff} ||
+        $a->{ins}       cmp $b->{ins}
+        } @instructs)
+    {
+        # a new op starts a new table section with its own header and width
+        if ($lastOp ne $inst->{op})
+        {
+            $lastOp   = $inst->{op};
+            $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s   %s\n";
+            printf "\n%s %-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s   %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures);
+        }
+        printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)};
+    }
+    my $reusePct = $reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0;
+
+    # a literal percent sign must be written as %% inside a printf format, and
+    # the counters are passed as arguments rather than spliced into the format
+    printf "\nRegister Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\nOp Code Coverage Totals: Pass: %d Fail: %d\n",
+        $reuseHistory{conflicts}, $reusePct, $reuseHistory{reuse}, $reuseHistory{total}, $pass, $fail;
+
+    return $fail;
+}
+
+# Extract: turn an annotated SASS dump ($in) into assembler source ($out).
+#
+#   $in     - read handle on the dump
+#   $out    - write handle for the generated source
+#   $params - array ref of "ord:offset:size:align" strings, one per kernel
+#             parameter (offset is a hex string)
+#
+# A <CONSTANT_MAPPING> section naming the launch dimensions and the kernel
+# parameters is written first. Then every instruction is printed with its
+# control code, jump targets replaced by TARGETn labels and known constant
+# addresses replaced by their symbolic names.
+sub Extract
+{
+    my ($in, $out, $params) = @_;
+
+    # launch-dimension names and the constant addresses they live at
+    my %builtin =
+    (
+        blockDimX => 'c[0x0][0x28]',
+        blockDimY => 'c[0x0][0x2c]',
+        blockDimZ => 'c[0x0][0x30]',
+        gridDimX  => 'c[0x0][0x34]',
+        gridDimY  => 'c[0x0][0x38]',
+        gridDimZ  => 'c[0x0][0x3c]',
+    );
+    # reverse lookup: constant address text => symbolic name
+    my %paramMap;
+
+    print $out "<CONSTANT_MAPPING>\n";
+    for my $name (sort keys %builtin)
+    {
+        my $addr = $builtin{$name};
+        print $out "    $name : $addr\n";
+        $paramMap{$addr} = $name;
+    }
+    print $out "\n";
+
+    for my $spec (@$params)
+    {
+        my ($ord, $offset, $size) = split ':', $spec;
+
+        if ($size <= 4)
+        {
+            # a parameter of at most 4 bytes gets a single name
+            my $param = sprintf 'param_%d', $ord;
+            my $const = sprintf 'c[0x0][%s]', $offset;
+            $paramMap{$const} = $param;
+            print $out "    $param : $const\n";
+            next;
+        }
+
+        # larger parameters are exposed as 4-byte words param_N[k]
+        my $word = 0;
+        my $addr = hex $offset;
+        while ($size > 0)
+        {
+            my $param = sprintf 'param_%d[%d]', $ord, $word;
+            my $const = sprintf 'c[0x0][0x%x]', $addr;
+            $paramMap{$const} = $param;
+            print $out "    $param : $const\n";
+            $word++;
+            $addr += 4;
+            $size -= 4;
+        }
+    }
+    print $out "</CONSTANT_MAPPING>\n\n";
+
+    my (%labels, @data);
+    my $nextLabel = 1;
+
+    FILE: while (my $line = <$in>)
+    {
+        my (@ctrl, @ruse);
+        processSassCtrlLine($line, \@ctrl, \@ruse) or next FILE;
+
+        # each control code is followed by one instruction line
+        CTRL: for my $ctrl (@ctrl)
+        {
+            $line = <$in>;
+
+            my $inst = processSassLine($line);
+            next CTRL unless $inst;
+
+            if (exists $jumpOp{$inst->{op}} && $inst->{ins} =~ m'(0x[0-9a-f]+)')
+            {
+                my $target = hex $1;
+
+                # a BRA to itself or to the preceding instruction ends the extraction
+                if ($inst->{op} eq 'BRA')
+                {
+                    last FILE if $target == $inst->{num} || $target == $inst->{num} - 8;
+                }
+
+                # allocate a label the first time a target address is seen
+                $labels{$target} ||= 'TARGET' . $nextLabel++;
+                my $label = $labels{$target};
+
+                $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/;
+            }
+            $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $paramMap{$1 . $2} || $1 . $2 /eg;
+
+            $inst->{ctrl} = printCtrl($ctrl);
+
+            push @data, $inst;
+        }
+    }
+    for my $inst (@data)
+    {
+        my $addr = $inst->{num};
+        print $out "$labels{$addr}:\n" if exists $labels{$addr};
+        printf $out "%s %5s%s\n", @{$inst}{qw(ctrl pred ins)};
+    }
+}
+
+# Patterns for the markup sections recognised by Preprocess. All are compiled
+# with /m and /s, so ^ anchors at any line start and . also matches newlines.
+# <COMMENT> ... </COMMENT> block (removed from the source)
+my $CommentRe  = qr'^[\t ]*<COMMENT>.*?^\s*</COMMENT>\n?'ms;
+# <INCLUDE file="..."/> tag; capture 1 is the file name
+my $IncludeRe  = qr'^[\t ]*<INCLUDE\s+file="([^"]+)"\s*/?>\n?'ms;
+# <CODE> ... </CODE> block with an optional numeric suffix that must match on
+# the closing tag; capture 1 is the suffix, capture 2 the body
+my $CodeRe     = qr'^[\t ]*<CODE(\d*)>(.*?)^\s*<\/CODE\1>\n?'ms;
+# <CONSTANT_MAPPING> ... </CONSTANT_MAPPING>; capture 1 is the body
+my $ConstMapRe = qr'^[\t ]*<CONSTANT_MAPPING>(.*?)^\s*</CONSTANT_MAPPING>\n?'ms;
+# <REGISTER_MAPPING> ... </REGISTER_MAPPING>; capture 1 is the body
+my $RegMapRe   = qr'^[\t ]*<REGISTER_MAPPING>(.*?)^\s*</REGISTER_MAPPING>\n?'ms;
+# <SCHEDULE_BLOCK> ... </SCHEDULE_BLOCK>; capture 1 is the body
+my $ScheduleRe = qr'^[\t ]*<SCHEDULE_BLOCK>(.*?)^\s*</SCHEDULE_BLOCK>\n?'ms;
+# inline code: [+ ... +] or [- ... -]; capture 1 is the sign, capture 2 the code
+my $InlineRe   = qr'\[(\+|\-)(.+?)\1\]'ms;
+
+# IncludeFile: return the complete contents of an included file.
+#
+#   $file    - path as written in the INCLUDE tag
+#   $include - array ref whose elements are passed to File::Spec->catpath in
+#              front of the file's base name (presumably [volume, directory];
+#              confirm against the caller)
+#
+# The path is tried as given first; if that fails, the base name is looked up
+# through $include. Dies when neither can be opened.
+sub IncludeFile
+{
+    my ($file, $include) = @_;
+    my (undef, undef, $name) = File::Spec->splitpath($file);
+
+    # slurp mode: the single read below returns the whole file
+    local $/;
+
+    # three-argument open: the path is never interpreted as a mode, pipe or
+    # whitespace-trimmed name
+    my $fh;
+    unless (open $fh, '<', $file)
+    {
+        my $fallback = File::Spec->catpath(@$include, $name);
+        open $fh, '<', $fallback or die "Could not open file for INCLUDE: $file ($!)\n";
+    }
+    my $content = <$fh>;
+    close $fh;
+    return $content;
+}
+
+# Preprocess: expand all markup in an assembler source text and return the
+# resulting text.
+#
+#   $file    - the source text itself (not a file name)
+#   $include - passed through to IncludeFile for resolving INCLUDE tags
+#   $debug   - passed through to Scheduler
+#   $regMap  - optional hash ref to fill from REGISTER_MAPPING sections; when
+#              supplied, those sections are removed from the returned text,
+#              otherwise they are left in place
+#
+# The passes run in a fixed order: includes, comments, CODE blocks, inline
+# code, constant mapping, constant substitution, register mapping, scheduling.
+sub Preprocess
+{
+    my ($file, $include, $debug, $regMap) = @_;
+
+    my $constMap = {};
+    my $removeRegMap;
+    if ($regMap)
+        { $removeRegMap = 1; }
+    else
+        { $regMap = {}; }
+
+    # repeat until no INCLUDE tag is left, so included files may include others
+    1 while $file =~ s|$IncludeRe| IncludeFile($1, $include) |eg;
+
+    $file =~ s|$CommentRe||g;
+
+    # each CODE block is evaluated as Perl in the KeplerAs::KeplerAs::CODE
+    # package and replaced by the value it returns; repeated so that generated
+    # text may itself contain CODE blocks
+    # NOTE(review): this is a string eval of the source text - only run it on
+    # trusted input
+    1 while $file =~ s|$CodeRe|
+        my $out = eval "package KeplerAs::KeplerAs::CODE; $2";
+        $@ ? die("CODE:\n$2\n\nError: $@\n") : $out |eg;
+
+    # inline code: [+ ... +] is replaced by its value, [- ... -] is evaluated
+    # and replaced by the empty string
+    $file =~ s|$InlineRe|
+        my ($type, $code) = ($1, $2);
+        my $out = eval "package KeplerAs::KeplerAs::CODE; $code";
+        $@ ? die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg;
+
+    # collect name => value pairs; setConstMap returns nothing, so the section
+    # disappears from the text
+    $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg;
+
+    # substitute constant names (optionally followed by [digits]) on every
+    # line that is not a whole-line # or // comment
+    my @newFile;
+    foreach my $line (split "\n", $file)
+    {
+        if ($line !~ m'^\s*(?:#|//).*')
+        {
+            $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg;
+        }
+        push @newFile, $line;
+    }
+    $file = join "\n", @newFile;
+
+    # fill $regMap; keep the section text ($&) unless the caller supplied a map
+    $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg;
+
+    # pull out the body of every SCHEDULE_BLOCK, in order of appearance
+    my @schedBlocks = $file =~ /$ScheduleRe/g;
+
+    foreach my $i (0 .. $#schedBlocks)
+    {
+        $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]);
+
+        # block numbers handed to Scheduler are 1-based
+        $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug);
+    }
+
+    # put the scheduled text back, one block per match, in the same order
+    $file =~ s|$ScheduleRe| shift @schedBlocks |eg;
+
+    return $file;
+}
+
+# Capture names that Scheduler treats as operands when building dependencies.
+# captures read by an instruction
+my %srcReg   = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X);
+# captures written by an instruction (unless its op is listed in %noDest)
+my %destReg  = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC);
+# union of both sets
+my %regops   = (%srcReg, %destReg);
+# per-rule type fields copied from the grammar onto each instruction
+my @itypes   = qw(class lat rlat tput dual);
+
+# Scheduler: reorder the instructions of one SCHEDULE_BLOCK and assign their
+# stall counts.
+#
+#   $block    - text of the block
+#   $blockNum - 1-based block number, used in error messages only
+#   $regMap   - register map as filled by setRegisterMap
+#   $debug    - when true, the ready list and each decision are printed
+#
+# Works in three phases:
+#   1. parse the lines into instruction records (labels are rejected,
+#      <ORDERED> sections are noted),
+#   2. build the dependency graph: writer -> reader edges carrying a latency,
+#      reader -> writer edges with latency 0, and a chain through consecutive
+#      ordered instructions,
+#   3. repeatedly take the best entry of the ready list, write its stall count
+#      into the low bits of a control code, and release its children.
+# Returns the scheduled block as text.
+sub Scheduler
+{
+    my ($block, $blockNum, $regMap, $debug) = @_;
+
+    my $vectors = $regMap->{__vectors};
+    my $lineNum = 0;
+
+    # ---- phase 1: parse ----
+    my (@instructs, @comments, $ordered, $first);
+    foreach my $line (split "\n", $block)
+    {
+        $lineNum++;
+
+        # blank and comment-only lines are set aside
+        unless (preProcessLine($line))
+        {
+            push @comments, $line if $line =~ m'\S';
+            next;
+        }
+
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # 'first' is 0 only for the very first instruction of the block,
+            # and only when its control code has any of the 0x1f800 bits set;
+            # every other instruction gets 1 (lower values sort earlier)
+            $inst->{first}   = !$first++ && ($inst->{ctrl} & 0x1f800) ? 0 : 1;
+
+            $inst->{exeTime} = 0;
+            # inside <ORDERED>, number the instructions 1, 2, 3, ...
+            $inst->{order}   = $ordered++ if $ordered;
+            push @instructs, $inst;
+        }
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n";
+        }
+        elsif ($line =~ m'^<ORDERED>')
+        {
+            die "you cannot use nested <ORDERED> tags" if $ordered;
+            $ordered = 1;
+        }
+        elsif ($line =~ m'^</ORDERED>')
+        {
+            die "missing opening <ORDERED> for closing </ORDERED> tag" if !$ordered;
+            $ordered = 0;
+        }
+        else
+        {
+            die "badly formed line at block: $blockNum line: $lineNum: $line\n";
+        }
+    }
+
+    # ---- phase 2: dependency graph ----
+    # %writes / %reads map an operand name to the instructions seen so far
+    # that write / read it
+    my (%writes, %reads, @ready, @schedule, $orderedParent);
+    foreach my $instruct (@instructs)
+    {
+        my $match = 0;
+        foreach my $gram (@{$grammar{$instruct->{op}}})
+        {
+            my $capData = parseInstruct($instruct->{inst}, $gram) or next;
+            my (@dest, @src);
+
+            # copy class/lat/rlat/tput/dual from the matching grammar rule
+            @{$instruct}{@itypes} = @{$gram->{type}}{@itypes};
+
+            # a predicated instruction reads its predicate register
+            push @src, $instruct->{predReg} if $instruct->{pred};
+
+            # P2R / R2P with a predicate mask: bits set in the mask select the
+            # predicates read (P2R) or written (R2P); for R2P the unselected
+            # predicates are recorded as sources
+            if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7})
+            {
+                my $list = $instruct->{op} eq 'R2P' ? \@dest : \@src;
+                my $mask = hex($capData->{i20w7});
+                foreach my $p (0..6)
+                {
+                    if ($mask & (1 << $p))
+                    {
+                        push @$list, "P$p";
+                    }
+                    elsif ($instruct->{op} eq 'R2P')
+                    {
+                        push @src, "P$p";
+                    }
+                }
+            }
+
+            foreach my $operand (grep { exists $regops{$_} } sort keys %$capData)
+            {
+                # destination captures count as sources for ops in %noDest
+                my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src;
+
+                # RZ / PT never create dependencies
+                my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT';
+
+                if ($capData->{$operand} ne $badVal)
+                {
+                    # r0 and r8 may stand for a whole vector of registers;
+                    # both CC and X are tracked under the single name 'CC'
+                    push @$list,
+                        $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) :
+                        $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) :
+                        $operand eq 'CC' ? 'CC' :
+                        $operand eq 'X'  ? 'CC' :
+                        getRegNum($regMap, $capData->{$operand});
+                }
+            }
+            $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39});
+
+            # read-after-write: this instruction becomes a child of the
+            # writers of each of its sources
+            foreach my $src (grep { exists $writes{$_} } @src)
+            {
+                my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat};
+
+                foreach my $parent (@{$writes{$src}})
+                {
+                    # predicate results use a fixed latency of 13
+                    my $latency = $src =~ m'^P\d' ? 13 : $parent->{lat};
+                    push @{$parent->{children}}, [$instruct, $latency - $regLatency];
+                    $instruct->{parents}++;
+
+                    # writers are stored newest first; stop at the first
+                    # unpredicated one
+                    last unless $parent->{pred};
+                }
+            }
+
+            # write-after-read: this instruction becomes a zero-latency child
+            # of every earlier reader of its destinations
+            foreach my $dest (grep { exists $reads{$_} } @dest)
+            {
+                foreach my $reader (@{$reads{$dest}})
+                {
+                    push @{$reader->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                delete $reads{$dest} unless $instruct->{pred};
+            }
+
+            # consecutive ordered instructions are chained with latency 0
+            if ($instruct->{order})
+            {
+                if ($orderedParent)
+                {
+                    push @{$orderedParent->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                $orderedParent = $instruct;
+            }
+            elsif ($orderedParent)
+                {  $orderedParent = 0; }
+
+            # newest writer goes to the front of the list
+            unshift @{$writes{$_}}, $instruct foreach @dest;
+
+            push @{$reads{$_}}, $instruct foreach @src;
+
+            # no parents at all: ready from the start
+            push @ready, $instruct if !exists $instruct->{parents};
+
+            $match = 1;
+            last;
+        }
+        # report the line of the offending instruction, not the last line of
+        # the block that $lineNum holds after the parse loop
+        die "Unable to recognize instruction at block: $blockNum line: $instruct->{lineNum}: $instruct->{inst}\n" unless $match;
+    }
+    %writes = ();
+    %reads  = ();
+
+    if (@ready)
+    {
+        # temporary root above all initially ready instructions, used to
+        # compute each node's 'deps' count
+        my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" };
+
+        countUniqueDescendants($readyParent, {});
+        updateDepCounts($readyParent, {});
+
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    # ---- phase 3: list scheduling ----
+    my $clock = 0;
+    while (my $instruct = shift @ready)
+    {
+        my $stall = $instruct->{stall};
+
+        # the stall before this instruction is stored in the control code of
+        # the previously scheduled one; its low bits are cleared first
+        # (5 bits when the stall exceeds 4, otherwise 4 bits)
+        if (@schedule && $stall < 16)
+        {
+            my $prev = $schedule[$#schedule];
+
+            $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0;
+            $prev->{ctrl} |= $stall;
+            $clock += $stall;
+        }
+        else
+        {
+            $instruct->{ctrl} &= 0x1fff0;
+            $instruct->{ctrl} |= 1;
+            $clock += 1;
+        }
+        print "$clock: $instruct->{inst}\n" if $debug;
+
+        push @schedule, $instruct;
+
+        # release the children: push their earliest start time forward and
+        # move them to the ready list once their last parent is scheduled
+        if (my $children = $instruct->{children})
+        {
+            foreach (@$children)
+            {
+                my ($child, $latency) = @$_;
+
+                my $earliest = $clock + $latency;
+                $child->{exeTime} = $earliest if $child->{exeTime} < $earliest;
+
+                print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug;
+
+                push @ready, $child if --$child->{parents} < 1;
+            }
+            delete $instruct->{children};
+        }
+
+        # recompute stall and mix of every ready instruction relative to the
+        # one just scheduled
+        foreach my $ready (@ready)
+        {
+            $stall = $ready->{exeTime} - $clock;
+            $stall = 1 if $stall < 1;
+
+            # same class: the stall is at least the throughput value
+            if ($ready->{class} eq $instruct->{class})
+            {
+                $stall = $ready->{tput} if $stall < $ready->{tput};
+            }
+            # otherwise a stall of 0 is used when the candidate is flagged
+            # dual, the scheduled instruction is not, and both do not use a
+            # constant operand
+            elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 &&
+                   $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const}))
+            {
+                $stall = 0;
+            }
+            $ready->{stall} = $stall;
+
+            # 1 when the class differs from the instruction just scheduled
+            $ready->{mix} = $ready->{class} ne $instruct->{class} || 0;
+        }
+
+        # priority: first flag, smallest stall, class change, most
+        # dependents, then source order
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $a->{stall}   <=> $b->{stall}  ||
+            $b->{mix}     <=> $a->{mix}    ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "\tf,ext,stl,mix,dep,lin, inst\n";
+            # the flag is stored under 'first', as in the initial dump above
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    my $out;
+    $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule;
+    return $out;
+}
+
+# setConstMap: parse the body of a CONSTANT_MAPPING section.
+#
+#   $constMap     - hash ref that receives name => value pairs
+#   $constMapText - section text, one "name : value" entry per line; text
+#                   from # or // to the end of a line is ignored
+#
+# Returns nothing.
+sub setConstMap
+{
+    my ($constMap, $constMapText) = @_;
+
+    for my $entry (split "\n", $constMapText)
+    {
+        # trim the front, cut the comment, trim the back
+        $entry =~ s|^\s+||;
+        $entry =~ s{(?:#|//).*}{};
+        $entry =~ s|\s+$||;
+
+        # nothing left on this line
+        next if $entry !~ m'\S';
+
+        my ($key, $text) = split '\s*:\s*', $entry;
+        $constMap->{$key} = $text;
+    }
+    return;
+}
+
+# setRegisterMap: parse the body of a REGISTER_MAPPING section into $regMap.
+#
+#   $regMap     - hash ref that receives name => register entries; the
+#                 sub-hashes $regMap->{__vectors} and $regMap->{__regbank} are
+#                 created on demand
+#   $regmapText - section text, one mapping per line; text from # or // to
+#                 the end of a line is ignored
+#
+# A line is "numbers SEP names", where numbers is a comma list of register
+# numbers or ranges (a-b) and SEP selects the mode:
+#   :   fixed  - the n-th name is mapped to 'R' . the n-th number
+#   ~   auto   - every name is mapped to the (shared) array of numbers
+#   =   share  - every name is mapped to the single given register
+# A name may be plain (foo), carry a bank request (foo[2], auto mode only) or
+# describe a series (foo<0-3|8>bar), which expands to foo0bar, foo1bar, ...
+#
+# For fixed mappings whose register numbers are strictly consecutive, names
+# sitting on an even register also get a vector entry in __vectors: four
+# names when bit 1 of the register number is clear and at least three more
+# names follow, otherwise two names (if one more name follows).
+sub setRegisterMap
+{
+    my ($regMap, $regmapText) = @_;
+
+    my $vectors = $regMap->{__vectors} ||= {};
+    my $regBank = $regMap->{__regbank} ||= {};
+    my %aliases;
+
+    foreach my $line (split "\n", $regmapText)
+    {
+        $line =~ s|^\s+||;
+        $line =~ s{(?:#|//).*}{};
+        $line =~ s|\s+$||;
+        next unless $line =~ m'\S';
+
+        my $auto  = $line =~ /~/;
+        my $share = $line =~ /=/;
+
+        my ($regNums, $regNames) = split '\s*[:~=]\s*', $line;
+
+        # expand the number list; "a-b" becomes a .. b
+        my (@numList, @nameList, %vecAliases);
+        foreach my $num (split '\s*,\s*', $regNums)
+        {
+            my ($start, $stop) = split '\s*\-\s*', $num;
+            die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop;
+            push @numList, ($start .. $stop||$start);
+        }
+        foreach my $fullName (split '\s*,\s*', $regNames)
+        {
+            # series form: prefix<ranges>suffix with an optional [bank]
+            if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$')
+            {
+                my ($name1, $name2, $bank) = ($1, $3, $4);
+                foreach (split '\s*\|\s*', $2)
+                {
+                    my ($start, $stop) = split '\s*\-\s*';
+                    foreach my $r (map "$name1$_$name2", $start .. $stop||$start)
+                    {
+                        # remember prefix.suffix as a possible alias of each member
+                        $aliases{$r} = "$name1$name2" unless exists $aliases{$r};
+                        push @nameList, $r;
+                        $regBank->{$r} = $bank if $auto && defined $bank;
+                        warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank;
+                    }
+                }
+            }
+            # plain name with an optional [bank]
+            elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$')
+            {
+                push @nameList, $1;
+                $regBank->{$1} = $2 if $auto && defined $2;
+                warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2;
+            }
+            else
+            {
+                die "Bad register name: '$fullName' at: $line\n";
+            }
+        }
+        die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList;
+        die "Missmatched register mapping at: $line\n" if $share && @numList > 1;
+
+        # the list is ascending only if EVERY neighbouring pair differs by
+        # exactly one; walk up to and including the last pair
+        my $i = 0;
+        while ($i < $#numList)
+        {
+            last if $numList[$i] + 1 != $numList[$i+1];
+            $i++;
+        }
+        my $ascending = @numList > 1 && $i == $#numList;
+
+        foreach my $n (0..$#nameList)
+        {
+            die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]};
+
+            if ($auto)
+            {
+                # all names of the line share one array of candidate numbers
+                $regMap->{$nameList[$n]} = \@numList;
+            }
+            elsif ($share)
+            {
+                $regMap->{$nameList[$n]} = 'R' . $numList[0];
+            }
+            else
+            {
+                $regMap->{$nameList[$n]} = 'R' . $numList[$n];
+                if ($ascending && ($numList[$n] & 1) == 0)
+                {
+                    # vector length: 2 names when bit 1 of the register number
+                    # is set or fewer than 3 names follow, otherwise 4
+                    my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3);
+                    if ($end <= $#nameList)
+                    {
+                        $vectors->{$nameList[$n]} = [ @nameList[$n .. $end] ];
+                        # the first vector of a series also becomes reachable
+                        # under the series' alias name, unless that is taken
+                        if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}})
+                        {
+                            $regMap->{$aliases{$nameList[$n]}}  = $regMap->{$nameList[$n]};
+                            $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]};
+                            delete $aliases{$nameList[$n]};
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+# preProcessLine: strip leading whitespace from the caller's line IN PLACE and
+# report whether anything other than whitespace is left once a trailing
+# # or // comment is ignored. The comment itself is not removed from the
+# caller's line.
+sub preProcessLine
+{
+    # $_[0] aliases the caller's variable, so this edit is visible outside
+    $_[0] =~ s|^\s+||;
+
+    # the comment is cut from a private copy only
+    my ($copy) = @_;
+    $copy =~ s{(?:#|//).*}{};
+
+    return $copy =~ m'\S';
+}
+
+# countUniqueDescendants: walk the dependency graph below $node and record,
+# in $node->{deps} (a hash keyed by lineNum), every node reachable through
+# edges with a non-zero latency.
+#
+#   $node  - graph node; children is an array of [child, latency] pairs
+#   $edges - hash ref of "parentLine^childLine" keys already traversed,
+#            shared across the whole walk
+#
+# Returns the node's own lineNum followed by the lineNums collected in its
+# deps hash; a node without children returns just its lineNum.
+sub countUniqueDescendants
+{
+    my ($node, $edges) = @_;
+
+    my $kids = $node->{children};
+    return $node->{lineNum} unless $kids;
+
+    for my $edge (@$kids)
+    {
+        my ($child, $latency) = @$edge;
+
+        # zero-latency edges are not followed
+        next unless $latency;
+
+        # each edge is followed only once
+        my $key = "$node->{lineNum}^$child->{lineNum}";
+        next if $edges->{$key}++;
+
+        my @below = countUniqueDescendants($child, $edges);
+        $node->{deps}{$_}++ for @below;
+    }
+    return ($node->{lineNum}, keys %{$node->{deps}});
+}
+# updateDepCounts: walk the graph below $node and replace every deps hash by
+# the number of keys it holds; nodes without a deps hash end up with their
+# existing value forced to a number (0 when unset).
+#
+#   $node  - graph node; children is an array of [child, latency] pairs
+#   $edges - hash ref of "parentLine^childLine" keys already traversed
+sub updateDepCounts
+{
+    my ($node, $edges) = @_;
+
+    if (my $kids = $node->{children})
+    {
+        for my $edge (@$kids)
+        {
+            my $child = $edge->[0];
+
+            # each edge is followed only once
+            my $key = "$node->{lineNum}^$child->{lineNum}";
+            next if $edges->{$key}++;
+
+            updateDepCounts($child, $edges);
+        }
+    }
+
+    my $deps = $node->{deps};
+    $node->{deps} = ref $deps ? scalar(keys %$deps) : $deps + 0;
+}
+
+# registerHealth: update the reuse / bank-conflict statistics for one
+# instruction.
+#
+#   $reuseHistory - hash ref holding the running state: per-slot hashes of
+#                   registers currently flagged for reuse, plus the counters
+#                   total, reuse and conflicts
+#   $reuseFlags   - reuse bit mask of this instruction (tested against
+#                   %reuseSlots)
+#   $capData      - captured operands of the instruction
+#   $instAddr     - instruction address, used in the conflict message
+#   $inst         - instruction text for the conflict message; a false value
+#                   suppresses the message
+#   $nowarn       - when true, the conflict message is suppressed
+#
+# Returns the number of registers involved in bank conflicts (0 when none).
+sub registerHealth
+{
+    my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_;
+
+    my (@banks, @conflicts);
+
+    foreach my $slot (qw(r8 r20 r39))
+    {
+        my $r = $capData->{$slot} or next;
+        next if $r eq 'RZ';
+
+        my $slotHist = $reuseHistory->{$slot} ||= {};
+
+        $reuseHistory->{total}++;
+
+        # a register flagged for reuse in this slot by an earlier instruction
+        # counts as a reuse hit and is not checked for conflicts
+        if (exists $slotHist->{$r})
+        {
+            $reuseHistory->{reuse}++;
+        }
+        else
+        {
+            # the bank is the register number modulo 8
+            my $bank = substr($r,1) & 7;
+
+            # a different register already seen in the same bank: conflict
+            if ($banks[$bank] && $banks[$bank] ne $r)
+            {
+                push @conflicts, $banks[$bank] if !@conflicts;
+                push @conflicts, $r;
+
+                $reuseHistory->{conflicts}++;
+            }
+            $banks[$bank] = $r;
+        }
+
+        # remember or forget the register according to this instruction's flag
+        if ($reuseFlags & $reuseSlots{$slot})
+            { $slotHist->{$r} = 1; }
+        else
+            { delete $slotHist->{$r};  }
+    }
+    if ($inst && @conflicts && !$nowarn)
+    {
+        # the instruction text is passed as an argument so that a % inside it
+        # cannot be taken for a format directive
+        printf "CONFLICT at 0x%04x (%s): %s\n", $instAddr, join(',', @conflicts), $inst;
+    }
+    return scalar @conflicts;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+KeplerAs::KeplerAs - Assembler for NVIDIA Kepler architecture
+
+=head1 SYNOPSIS
+
+    KeplerAs.pl [opts]
+
+=head1 DESCRIPTION
+
+See the documentation at: https://github.com/NervanaSystems/KeplerAs
+
+=head1 SEE ALSO
+
+See the documentation at: https://github.com/NervanaSystems/KeplerAs
+
+
+=head1 AUTHOR
+
+Scott Gray, E<lt>sgray@nervanasys.comE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Scott Gray
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+=cut
diff --git a/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAsGrammar.pm b/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAsGrammar.pm
new file mode 100644
index 0000000..d372ea3
--- /dev/null
+++ b/Assembler/KeplerAs/blib/lib/KeplerAs/KeplerAsGrammar.pm
@@ -0,0 +1,1659 @@
+package KeplerAs::KeplerAsGrammar;
+
+use strict;
+use Carp;
+use Exporter;
+use Data::Dumper;
+our @ISA = qw(Exporter);
+
+our @EXPORT = qw(
+    %grammar %flags
+    parseInstruct genCode genReuseCode
+    processAsmLine processSassLine processSassCtrlLine
+    replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters
+);
+
+require 5.10.0;
+
+# getI($orig, $pos, $mask) - encode an integer immediate operand.
+#
+#   $orig : the immediate as written: a decimal number, a hex literal
+#           (0x.. or 0X..), or an index expression "<mul>x<expr>"
+#           (e.g. 4x<$i+1>), each optionally preceded by '-'.
+#   $pos  : number of bits the encoded value is shifted left.
+#   $mask : bit mask of the field; a non-negated value must fit inside it.
+#
+# Returns the value shifted left by $pos.  Negated values are wrapped into
+# $mask (two's complement).  Dies with "Immediate value out of range" when
+# the value has bits outside $mask.
+sub getI
+{
+    my ($orig, $pos, $mask) = @_;
+    my $val = $orig;
+    # Remove a leading minus sign; the substitution is true when one was found.
+    my $neg = $val =~ s|^\-||;
+
+    if ($val  =~ m'^(\d+)[xX]<([^>]+)>')
+    {
+        # Index expression: evaluate the text between <> as perl and multiply
+        # the result by the leading decimal number.
+        my $mul = $1;
+        my $exp = $2;
+        # Drop leading zeros so that perl does not read the numbers as octal.
+        $exp =~ s/(?<!\d)0+(?=[1-9])//g;
+        # Declare every scalar used in the expression with 'our' so that it
+        # resolves to a package global of KeplerAs::KeplerAs::CODE under strict.
+        my @globals = $exp =~ m'\$\w+'g;
+        my $our = @globals ? ' our (' . join(',',@globals) . ');' : '';
+        # NOTE(review): string eval of assembler source text; the input must be
+        # trusted.  A failing eval is not reported and contributes 0.
+        $val = $mul * eval "package KeplerAs::KeplerAs::CODE;$our $exp";
+    }
+    elsif ($val  =~ m'^0[xX][0-9a-zA-Z]+')
+    {
+        # Accept both 0x and 0X, like the $hex operand pattern does; with only
+        # the lowercase prefix recognised, "0X10" silently encoded as 0.
+        $val = hex($val);
+    }
+
+    if ( $neg )
+    {
+        $val = -$val;
+        $val &= $mask;
+    }
+    if (($val & $mask) != $val)
+    {
+        # $orig is passed as an argument rather than interpolated into the
+        # format, so a '%' inside an index expression cannot act as a
+        # conversion specifier.
+        die sprintf "Immediate value out of range(0x%x): 0x%x (%s)\n", $mask, $val, $orig;
+    }
+    return $val << $pos;
+}
+# getF($val, $pos, $type, $trunc) - encode a floating point immediate.
+#
+#   $val   : the immediate as written: a hex literal (0x.. or 0X..), a string
+#            containing "inf" (any case), or a decimal floating point number.
+#   $pos   : number of bits the encoded value is shifted left.
+#   $type  : pack template for the number, 'f' (single) or 'd' (double).
+#   $trunc : optional; when true, the number of low bits dropped from the
+#            packed representation, after which 19 bits (0x7ffff) are kept.
+#
+# Returns the encoded bits shifted left by $pos.  Hex literals are used as
+# given, without truncation.
+sub getF
+{
+    my ($val, $pos, $type, $trunc) = @_;
+    if ($val  =~ m'^0[xX][0-9a-zA-Z]+')
+    {
+        # Accept both 0x and 0X: the $f20w32 pattern matches hex literals
+        # case-insensitively, and an unrecognised "0X.." prefix used to fall
+        # through to pack() and encode as 0.0.
+        $val = hex($val);
+    }
+    elsif ($val =~ m'INF'i)
+    {
+        # Fixed bit patterns for infinity: pre-truncated for 'f' / other types
+        # when $trunc is set, the full single-precision pattern otherwise.
+        $val = $trunc ? ($type eq 'f' ? 0x7f800 : 0x7ff00) : 0x7f800000;
+    }
+    else
+    {
+        # Reinterpret the packed float/double as an unsigned integer of the
+        # same width ('L' = 32 bit, 'Q' = 64 bit).
+        $val = unpack(($type eq 'f' ? 'L' : 'Q'), pack $type, $val);
+
+        # The 19-bit mask also removes the sign bit.
+        $val = ($val >> $trunc) & 0x7ffff if $trunc;
+    }
+    return $val << $pos;
+}
+# getR($val, $pos) - encode a register operand.
+#
+#   $val : register name, "R<decimal>" with an index below 255, or "RZ".
+#   $pos : number of bits the register number is shifted left.
+#
+# RZ encodes as 0xff.  Dies with "Bad register name found" for any other
+# spelling or for an index of 255 and above.
+sub getR
+{
+    my ($val, $pos) = @_;
+
+    # In list context the match yields the captured index (or nothing).
+    my ($id) = $val =~ m'^R(\d+|Z)$';
+
+    die "Bad register name found: $val\n"
+        unless defined $id && ($id eq 'Z' || $id < 255);
+
+    my $num = $id eq 'Z' ? 0xff : $id;
+    return $num << $pos;
+}
+# getP($val, $pos) - encode a predicate operand.
+#
+#   $val : predicate name, "P0".."P6" or "PT".
+#   $pos : number of bits the predicate number is shifted left.
+#
+# PT encodes as 7.  Dies with "Bad predicate name found" for anything else.
+sub getP
+{
+    my ($val, $pos) = @_;
+
+    # In list context the match yields the single captured character (or nothing).
+    my ($id) = $val =~ m'^P(\d|T)$';
+
+    die "Bad predicate name found: $val\n"
+        unless defined $id && ($id eq 'T' || $id < 7);
+
+    my $num = $id eq 'T' ? 7 : $id;
+    return $num << $pos;
+}
+# getC($hexString) - encode the offset of a c[..][..] operand.
+# The hex string is converted to a number, divided by four (>> 2), limited to
+# 14 bits and placed at bit 23.
+# NOTE(review): presumably a byte offset turned into a 32-bit word index - confirm.
+sub getC
+{
+    my $word = (hex($_[0]) >> 2) & 0x3fff;
+    return $word << 23;
+}
+
+# Operand encoders.  Each key has the same name as a named capture group in
+# the operand patterns defined below (p0, r8, i20w24, ...); each value is a
+# closure that takes the captured text as its first argument and returns the
+# operand encoded at a fixed bit position:
+#   p*  -> getP (predicate),  r*  -> getR (register),
+#   c20/z20/c39 -> getC,      c34/c36 -> hex value shifted directly,
+#   f*/d* -> getF (float 'f' / double 'd', optional truncation),
+#   i*  -> getI (integer, with the field's bit mask).
+# The shift amount is not always the number in the key (e.g. p0 shifts by 2,
+# r20 by 23, i36w20 by 36).
+# NOTE(review): how the returned bits are combined with the opcode is decided
+# by the caller, which is not part of this section.
+my %operands =
+(
+    p0      => sub { getP($_[0], 2)  },
+    p3      => sub { getP($_[0], 5)  },
+    p12     => sub { getP($_[0], 14) },
+    p29     => sub { getP($_[0], 32) },
+    p39     => sub { getP($_[0], 42) },
+    p45     => sub { getP($_[0], 48) },
+    p48     => sub { getP($_[0], 51) },
+    p58     => sub { getP($_[0], 58) },
+    r0      => sub { getR($_[0], 2)  },
+    r8      => sub { getR($_[0], 10)  },
+    r20     => sub { getR($_[0], 23) },
+    r28     => sub { getR($_[0], 28) },
+    r39s20  => sub { getR($_[0], 42) },
+    r39     => sub { getR($_[0], 42) },
+    r39a    => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to wipe it out, register must be in sequence with r20
+    c20     => sub { getC($_[0])     },
+    z20     => sub { getC($_[0])     },
+    c39     => sub { getC($_[0])     },
+    c34     => sub { hex($_[0]) << 37 },
+    c36     => sub { hex($_[0]) << 39 },
+    f20w32  => sub { getF($_[0], 23, 'f')        },
+    f20     => sub { getF($_[0], 23, 'f', 12)    },
+    d20     => sub { getF($_[0], 23, 'd', 44)    },
+    i8w4    => sub { getI($_[0], 10,  0xf)        },
+    i20     => sub { getI($_[0], 23, 0x7ffff)    },
+    i20w6   => sub { getI($_[0], 23, 0x3f)       },
+    i20w7   => sub { getI($_[0], 23, 0x7f)       },
+    i20w8   => sub { getI($_[0], 23, 0xff)       },
+    i20w12  => sub { getI($_[0], 23, 0xfff)      },
+    i20w24  => sub { getI($_[0], 23, 0xffffff)   },
+    i20w32  => sub { getI($_[0], 23, 0xffffffff) },
+    i31w4   => sub { getI($_[0], 34, 0xf)        },
+    i34w13  => sub { getI($_[0], 37, 0x1fff)     },
+    i36w20  => sub { getI($_[0], 36, 0xfffff)    },
+    i39w8   => sub { getI($_[0], 42, 0x1f)       },
+    i28w8   => sub { getI($_[0], 28, 0xff)       },
+    i28w20  => sub { getI($_[0], 31, 0xfffff)    },
+    i48w8   => sub { getI($_[0], 48, 0xff)       },
+    i51w5   => sub { getI($_[0], 51, 0x1f)       },
+    i53w5   => sub { getI($_[0], 53, 0x1f)       },
+    i23w6  => sub { getI($_[0], 23, 0x3f)      },
+);
+
+# Operand patterns.  These are building blocks interpolated into the
+# instruction rules of %grammar.  Most named capture groups carry the name of
+# an entry in %operands; the remaining groups (..neg, ..abs, ..not, ..part,
+# reuseN, CC) record optional decorations written around the operand.
+
+# Basic tokens.
+# $hex   : hex literal with a 0x or 0X prefix.
+# $iAddr : index expression, decimal multiplier + x/X + <expression>.
+# $immed : any of hex literal, index expression or plain decimal number.
+my $hex     = qr"0[xX][0-9a-fA-F]+";
+my $iAddr   = qr"\d+[xX]<[^>]+>";
+my $immed   = qr"$hex|$iAddr|\d+"o;
+my $reg     = qr"[a-zA-Z_]\w*"; # must start with letter or underscore
+my $p       = qr"P[0-6T]";
+# Empty group that only records, by name, that a rule takes no predicate.
+my $noPred  = qr"(?<noPred>)";
+# Instruction predicate: "@", optional "!", P0-P6 and one trailing space.
+my $pred    = qr"\@(?<predNot>\!)?P(?<predNum>[0-6]) ";
+# Predicate operands; p12/p29/p39 may be negated with a leading "!".
+my $p0      = qr"(?<p0>$p)"o;
+my $p3      = qr"(?<p3>$p)"o;
+my $p12     = qr"(?<p12not>\!)?(?<p12>$p)"o;
+my $p29     = qr"(?<p29not>\!)?(?<p29>$p)"o;
+my $p39     = qr"(?<p39not>\!)?(?<p39>$p)"o;
+my $p45     = qr"(?<p45>$p)"o;
+my $p48     = qr"(?<p48>$p)"o;
+my $p58     = qr"(?<p58>$p)"o;
+# Register operands.  r8/r20 accept an optional "-" (neg), optional |..|
+# (abs), an optional part selector (.H0, .B2, ...) and an optional .reuse.
+my $r0      = qr"(?<r0>$reg)";
+my $r0cc    = qr"(?<r0>$reg)(?<CC>\.CC)?";
+my $r8      = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse1>\.reuse)?";
+my $r20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse2>\.reuse)?";
+my $r28     = qr"(?<r28>$reg)";
+# r39s20 captures the same register under both names, r39s20 and r20.
+my $r39s20  = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r39s20>(?<r20>$reg))\|?(?:\.(?<r39part>H0|H1))?(?<reuse2>\.reuse)?";
+my $r39     = qr"(?<r39neg>\-)?(?<r39>$reg)(?:\.(?<r39part>H0|H1))?(?<reuse3>\.reuse)?";
+# r39a likewise captures the register as both r39a and r39.
+my $r39a    = qr"(?<r39a>(?<r39>$reg))(?<reuse3>\.reuse)?";
+# c[<hex>][<hex>] operands: the first bracket is captured as c34, the second
+# as c20 (or c39 in $c20s39).  $c20x differs from $c20 only in the name of the
+# part-selector group (r20partx instead of r20part).
+my $c20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3))?"o;
+my $c20x    = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20partx>H0|H1|B0|B1|B2|B3))?"o;
+my $c20s39  = qr"(?<r39neg>\-)?c\[(?<c34>$hex)\]\s*\[(?<c39>$hex)\]"o;
+# Floating point immediates: optional sign, then "inf" or a decimal number
+# with optional fraction and exponent (any case).  f20w32 also accepts a hex
+# literal; f20/d20 record a leading "-" as neg and accept a .NEG suffix.
+my $f20w32  = qr"(?<f20w32>(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))";
+my $f20     = qr"(?<f20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;
+my $d20     = qr"(?<d20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;
+# Integer immediates.  Only the variants written with "\-?" (and i20, via its
+# neg group) accept a leading minus sign.
+my $i8w4    = qr"(?<i8w4>$immed)"o;
+my $i20     = qr"(?<i20>(?<neg>\-)?$immed)(?<r20neg>\.NEG)?"o;
+my $i20w6   = qr"(?<i20w6>$immed)"o;
+my $i20w7   = qr"(?<i20w7>$immed)"o;
+my $i20w8   = qr"(?<i20w8>$immed)"o;
+my $i20w12  = qr"(?<i20w12>$immed)"o;
+my $i20w24  = qr"(?<i20w24>\-?$immed)"o;
+my $i20w32  = qr"(?<i20w32>\-?$immed)"o;
+my $i39w8   = qr"(?<i39w8>\-?$immed)"o;
+my $i28w8   = qr"(?<i28w8>$immed)"o;
+my $i28w20  = qr"(?<i28w20>\-?$immed)"o;
+my $i31w4   = qr"(?<i31w4>$immed)"o;
+my $i34w13  = qr"(?<i34w13>$immed)"o;
+my $i36w20  = qr"(?<i36w20>$immed)"o;
+my $i48w8   = qr"(?<i48w8>$immed)"o;
+my $i51w5   = qr"(?<i51w5>$immed)"o;
+my $i53w5   = qr"(?<i53w5>$immed)"o;
+my $i23w6   = qr"(?<i23w6>$immed)"o;
+# Alternations for operand slots that accept more than one operand kind
+# (i = immediate, c = c[..][..], r = register, f/d = float/double immediate).
+my $ir20    = qr"$i20|$r20"o;
+my $cr20    = qr"$c20|$r20"o;
+my $icr20   = qr"$i20|$c20|$r20"o;
+my $fcr20   = qr"$f20|$c20|$r20"o;
+my $cr39    = qr"$c20s39|$r39"o;
+my $dr20    = qr"$d20|$r20"o;
+
+# Instruction modifier patterns.  Each matches the dotted suffixes that may
+# follow an opcode name and records them in named groups.  Patterns that end
+# in ")?" are optional; patterns without it (e.g. $icmp, $bool, $func, $x2x)
+# require the modifier to be present.
+my $u32   = qr"(?<U32>\.U32)?";
+my $REV2B = qr"(?<REV2B>\.REV2B)?";
+my $W     = qr"(?<W>\.W)?";
+my $pnot2d= qr"(?<PNOT2D>\.PNOT2D)?";
+my $ftz   = qr"(?<FTZ>\.FTZ)?";
+my $sat   = qr"(?<SAT>\.SAT)?";
+my $rnd   = qr"(?:\.(?<rnd>RN|RM|RP|RZ))?";
+my $mulf  = qr"(?:\.(?<mulf>D2|D4|D8|M8|M4|M2))?";
+# Condition name used after "CC." in the BRA/JMP/RET/EXIT rules; no leading dot.
+my $condition  = qr"(?:(?<CON>F|LT|EQ|LE|GT|NE|GE|NUM|NAN|LTU|EQU|LEU|GTU|NEU|GEU|OFF|LO|SFF|LS|HI|SFT|HS|OFT))?";
+# Same alternatives, stored under two different group names.
+my $lane2a= qr"(?:\.(?<lane2a>LNONE|L0|L1|L01|L2|L02|L12|L012|L3|L03|L13|L013|L23|L023|L123))?";
+my $lane0e= qr"(?:\.(?<lane0e>LNONE|L0|L1|L01|L2|L02|L12|L012|L3|L03|L13|L013|L23|L023|L123))?";
+
+
+my $round = qr"(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?";
+# $fcmp keeps the dot inside the capture and also matches the empty string;
+# $icmp requires a comparison and captures it without the dot.
+my $fcmp  = qr"(?<cmp>\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)";
+my $icmp  = qr"\.(?<cmp>LT|EQ|LE|GT|NE|GE)";
+my $bool  = qr"\.(?<bool>AND|OR|XOR|PASS_B)";
+my $bool2 = qr"\.(?<bool2>AND|OR|XOR)";
+# $func and $rro both store their match in the group named func.
+my $func  = qr"\.(?<func>COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)";
+my $rro   = qr"\.(?<func>SINCOS|EX2)";
+my $add3  = qr"(?:\.(?<type>X|RS|LS))?";
+# Either ".NZ"/".Z" followed by a space, a p48 predicate and a comma, or
+# nothing (recorded through the empty group noz).
+my $lopz  = qr"(?:\.(?<z>NZ|Z) $p48,|(?<noz>))"o;
+my $X     = qr"(?<X>\.X)?";
+my $PO     = qr"(?<PO>\.PO)?";
+my $bf     = qr"(?<BF>\.BF)?";
+my $S     = qr"(?<S>\.S)?";
+my $tld   = qr"(?<NODEP>NODEP\.)?(?:(?<reuse1>T)|(?<reuse2>P))";
+my $chnls = qr"(?<chnls>R|RGBA)";
+# Special register name: everything after "SR_" up to the next whitespace.
+my $sr    = qr"SR_(?<sr>\S+)";
+my $shf   = qr"(?<W>\.W)?(?:\.(?<type>U64|S64))?(?<HI>\.HI)?";
+# $imadc differs from $imad only in the group name used for the mode (modec).
+my $imad  = qr"(?:\.(?<type1>U32|S32))?(?:\.(?<type2>U32|S32))?(?:\.(?<mode>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?";
+my $imadc = qr"(?:\.(?<type1>U32|S32))?(?:\.(?<type2>U32|S32))?(?:\.(?<modec>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?";
+my $imul  = qr"(?:\.(?<type1>U32|S32))?(?:\.(?<type2>U32|S32))?";
+my $vmad8 = qr"\.(?<sign1>[SU])(?<size1>8|16)\.(?<sign2>[SU])(?<size2>8|16)(?<PO>\.PO)?(?<SHR_7>\.SHR_7)?(?<SHR_15>\.SHR_15)?(?<SAT>\.SAT)?";
+my $vmad16= qr"\.(?<sign1>[SU])(?<size1>16)\.(?<sign2>[SU])(?<size2>16)";
+my $hilo  = qr"(?:\.(?<mode>XHI|XLO))?";
+my $hi  = qr"(?:\.(?<mode>HI))?";
+my $vaddType = qr"(?:\.(?<UD>UD))?(?:\.(?<SD>SD))?(?:\.(?<sign1>[SU])(?<size1>8|16|32))?(?:\.(?<sign2>[SU])(?<size2>8|16|32))?";
+my $vaddMode = qr"(?:\.(?<mode>MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?";
+my $vmnmx = qr"(?:\.(?<MX>MX))?";
+# Conversion types: destination sign+width followed by source sign+width.
+my $x2x   = qr"\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)";
+my $prmt  = qr"(?:\.(?<mode>F4E|B4E|RC8|ECL|ECR|RC16))?";
+my $shfl  = qr"\.(?<mode>IDX|UP|DOWN|BFLY)";
+# Complete BAR operand list.  The conditional groups set the empty markers
+# nor20 / nop39 when the r20 register or the reduction predicate is absent.
+# NOTE(review): the BAR rules visible in %grammar spell out their operands
+# and do not use this pattern - check for other users before changing it.
+my $bar   = qr"\.(?<mode>SYNC|ARV|RED)(?:\.(?<red>POPC|AND|OR))? (?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?(<r20>)|(?<nor20>))(?(<red>), $p39|(?<nop39>))"o;
+# ".RESULT <r0>" with an optional ", <p45>"; nop45 marks a missing predicate.
+my $b2r   = qr"\.RESULT $r0(?:, $p45|(?<nop45>))"o;
+my $dbar  = qr"(?<SB>SB0|SB1|SB2|SB3|SB4|SB5)";
+# List form used by the second DEPBAR rule: one space followed by a
+# brace-enclosed list of the digits 5..0 in descending order, each optional
+# and optionally followed by a comma, e.g. " {5,4,0}".  Each digit present is
+# captured in its own group (db5 .. db0).
+# The braces are escaped because an unescaped literal "{" in a pattern is
+# deprecated and is warned about or rejected by newer perl releases; the set
+# of strings matched is unchanged.
+my $dbar2 = qr" \{(?<db5>5)?,?(?<db4>4)?,?(?<db3>3)?,?(?<db2>2)?,?(?<db1>1)?,?(?<db0>0)?\}";
+# Patterns for the barrier, memory, atomic and vote rules.
+my $mbar  = qr"\.(?<mode>CTA|GL|SYS)";
+# Bracketed address: an optional base register (r8, or the empty marker nor8
+# when absent) followed by an optional offset with an optional "+".  $addr
+# captures the offset as i20w24, $addr2 as i28w20.
+my $addr  = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i20w24)?\]"o;
+my $addr2 = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i28w20)?\]"o;
+# c[<hex>] (captured as c36) followed by a bracketed address.
+my $ldc   = qr"c\[(?<c36>$hex)\]\s*$addr"o;
+# Atomic operation: optional .E, a mandatory operation and a type suffix
+# (which may be empty).
+my $atom  = qr"(?<E>\.E)?(?:\.(?<mode>ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?<type>|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)";
+my $vote  = qr"\.(?<mode>ALL|ANY|EQ)"o;
+# Access width.  The empty alternative in the middle of the list lets the
+# suffix be omitted; .32/.64/.128 are still reached by backtracking.
+my $memType  = qr"(?<type>\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)";
+my $memTypeX  = qr"(?<type>\.b32|\.b64|\.b96|\.b128)";
+# Optional .E, optional .U and an optional cache selector; the ld/st variants
+# accept a reduced list of selectors.
+my $memCache = qr"(?<E>\.E)?(?<U>\.U)?(?:\.(?<cache>CG|CI|CS|CV|IL|WT|LU))?";
+my $ldmemCache = qr"(?<E>\.E)?(?<U>\.U)?(?:\.(?<cache>CG|LU|CV))?";
+my $stmemCache = qr"(?<E>\.E)?(?<U>\.U)?(?:\.(?<cache>CG|CS|WT))?";
+
+
+
+
+# Per-class instruction records, referenced through the "type" field of the
+# %grammar entries.  Every record has the same keys: class (a name), lat,
+# blat, rlat, rhold, tput, dual and reuse.
+# NOTE(review): presumably lat/blat/rlat are latencies, rhold a hold time,
+# tput a throughput figure and dual/reuse capability flags consumed by the
+# scheduling code - the consumers are not in this section, confirm there.
+# The records are shared hash references: every rule using e.g. $x32T points
+# to the same hash.
+my $s2rT  = {class => 's2r',   lat => 2,   blat => 25,  rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+my $smemT = {class => 'mem',   lat => 2,   blat => 30,  rlat => 2, rhold => 20, tput => 1,   dual => 1, reuse => 0};
+my $gmemT = {class => 'mem',   lat => 2,   blat => 200, rlat => 4, rhold => 20, tput => 1,   dual => 1, reuse => 0};
+my $x32T  = {class => 'x32',   lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 1};
+my $x64T  = {class => 'x64',   lat => 2,   blat => 128, rlat => 0, rhold => 0,  tput => 128, dual => 0, reuse => 1};
+my $shftT = {class => 'shift', lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1};
+my $cmpT  = {class => 'cmp',   lat => 13,  blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1};
+my $qtrT  = {class => 'qtr',   lat => 8,   blat => 0,   rlat => 4, rhold => 0,  tput => 1,   dual => 1, reuse => 0};
+my $rroT  = {class => 'rro',   lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+my $voteT = {class => 'vote',  lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+
+
+# Instruction grammar (exported).  Maps an opcode name to a list of
+# alternative forms.  Each form is a hash with:
+#   type : one of the per-class instruction records defined above,
+#   code : the 64-bit base value for this form,
+#   rule : a regex matching the complete instruction text, from the optional
+#          "@P<n> " predicate ($pred; $noPred where none is accepted) up to
+#          the terminating ";".  Operands and modifiers are captured through
+#          the named groups of the patterns defined above.
+# Several opcodes list one form per operand kind (register / c[..][..] /
+# immediate), each with its own code.
+# Entries marked #TODO have a zero code and a catch-all rule ("[^;]*;").
+# NOTE(review): the code values need a perl built with 64-bit integers.
+our %grammar =
+(
+    FADD     => [
+    { type => $x32T,  code => 0xe2c0000000000002, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $cr20;"o,               },
+    { type => $x32T,  code => 0xc2c0000000000001, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $f20;"o,               },
+    ],
+    FADD32I  => [ { type => $x32T,  code => 0x4000000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FCHK     => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o,                     } ], #Partial?
+    FCMP     => [
+    { type => $cmpT,  code => 0xdd00000000000002, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $cr20, $r39;"o,            },
+    { type => $cmpT,  code => 0xdd00000000000002, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $r39s20, $c20s39;"o,            },
+    { type => $cmpT,  code => 0xb500000000000001, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $f20, $r39;"o,            },
+    ],
+    FFMA     => [
+                  { type => $x32T,  code => 0xcc00000000000002, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $cr20, $r39;"o,         },
+                  { type => $x32T,  code => 0xcc00000000000002, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o,     },
+                  { type => $x32T,  code => 0x9400000000000001, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $f20, $r39;"o,     },
+                ],
+    FMNMX    => [
+    { type => $shftT, code => 0xe300000000000002, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $cr20, $p39;"o,                },
+    { type => $shftT, code => 0xc300000000000001, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $f20, $p39;"o,                },
+    ],
+    FMUL     => [
+    { type => $x32T,  code => 0xe340000000000002, rule => qr"^$pred?FMUL$ftz$rnd$sat$mulf $r0, $r8, $cr20;"o,               },
+    { type => $x32T,  code => 0xc340000000000001, rule => qr"^$pred?FMUL$ftz$rnd$sat$mulf $r0, $r8, $f20;"o,               },
+    ],
+    FMUL32I  => [ { type => $x32T,  code => 0x2000000000000002, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FSET     => [
+    { type => $shftT, code => 0xc000000000000002, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $cr20, $p39;"o,       },
+    { type => $shftT, code => 0x8000000000000001, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $f20, $p39;"o,       },
+    ],
+    FSETP    => [ { type => $cmpT,  code => 0xdd80000000000002, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ],
+    MUFU     => [ { type => $qtrT,  code => 0x8400000000000002, rule => qr"^$pred?MUFU$func $r0, $r8;"o,                              } ],
+    RRO      => [ { type => $rroT,  code => 0xe480000000000002, rule => qr"^$pred?RRO$rro $r0, $r20;"o,                               } ],
+    DADD     => [
+    { type => $x64T,  code => 0xe380000000000002, rule => qr"^$pred?DADD$rnd $r0, $r8, $cr20;"o,                        },
+    { type => $x64T,  code => 0xc380000000000001, rule => qr"^$pred?DADD$rnd $r0, $r8, $d20;"o,                        },
+    ],
+    DFMA     => [
+    { type => $x64T,  code => 0xdb80000000000002, rule => qr"^$pred?DFMA$rnd $r0, $r8, $cr20, $r39;"o,                  },
+    { type => $x64T,  code => 0xdb80000000000002, rule => qr"^$pred?DFMA$rnd $r0, $r8, $d20, $r39;"o,                  },
+    ],
+    DMNMX    => [
+    { type => $cmpT,  code => 0xe280000000000002, rule => qr"^$pred?DMNMX $r0, $r8, $cr20, $p39;"o,                     },
+    { type => $cmpT,  code => 0xe280000000000002, rule => qr"^$pred?DMNMX $r0, $r8, $d20, $p39;"o,                     },
+    ],
+    DMUL     => [
+    { type => $x64T,  code => 0xe400000000000002, rule => qr"^$pred?DMUL$rnd $r0, $r8, $cr20;"o,                        },
+    { type => $x64T,  code => 0xc400000000000001, rule => qr"^$pred?DMUL$rnd $r0, $r8, $d20;"o,                        },
+    ],
+    DSET     => [ { type => $cmpT,  code => 0xc800000000000002, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o,            } ],
+    DSETP    => [ { type => $cmpT,  code => 0xdc00000000000002, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o,      } ],
+    FSWZADD  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o,                                    } ], #TODO
+
+    HADD2     => [ { type => $x32T,  code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o,               } ],
+    HMUL2     => [ { type => $x32T,  code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o,               } ],
+    HFMA2     => [ { type => $x32T,  code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o,         } ],
+    HSETP2    => [ { type => $cmpT,  code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial
+
+    BFE       => [
+    { type => $shftT,  code => 0xe008000000000002, rule => qr"^$pred?BFE$u32$REV2B $r0, $r8, $cr20;"o,                          },
+    { type => $shftT,  code => 0xc008000000000001, rule => qr"^$pred?BFE$u32$REV2B $r0, $r8, $ir20;"o,                          },
+    ],
+    BFI       => [
+    { type => $shftT,  code => 0xdf80000000000002, rule => qr"^$pred?BFI$S $r0, $r8, $r20, $cr39;"o,                        },
+    { type => $shftT,  code => 0xb780000000000001, rule => qr"^$pred?BFI$S $r0, $r8, $i20, $cr39;"o,                        },
+    ],
+    FLO       => [ { type => $s2rT,   code => 0xe180000000000002, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o,                              } ],
+    IADD      => [
+    { type => $x32T,   code => 0xe080000000000002, rule => qr"^$pred?IADD$S$PO$sat$X $r0cc, $r8, $cr20;"o,                         },
+    { type => $x32T,   code => 0xc080000000000001, rule => qr"^$pred?IADD$S$PO$sat$X $r0cc, $r8, $i20;"o,                         },
+    ],
+
+    ISUB      => [
+    { type => $x32T,   code => 0xe088000000000002, rule => qr"^$pred?ISUB$sat$X $r0cc, $r8, $cr20;"o,                         },
+    { type => $x32T,   code => 0xc088000000000001, rule => qr"^$pred?ISUB$sat$X $r0cc, $r8, $i20;"o,                         },
+    { type => $x32T,   code => 0xc090000000000001, rule => qr"^$pred?ISUB$sat$X $r0cc, $i20, $r8;"o,                         },
+    ],
+
+
+
+    IADD32I   => [ { type => $x32T,   code => 0x4000000000000001, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o,                         } ],
+    ICMP      => [
+    { type => $cmpT,   code => 0xda08000000000002, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $cr20, $r39;"o,              },
+    { type => $cmpT,   code => 0xda08000000000002, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $r39s20, $c20s39;"o,              },
+    { type => $cmpT,   code => 0xb208000000000001, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $i20, $r39;"o,              },
+    ],
+    IMNMX     => [
+    { type => $shftT,  code => 0xe108000000000002, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $cr20, $p39;"o,                  },
+    { type => $shftT,  code => 0xc108000000000001, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $i20, $p39;"o,                  },
+    ],
+    ISET      => [
+    { type => $shftT,  code => 0xda88000000000002, rule => qr"^$pred?ISET$bf$icmp$u32$X$bool$S $r0, $r8, $cr20, $p39;"o,       },
+    { type => $shftT,  code => 0xb288000000000001, rule => qr"^$pred?ISET$bf$icmp$u32$X$bool$S $r0, $r8, $i20, $p39;"o,       },
+    ],
+    ISETP     => [
+    { type => $cmpT,   code => 0xdb08000000000002, rule => qr"^$pred?ISETP$icmp$u32$X$bool$S $p3, $p0, $r8, $cr20, $p39;"o, },
+    { type => $cmpT,   code => 0xb308000000000001, rule => qr"^$pred?ISETP$icmp$u32$X$bool$S $p3, $p0, $r8, $i20, $p39;"o, },
+   ],
+    ISCADD    => [
+    { type => $shftT,  code => 0xe0c0000000000002, rule => qr"^$pred?ISCADD$X $r0cc, $r8, $cr20, $i39w8;"o,                   },
+    { type => $shftT,  code => 0xc0c0000000000001, rule => qr"^$pred?ISCADD$X $r0cc, $r8, $i20, $i39w8;"o,                   }
+    ],
+    ISCADD32I => [ { type => $shftT,  code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o,               } ],
+
+    # The LOP rules use the group name INV twice (prefix "~" and suffix
+    # ".INV"); either spelling sets it.
+    LOP       => [
+    { type => $x32T,   code => 0xe200000000000002, rule => qr"^$pred?LOP$bool$S $r0, (?<INV1>~)?$r8, (?<INV>~)?$cr20(?<INV>\.INV)?;"o, },
+    { type => $x32T,   code => 0xc200000000000001, rule => qr"^$pred?LOP$bool$S $r0, (?<INV1>~)?$r8, (?<INV>~)?$i20(?<INV>\.INV)?;"o, },
+    ],
+    LOP32I    => [ { type => $x32T,   code => 0x2000000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o,                     } ],
+    LOP3      => [
+                   { type => $x32T,   code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o,            },
+                   { type => $x32T,   code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o,            },
+                 ],
+    POPC      => [
+    { type => $s2rT,   code => 0xe040000000000002, rule => qr"^$pred?POPC $r0, $r8, $cr20;"o,                                    },
+    { type => $s2rT,   code => 0xc040000000000001, rule => qr"^$pred?POPC $r0, $r8, $i20;"o,                                    },
+    ],
+    SHF       => [
+                   { type => $shftT,  code => 0xdfc0000000000002, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $r20, $r39;"o,                  },
+                   { type => $shftT,  code => 0xb7c0000000000001, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $i20, $r39;"o,                  },
+                   { type => $shftT,  code => 0xe7c0000000000002, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $r20, $r39;"o,                  },
+                   { type => $shftT,  code => 0xc7c0000000000001, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $i20, $r39;"o,                  },
+                 ],
+    SHL       => [
+    { type => $shftT,  code => 0xe240000000000002, rule => qr"^$pred?SHL(?<W>\.W)? $r0, $r8, $cr20;"o,                    },
+    { type => $shftT,  code => 0xc240000000000001, rule => qr"^$pred?SHL(?<W>\.W)? $r0, $r8, $i23w6;"o,                    },
+    ],
+    SHR       => [
+    { type => $shftT,  code => 0xe148000000000002, rule => qr"^$pred?SHR$u32$W $r0, $r8, $cr20;"o,                          },
+    { type => $shftT,  code => 0xc148000000000001, rule => qr"^$pred?SHR$u32$W $r0, $r8, $i23w6;"o,                          },
+   ],
+IMAD      => [
+                   { type => $x32T,   code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $r20, $r39;"o,                 },
+                   { type => $x32T,   code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $r39s20, $c20s39;"o,            },
+                   { type => $x32T,   code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $c20x, $r39;"o,                  },
+                   { type => $x32T,   code => 0xa108000000000001, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $i20, $r39;"o,                  },
+                 ],
+    IMADSP    => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO
+    IMUL      => [
+    { type => $x32T,   code => 0xe1c0180000000002, rule => qr"^$pred?IMUL$imul$hi $r0, $r8, $cr20;"o,   },
+    { type => $x32T,   code => 0xc1c0180000000001, rule => qr"^$pred?IMUL$imul$hi $r0, $r8, $i20;"o,   },
+    ],
+    IMUL32I      => [
+    { type => $x32T,   code => 0x2e00000000000002, rule => qr"^$pred?IMUL32I$imul$hi $r0, $r8, $i20w32;"o,   },
+    ],
+
+    F2F => [ { type => $qtrT,  code => 0xe540000000000002, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ],
+    F2I => [ { type => $qtrT,  code => 0xe580000000000002, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o,         } ],
+    I2F => [ { type => $qtrT,  code => 0xe5c0000000000002, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o,               } ],
+    I2I => [ { type => $qtrT,  code => 0xe600000000000002, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o,               } ],
+    F2ITRUNC => [ { type => $qtrT,  code => 0xe5800c00051ca846, rule => qr"^$pred?F2ITRUNC[^;]*;"o,               } ],
+
+    MOV    => [ { type => $x32T,  code => 0xe4c03c0000000002, rule => qr"^$pred?MOV$lane2a$S $r0, $cr20;"o,                   } ],
+    MOV32I => [ { type => $x32T,  code => 0x740000000003c002, rule => qr"^$pred?MOV32I$lane0e$S $r0, (?:$i20w32|$f20w32);"o,   } ],
+    PRMT   => [
+    { type => $x32T,  code => 0xde00000000000002, rule => qr"^$pred?PRMT$prmt $r0, $r8, $cr20, $cr39;"o, },
+    { type => $x32T,  code => 0xb600000000000001, rule => qr"^$pred?PRMT$prmt $r0, $r8, $i20, $r39;"o, },
+    ],
+    SEL    => [
+    { type => $x32T,  code => 0xe500000000000002, rule => qr"^$pred?SEL $r0, $r8, $cr20, $p39;"o,        },
+    { type => $x32T,  code => 0xc500000000000001, rule => qr"^$pred?SEL $r0, $r8, $i20, $p39;"o,        },
+    ],
+    SHFL   => [ { type => $smemT, code => 0x7880000000000002, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ],
+
+    PSET   => [ { type => $cmpT,  code => 0x8440000000000002, rule => qr"^$pred?PSET$bf$bool2$bool $r0, $p12, $p29, $p39;"o,       } ],
+    PSETP  => [ { type => $cmpT,  code => 0x8480000000000002, rule => qr"^$pred?PSETP$bool2$bool$S $p3, $p0, $p12, $p29, $p39;"o, } ],
+    CSET   => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o,  } ], #TODO
+    CSETP  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO
+    P2R    => [ { type => $x32T,  code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o,   } ],
+    R2P    => [ { type => $cmpT,  code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o,   } ],
+
+    TLD    => [ { type => $gmemT, code => 0x700a00067f9ffc02, rule => qr"^$pred?TLD[^;]*;"o, } ], #Partial
+    TLDzxx    => [ { type => $gmemT, code => 0x700a00057f9ffc02, rule => qr"^$pred?TLDzxx[^;]*;"o, } ], #Partial
+    TEXDEPBAR    => [ { type => $gmemT, code => 0x77000000001c0002, rule => qr"^$pred?TEXDEPBAR $i20w6;"o, } ], #Partial
+    TEX    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o,   } ], #TODO
+    TLD4   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o,  } ], #TODO
+    TXQ    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o,   } ], #TODO
+
+    LD     => [ { type => $gmemT, code => 0xc000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr;"o,      } ],
+    LDY     => [ { type => $gmemT, code => 0x7f80000000000002, rule => qr"^$pred?LDY $r0, $i20;"o,      } ],
+    LDX     => [ { type => $gmemT, code => 0x7ec0000000000002, rule => qr"^$pred?LDX$memTypeX $r0, $addr;"o,      } ],
+    ST     => [ { type => $gmemT, code => 0xe000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0;"o,      } ],
+    LDG    => [
+    { type => $gmemT, code => 0x600010047f800001, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o,           },
+    ],
+    LDS    => [ { type => $smemT, code => 0x7a40000000000002, rule => qr"^$pred?LDS$memCache$memType$S $r0, $addr;"o,           } ],
+    STS    => [ { type => $smemT, code => 0x7ac0000000000002, rule => qr"^$pred?STS$memCache$memType$S $addr, $r0;"o,           } ],
+    LDL    => [ { type => $gmemT, code => 0x7a00000000000002, rule => qr"^$pred?LDL$ldmemCache$memType$S $r0, $addr;"o,           } ],
+    STL    => [ { type => $gmemT, code => 0x7a80000000000002, rule => qr"^$pred?STL$stmemCache$memType$S $addr, $r0;"o,           } ],
+    LDC    => [ { type => $gmemT, code => 0x7c800000000ffc02, rule => qr"^$pred?LDC$memCache$memType$S $r0, $ldc;"o,            } ],
+    ATOM   => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o,   } ],
+    RED    => [ { type => $gmemT, code => 0x68000000000003fe, rule => qr"^$pred?RED$atom $addr2, $r20;"o,                      } ],
+    CCTL   => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o,  } ], #TODO
+    CCTLL  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO
+
+    SULD   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o,   } ], #TODO
+
+    # BRA, JMP, RET and EXIT each have a plain form and a "CC.<condition>"
+    # form with a different code.
+    BRA    => [
+                { type => $x32T, code => 0x120000000000003c, rule => qr"^$pred?BRA(?<U>\.U)? $i20w24;"o,         },
+                { type => $x32T, code => 0x1200000000000000, rule => qr"^$pred?BRA(?<U>\.U)? CC\.$condition, $i20w24;"o,         },
+              ],
+
+    BRX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o,                      } ], #TODO
+    JMP    => [
+    { type => $x32T, code => 0x108000000000003c, rule => qr"^$pred?JMP(?<U>\.U)? $i20w32;"o,         },
+    { type => $x32T, code => 0x1080000000000000, rule => qr"^$pred?JMP(?<U>\.U)? CC\.$condition, $i20w32;"o,         },
+    ],
+    JMX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o,                      } ], #TODO
+    SSY    => [ { type => $x32T, code => 0x1480000000000000, rule => qr"^$noPred?SSY $i20w24;"o,                 } ],
+
+    CAL    => [ { type => $x32T, code => 0x1300000000000100, rule => qr"^$noPred?CAL $i20w24;"o,                 } ],
+    JCAL   => [ { type => $x32T, code => 0x1100000000000100, rule => qr"^$noPred?JCAL $i20w32;"o,                } ],
+    PRET   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o,                     } ], #TODO
+    RET    => [
+    { type => $x32T, code => 0x190000000000003c, rule => qr"^$pred?RET;"o,                           },
+    { type => $x32T, code => 0x1900000000000000, rule => qr"^$pred?RET CC\.$condition;"o,                           },
+    ],
+    BRK    => [ { type => $x32T, code => 0x1a0000000000003c, rule => qr"^$pred?BRK;"o,                           } ],
+    PBK    => [ { type => $x32T, code => 0x1500000000000000, rule => qr"^$noPred?PBK $i20w24;"o,                 } ],
+    CONT   => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o,                          } ],
+    PCNT   => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o,                } ],
+    EXIT   => [
+    { type => $x32T, code => 0x18000000001c003c, rule => qr"^$pred?EXIT;"o,                          },
+    { type => $x32T, code => 0x18000000001c0000, rule => qr"^$pred?EXIT CC\.$condition;"o,                          },
+    ],
+    BPT    => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o,           } ],
+
+    NOP    => [ { type => $x32T,  code => 0x8580000000003c02, rule => qr"^$pred?NOP$S;"o,                                     } ],
+    S2R    => [ { type => $s2rT,  code => 0x8640000000000002, rule => qr"^$pred?S2R$S $r0, $sr;"o,                            } ],
+    B2R    => [ { type => $x32T,  code => 0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o,                                 } ],
+    # One form per combination of immediate/register operands.
+    # NOTE(review): the "." in BAR.SYNC / BAR.ARV is not escaped in these
+    # rules, so it matches any character; other rules escape it (FCHK\.DIVIDE).
+    BAR    => [
+    { type => $gmemT, code => 0x8540dc0000000002, rule => qr"^$pred?BAR.SYNC $i8w4;"o,                                 },
+    { type => $gmemT, code => 0x8540dc0000000002, rule => qr"^$pred?BAR.SYNC $i8w4, $i20w12;"o,                                 },
+    { type => $gmemT, code => 0x85409c0000000002, rule => qr"^$pred?BAR.SYNC $i8w4, $r20;"o,                                 },
+    { type => $gmemT, code => 0x85405c0000000002, rule => qr"^$pred?BAR.SYNC $r8;"o,                                 },
+    { type => $gmemT, code => 0x85405c0000000002, rule => qr"^$pred?BAR.SYNC $r8, $i20w12;"o,                                 },
+    { type => $gmemT, code => 0x85401c0000000002, rule => qr"^$pred?BAR.SYNC $r8, $r20;"o,                                 },
+    { type => $gmemT, code => 0x8540dc0800000002, rule => qr"^$pred?BAR.ARV $i8w4, $i20w12;"o,                                 },
+    { type => $gmemT, code => 0x85409c0800000002, rule => qr"^$pred?BAR.ARV $i8w4, $r20;"o,                                 },
+    { type => $gmemT, code => 0x85405c0800000002, rule => qr"^$pred?BAR.ARV $r8, $i20w12;"o,                                 },
+    { type => $gmemT, code => 0x85401c0800000002, rule => qr"^$pred?BAR.ARV $r8, $r20;"o,                                 },
+    ],
+    DEPBAR => [
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, },
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o,              },
+              ],
+    MEMBAR => [ { type => $x32T,  code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o,                             } ],
+
+    VOTE   => [
+    { type => $voteT, code => 0x86c0000000000002, rule => qr"^$pred?VOTE$vote (?:$r0, |(?<nor0>))$p45, $p39;"o, } ],
+
+
+    VADD   => [   { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMAD   => [
+                  { type => $x32T,  code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $i20, $r39;"o, },
+                  { type => $x32T,  code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, },
+                  { type => $shftT, code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, },
+              ],
+    VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMNMX    => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+
+    VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+);
+# Flag table, parsed into %flags further down: a header line "OP, OP, ...[: group]" is followed by "0xBITS name" lines.
+my @flags = grep /\S/, split "\n", q{;
+
+BFE, BFI, FLO, IADD, ISUB, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD
+0x0800000000000000 neg
+
+FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP
+0x0800000000000000 neg
+
+PSET, PSETP
+0x0000000000020000 p12not
+0x0000000800000000 p29not
+
+FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE
+0x0000200000000000 p39not
+
+IADD32I
+0x0010000000000000 CC
+
+IMAD, PSET, FSET, DSET, ISET, IADD, ISUB, IMUL, ISCADD
+0x0004000000000000 CC
+
+IMAD: mode
+0x0200000000000000 HI
+
+IMAD
+0x0010000000000000 X
+
+IMUL: mode
+0x0000040000000000 HI
+
+IMUL32I: mode
+0x0100000000000000 HI
+
+FFMA, FADD, FCMP, FMUL, FMNMX,  FSWZ, FSET, FSETP,  FCHK, RRO,  MUFU, DFMA, DADD, DMUL, DMNMX,  DSET, DSETP,  IMAD, IMADSP, IMUL, IADD, ISCADD, ISAD, IMNMX,  BFE,  BFI,  SHR,  SHL,  SHF,  LOP,  FLO,  ISET, ISETP,  ICMP, POPC, F2F,  F2I,  I2F,  I2I,  MOV, MOV32I, SEL,  PRMT, SHFL, P2R,  R2P,  CSET, CSETP,  PSET, PSETP,  TEX,  TLD,  TLD4, TXQ,  LDC,  LD, LDG,  LDL,  LDS,  LDSLK,  ST, STL,  STS,  STSCUL, ATOM, RED,  CCTL, CCTLL,  MEMBAR, SUCLAMP,  SUBFM,  SUEAU,  SULDGA, SUSTGA, BRA,  BRX,  RET,  BRK,  CONT, NOP,  S2R,  B2R,  BAR,  VOTE, MOV
+0x0000000000400000 S
+
+SHF
+0x0020000000000000 W
+0x0001000000000000 HI
+
+SHF: type
+0x0000020000000000 U64
+0x0000010000000000 S64
+
+IMAD, ICMP, ISET, ISETP, ISAD, SHR, IMNMX, FLO, BFE
+0x0008000000000000 U32
+
+SHR, SHL
+0x0000040000000000 W
+
+SHFL
+0x0000000080000000 i20w8
+0x0000000100000000 i34w13
+
+SHFL: mode
+0x0000000000000000 IDX
+0x0000000200000000 UP
+0x0000000300000000 DOWN
+0x0000000600000000 BFLY
+
+IMNMX: mode
+0x0000080000000000 XLO
+0x0000180000000000 XHI
+
+ISETP, ISET, ICMP: cmp
+0x0010000000000000 LT
+0x0020000000000000 EQ
+0x0030000000000000 LE
+0x0040000000000000 GT
+0x0050000000000000 NE
+0x0060000000000000 GE
+
+ISETP, ISET, PSETP, PSET, FSET, FSETP, DSET, DSETP: bool
+0x0000000000000000 AND
+0x0001000000000000 OR
+0x0002000000000000 XOR
+
+PSETP, PSET: bool2
+0x0000000000000000 AND
+0x0000000008000000 OR
+0x0000000010000000 XOR
+
+ISETP, ISET, IADD, ISUB
+0x0000400000000000 X
+
+ISCADD
+0x0020000000000000 X
+
+ISET, PSET
+0x0000800000000000 BF
+
+LOP: bool
+0x0000000000000000 AND
+0x0000100000000000 OR
+0x0000200000000000 XOR
+0x0000300000000000 PASS_B
+
+LOP, POPC, FLO
+0x0000080000000000 INV
+
+LOP, POPC, IADD, ISUB
+0x0000040000000000 INV1
+
+LOP: z
+0x0000200000000000 Z
+0x0000300000000000 NZ
+
+LOP
+0x0000000000000000 noz
+
+LOP32I: bool
+0x0000000000000000 AND
+0x0020000000000000 OR
+0x0040000000000000 XOR
+
+PRMT: mode
+0x0008000000000000 F4E
+0x0010000000000000 B4E
+0x0018000000000000 RC8
+0x0020000000000000 ECL
+0x0028000000000000 ECR
+0x0030000000000000 RC16
+
+IMAD: type1
+0x0008000000000000 U32
+0x0008000000000000 S32
+
+IMAD: type2
+0x0100000000000000 U32
+0x0100000000000000 S32
+
+IMUL: type1
+0x0000080000000000 U32
+0x0000000000000000 S32
+
+IMUL: type2
+0x0000100000000000 U32
+0x0000000000000000 S32
+
+IMUL32I: type1
+0x0200000000000000 U32
+0x0000000000000000 S32
+
+IMUL32I: type2
+0x0400000000000000 U32
+0x0000000000000000 S32
+
+XMAD: type1
+0x0000000000000000 U16
+0x0001000000000000 S16
+
+XMAD: type2
+0x0000000000000000 U16
+0x0002000000000000 S16
+
+XMAD: mode
+0x0000002000000000 MRG
+0x0000001000000000 PSL
+0x0008000000000000 CHI
+0x0004000000000000 CLO
+0x000c000000000000 CSFU
+
+XMAD: modec
+0x0004000000000000 CLO
+0x0008000000000000 CHI
+0x000c000000000000 CSFU
+0x0040000000000000 X
+0x0080000000000000 PSL
+0x0100000000000000 MRG
+
+XMAD
+0x0010000000000000 CBCC
+
+XMAD: r8part
+0x0000000000000000 H0
+0x0020000000000000 H1
+
+XMAD: r20part
+0x0000000000000000 H0
+0x0000000800000000 H1
+
+XMAD: r20partx
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+XMAD: r39part
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part
+0x0000000000000000 B0
+0x0000001000000000 B1
+0x0000002000000000 B2
+0x0000003000000000 B3
+0x0000001000000000 H1
+0x0000000000000000 H0
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part
+0x0000000000000000 B0
+0x0000000010000000 B1
+0x0000000020000000 B2
+0x0000000030000000 B3
+0x0000000010000000 H1
+0x0000000000000000 H0
+
+VMAD
+0x0040000000000000 r8neg
+0x0020000000000000 r39neg
+0x0008000000000000 SHR_7
+0x0010000000000000 SHR_15
+0x0060000000000000 PO
+0x0080000000000000 SAT
+
+VMNMX
+0x0100000000000000 MX
+
+VADD, VABSDIFF, VMNMX
+0x0080000000000000 SAT
+0x0040000000000000 UD
+0x0040000000000000 SD
+
+VSET: cmp
+0x0040000000000000 LT
+0x0080000000000000 EQ
+0x00c0000000000000 LE
+0x0100000000000000 GT
+0x0140000000000000 NE
+0x0180000000000000 GE
+
+VADD, VSET: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VABSDIFF: mode
+0x0003000000000000 ACC
+0x000b000000000000 MIN
+0x0013000000000000 MAX
+0x0023000000000000 MRG_16H
+0x002b000000000000 MRG_16L
+0x0033000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x003b000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMNMX: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1
+0x0000000000000000 U
+0x0004000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign2
+0x0000000000000000 U
+0x0008000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size1
+0x0000000000000000 8
+0x0000004000000000 16
+0x0000006000000000 32
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size2
+0x0000000000000000 8
+0x0000000000000000 16
+0x0000000000000000 32
+
+IADD3: type
+0x0001000000000000 X
+0x0000002000000000 RS
+0x0000004000000000 LS
+
+IADD3: r8part
+0x0000000000000000 H0
+0x0000001000000000 H1
+
+IADD3: r20part
+0x0000000080000000 H0
+
+IADD3: r39part
+0x0000000200000000 H0
+
+IADD3
+0x0008000000000000 r8neg
+0x0004000000000000 r20neg
+0x0002000000000000 r39neg
+
+IADD, ISUB, ISCADD
+0x0010000000000000 r8neg
+0x0008000000000000 r20neg
+0x0018000000000000 PO
+
+IADD32I
+0x0100000000000000 X
+0x0800000000000000 r8neg
+
+IMAD
+0x0080000000000000 r8neg
+
+IMAD
+0x0040000000000000 r39neg
+
+DEPBAR: SB
+0x0000000000000000 SB0
+0x0000000004000000 SB1
+0x0000000008000000 SB2
+0x000000000c000000 SB3
+0x0000000010000000 SB4
+0x0000000014000000 SB5
+
+DEPBAR: cmp
+0x0000000020000000 LE
+
+DEPBAR
+0x0000000000000001 db0
+0x0000000000000002 db1
+0x0000000000000004 db2
+0x0000000000000008 db3
+0x0000000000000010 db4
+0x0000000000000020 db5
+
+F2F, F2I, I2F, I2I: destWidth
+0x0000000000000000 8
+0x0000000000000400 16
+0x0000000000000800 32
+0x0000000000000c00 64
+
+F2F, F2I, I2F, I2I: srcWidth
+0x0000000000000000 8
+0x0000000000001000 16
+0x0000000000002000 32
+0x0000000000003000 64
+
+F2F, F2I, I2F, I2I: destSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000008000 S
+
+F2F, F2I, I2F, I2I: srcSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000008000 S
+
+F2I, I2F, I2I: r20part
+0x0000000000000000 H0
+0x0000040000000000 H1
+0x0000000000000000 B0
+0x0000020000000000 B1
+0x0000040000000000 B2
+0x0000060000000000 B3
+
+F2F: r20part
+0x0000000000000000 H0
+0x0000020000000000 H1
+
+F2F: round
+0x0000040000000000 ROUND
+0x0000048000000000 FLOOR
+0x0000050000000000 CEIL
+0x0000058000000000 TRUNC
+
+F2I: round
+0x0000000000000000 ROUND
+0x0000040000000000 FLOOR
+0x0000080000000000 CEIL
+0x00000c0000000000 TRUNC
+
+HADD2, HMUL2: r8part
+0x0001000000000000 H0_H0
+0x0000000000000000 H1_H1
+
+HFMA2: r20part
+0x0000000020000000 H0_H0
+0x0000000030000000 H1_H1
+
+FADD, DADD, FMUL, DMUL, F2F, I2F: rnd
+0x0000000000000000 RN
+0x0000040000000000 RM
+0x0000080000000000 RP
+0x00000c0000000000 RZ
+
+FMUL: mulf
+0x0000100000000000 D2
+0x0000200000000000 D4
+0x0000300000000000 D8
+0x0000400000000000 M8
+0x0000500000000000 M4
+0x0000600000000000 M2
+
+BRA, JMP, RET, EXIT: CON
+0x0000000000000000 F
+0x0000000000000004 LT
+0x0000000000000008 EQ
+0x000000000000000c LE
+0x0000000000000010 GT
+0x0000000000000014 NE
+0x0000000000000018 GE
+0x000000000000001c NUM
+0x0000000000000020 NAN
+0x0000000000000024 LTU
+0x0000000000000028 EQU
+0x000000000000002c LEU
+0x0000000000000030 GTU
+0x0000000000000034 NEU
+0x0000000000000038 GEU
+0x0000000000000040 OFF
+0x0000000000000044 LO
+0x0000000000000048 SFF
+0x000000000000004c LS
+0x0000000000000050 HI
+0x0000000000000054 SFT
+0x0000000000000058 HS
+0x000000000000005c OFT
+
+MOV: lane2a
+0x0000380000000000 LNONE
+0x0000340000000000 L0
+0x0000300000000000 L1
+0x00002c0000000000 L01
+0x0000280000000000 L2
+0x0000240000000000 L02
+0x0000200000000000 L12
+0x00001c0000000000 L3
+0x0000180000000000 L03
+0x0000140000000000 L13
+0x0000100000000000 L013
+0x00000c0000000000 L23
+0x0000080000000000 L023
+0x0000040000000000 L123
+
+MOV32I: lane0e
+0x0000000000038000 LNONE
+0x0000000000034000 L0
+0x0000000000030000 L1
+0x000000000002c000 L01
+0x0000000000028000 L2
+0x0000000000024000 L02
+0x0000000000020000 L12
+0x000000000001c000 L3
+0x0000000000018000 L03
+0x0000000000014000 L13
+0x0000000000010000 L013
+0x000000000000c000 L23
+0x0000000000008000 L023
+0x0000000000004000 L123
+
+DFMA: rnd
+0x0000000000000000 RN
+0x0004000000000000 RM
+0x0008000000000000 RP
+0x000c000000000000 RZ
+
+FFMA: rnd
+0x0000000000000000 RN
+0x0040000000000000 RM
+0x0080000000000000 RP
+0x00c0000000000000 RZ
+
+FFMA, FMUL32I
+0x0100000000000000 FTZ
+
+F2F, F2I, FADD, FMUL, FMNMX
+0x0000800000000000 FTZ
+
+FADD32I
+0x0080000000000000 FTZ
+
+FMUL32I
+0x0020000000000000 FTZ
+
+FSET, FSETP, FCMP, DSET, DSETP
+0x0400000000000000 FTZ
+
+HADD2, HMUL2
+0x0000008000000000 FTZ
+
+HFMA2
+0x0000002000000000 FTZ
+
+FADD, FFMA, FMUL, F2F, I2I, MUFU, IMAD, IADD, ISUB
+0x0020000000000000 SAT
+
+FADD, DADD, FMNMX, DMNMX, MUFU, FFMA, DFMA, FMUL, DADD, DMUL
+0x0008000000000000 r8neg
+
+FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I
+0x0001000000000000 r20neg
+
+FMUL, DMUL, FFMA, DFMA
+0x0001000000000000 r20neg
+
+FFMA, DFMA
+0x0010000000000000 r39neg
+
+FADD, DADD, FMNMX, DMNMX, MUFU
+0x0002000000000000 r8abs
+
+FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I
+0x0010000000000000 r20abs
+
+FSETP, DSETP, FSET, DSET
+0x0000400000000000 r8neg
+0x0100000000000000 r20neg
+0x0200000000000000 r8abs
+0x0000800000000000 r20abs
+
+RRO: func
+0x0000000000000000 SINCOS
+0x0000040000000000 EX2
+
+MUFU: func
+0x0000000000000000 COS
+0x0000000000800000 SIN
+0x0000000001000000 EX2
+0x0000000001800000 LG2
+0x0000000002000000 RCP
+0x0000000002800000 RSQ
+0x0000000003000000 RCP64H
+0x0000000003800000 RSQ64H
+
+FSETP, DSETP, FSET, DSET, FCMP: cmp
+0x0008000000000000 .LT
+0x0010000000000000 .EQ
+0x0018000000000000 .LE
+0x0020000000000000 .GT
+0x0020000000000000
+0x0028000000000000 .NE
+0x0030000000000000 .GE
+0x0038000000000000 .NUM
+0x0040000000000000 .NAN
+0x0048000000000000 .LTU
+0x0050000000000000 .EQU
+0x0058000000000000 .LEU
+0x0060000000000000 .GTU
+0x0068000000000000 .NEU
+0x0070000000000000 .GEU
+
+FSETP, DSETP, FSET, DSET: bool
+0x0000000000000000 AND
+0x0001000000000000 OR
+0x0002000000000000 XOR
+
+HSETP2: cmp
+0x0000002800000000 .NE
+
+HSETP2: bool
+0x0000000000000000 AND
+
+S2R: sr
+0x0000000000000000  LANEID
+0x0000000001000000  VIRTCFG
+0x0000000001800000  VIRTID
+0x0000000002000000  PM0
+0x0000000002800000  PM1
+0x0000000003000000  PM2
+0x0000000003800000  PM3
+0x0000000004000000  PM4
+0x0000000004800000  PM5
+0x0000000005000000  PM6
+0x0000000005800000  PM7
+0x0000000008000000  PRIM_TYPE
+0x0000000008800000  INVOCATION_ID
+0x0000000009000000  Y_DIRECTION
+0x0000000010000000  TID
+0x0000000010800000  TID.X
+0x0000000011000000  TID.Y
+0x0000000011800000  TID.Z
+0x0000000012000000  CTA_PARAM
+0x0000000012800000  CTAID.X
+0x0000000013000000  CTAID.Y
+0x0000000013800000  CTAID.Z
+0x0000000014000000  NTID
+0x0000000014800000  CirQueueIncrMinusOne
+0x0000000015000000  NLATC
+0x0000000015800000  43
+0x0000000016000000  44
+0x0000000016800000  45
+0x0000000017000000  46
+0x0000000017800000  47
+0x0000000018000000  SWINLO
+0x0000000018800000  SWINSZ
+0x0000000019000000  SMEMSZ
+0x0000000019800000  SMEMBANKS
+0x000000001a000000  LWINLO
+0x000000001a800000  LWINSZ
+0x000000001b000000  LMEMLOSZ
+0x000000001b800000  LMEMHIOFF
+0x000000001c000000  EQMASK
+0x000000001c800000  LTMASK
+0x000000001d000000  LEMASK
+0x000000001d800000  GTMASK
+0x000000001e000000  GEMASK
+0x0000000020000000  GLOBALERRORSTATUS
+0x0000000021000000  WARPERRORSTATUS
+0x0000000028000000  CLOCKLO
+0x0000000029000000  GLOBALTIMERLO
+0x0000000029800000  GLOBALTIMERHI
+
+CS2R: sr
+0x0000000005000000 CLOCKLO
+0x0000000005100000 CLOCKHI
+0x0000000005200000 GLOBALTIMERLO
+0x0000000005300000 GLOBALTIMERHI
+
+B2R
+0x0000e00000000000 nop45
+
+BAR: red
+0x0000000000000000 POPC
+0x0000000800000000 AND
+0x0000001000000000 OR
+
+MEMBAR: mode
+0x0000000000000000 CTA
+0x0000000000000100 GL
+0x0000000000000200 SYS
+
+VOTE: mode
+0x0000000000000000 ALL
+0x0008000000000000 ANY
+0x0010000000000000 EQ
+
+VOTE
+0x00000000000003fc nor0
+
+BRA
+0x0000000000000200 U
+
+TLDS: chnls
+0x0010000000000000 RGBA
+
+TLDS
+0x0002000000000000 NODEP
+
+LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS
+0x0000000000000000 nor8
+
+LD, ST: type
+0x0000000000000000 .U8
+0x0100000000000000 .S8
+0x0200000000000000 .U16
+0x0300000000000000 .S16
+0x0400000000000000
+0x0400000000000000 .32
+0x0500000000000000 .64
+0x0600000000000000 .128
+
+LDX: type
+0x0000000000000000 .b32
+0x0004000000000000 .b64
+0x0008000000000000 .b96
+0x000c000000000000 .b128
+
+LD, ST: cache
+0x0000000000000000 CG
+0x1000000000000000 CS
+0x1800000000000000 CV
+0x1800000000000000 WT
+
+STG, LDS, STS, LDL, STL, LDC: type
+0x0000000000000000 .U8
+0x0008000000000000 .S8
+0x0010000000000000 .U16
+0x0018000000000000 .S16
+0x0020000000000000
+0x0020000000000000 .32
+0x0028000000000000 .64
+0x0030000000000000 .128
+
+LDG: type
+0x0000000000000000 .U8
+0x0000800000000000 .S8
+0x0001000000000000 .U16
+0x0001800000000000 .S16
+0x0002000000000000
+0x0002000000000000 .32
+0x0002800800000000 .64
+0x0003003800000000 .128
+
+LDG, STG: cache
+0x0000000000000000 CG
+0x0000000000000000 CI
+0x0000040000000000 CS
+0x0000000000000000 CV
+0x0000000000000000 WT
+
+LDG
+0x0000008000000000 E
+
+LDL: cache
+0x0000200000000000 CI
+
+LDL, STL: cache
+0x0000800000000000 CG
+0x0001000000000000 LU
+0x0001800000000000 CV
+0x0001800000000000 WT
+
+LDC: cache
+0x0000100000000000 IL
+
+STG, LDS, STS, LDL, STL, LDC
+0x0000200000000000 E
+
+LDS
+0x0008000000000000 U
+
+RED: type
+0x0000000000000000
+0x0010000000000000 .S32
+0x0020000000000000 .U64
+0x0030000000000000 .F32.FTZ.RN
+0x0040000000000000 .F16x2.FTZ.RN
+0x0050000000000000 .S64
+
+RED: mode
+0x0000000000000000 ADD
+0x0080000000000000 MIN
+0x0100000000000000 MAX
+0x0180000000000000 INC
+0x0200000000000000 DEC
+0x0280000000000000 AND
+0x0300000000000000 OR
+0x0380000000000000 XOR
+
+ATOM: type
+0x0000000000000000
+0x0002000000000000 .S32
+0x0004000000000000 .U64
+0x0006000000000000 .F32.FTZ.RN
+0x0008000000000000 .F16x2.FTZ.RN
+0x000a000000000000 .S64
+0x0002000000000000 .64
+
+ATOM, RED
+0x0008000000000000 E
+
+LD, ST
+0x0080000000000000 E
+
+ATOM: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x03f0000000000000 CAS
+
+ATOMS: type
+0x0000000000000000
+0x0000000010000000 .S32
+0x0000000020000000 .U64
+0x0000000030000000 .S64
+0x0010000000000000 .64
+
+ATOMS: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x0240000000000000 CAS
+
+BFE:REV2B
+0x0000080000000000 REV2B
+};
+
+# Build %flags from @flags: $flags{OP}{name} = bits, or $flags{OP}{group}{name} = bits after an "OP: group" header.
+our %flags;
+my (@ops, $flag);
+for my $entry (@flags) {
+    # A "0xBITS name" line assigns BITS to every op named by the latest header.
+    if (my ($hexVal, $name) = $entry =~ m'^(0x[0-9a-z]+)\s*(.*)') {
+        my $bits = hex($hexVal);
+        for my $op (@ops) {
+            if ($flag) { $flags{$op}{$flag}{$name} = $bits; }
+            else       { $flags{$op}{$name}        = $bits; }
+        }
+    }
+    # Any other line is a header: comma-separated op names, optionally ": group".
+    else {
+        my ($opList, $group) = split ':\s*', $entry;
+        @ops  = split ',\s*', $opList;
+        $flag = $group;
+    }
+}
+
+# Match $inst against $grammar->{rule}.  On success return a hashref holding a
+# copy of the named captures (%+); otherwise return nothing (bare return).
+sub parseInstruct {
+    my ($inst, $grammar) = @_;
+    if ($inst =~ $grammar->{rule}) { my %named = %+; return \%named; }
+    return;
+}
+
+# Lookup tables: a set of capture names, and a numeric code-to-code map.
+my %immedOps   = map { $_ => 1 } qw(i20 f20 d20);
+my %immedCodes = (
+    0x5c => 0x64, 0x5b => 0x6d,
+    0x59 => 0x6b, 0x58 => 0x68,
+);
+# genCode XORs ($constCodes{name} << 62) into the opcode when that capture is present.
+my %constCodes = (c20 => 0x2, c39 => 0x1);
+# Bit contributed by each reuse capture (combined with |= in genReuseCode and genCode).
+my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4);
+
+# Return the OR of the %reuseCodes bits whose capture is true in $capData
+# (a hashref of named captures); 0 when no reuse capture is set.
+sub genReuseCode {
+    my ($capData) = @_;
+    my $mask = 0;
+    for my $name (keys %reuseCodes) {
+        next unless $capData->{$name};
+        $mask |= $reuseCodes{$name};
+    }
+    return $mask;
+}
+
+# Assemble the 64-bit opcode for one parsed instruction.
+#
+#   $op      - mnemonic; selects this op's entry in %flags
+#   $grammar - grammar entry; only its {code} (the base opcode) is read here
+#   $capData - hashref of named captures; predicate and reuse keys are deleted
+#   $test    - optional arrayref that collects the name of each capture used
+#
+# Returns the list ($code, $reuse).
+sub genCode {
+    my ($op, $grammar, $capData, $test) = @_;
+
+    my $flags = $flags{$op};
+    my $code  = $grammar->{code};
+    my $reuse = 0;
+
+    # Predicate: predNum (7 when none was captured), OR-ed with 8 if predNot
+    # is present, is OR-ed in at bit 18.  A noPred capture skips the field.
+    if (exists $capData->{noPred}) {
+        delete $capData->{noPred};
+        push @$test, 'noPred' if $test;
+    }
+    else {
+        my $predBits = defined($capData->{predNum}) ? $capData->{predNum} : 7;
+        push @$test, 'predNum' if $test;
+        if (exists $capData->{predNot}) {
+            $predBits |= 8;
+            push @$test, 'predNot' if $test;
+        }
+        $code |= $predBits << 18;
+        delete @{$capData}{qw(predNum predNot)};
+    }
+
+    # Reuse captures are removed from $capData and folded into $reuse.
+    for my $name (qw(reuse1 reuse2 reuse3)) {
+        next unless delete $capData->{$name};
+        $reuse |= $reuseCodes{$name};
+        push @$test, $name if $test;
+    }
+
+    # Each remaining capture can contribute constant, operand and flag bits.
+    for my $capture (keys %$capData) {
+        my $isOperand = exists $operands{$capture};
+
+        $code ^= $constCodes{$capture} << 62 if exists $constCodes{$capture};
+
+        # r20 is skipped when an r39s20 capture is also present.
+        if ($isOperand && !($capture eq 'r20' && exists $capData->{r39s20})) {
+            $code ^= $operands{$capture}->($capData->{$capture});
+            push @$test, $capture if $test;
+        }
+
+        if (exists $flags->{$capture}) {
+            if (ref $flags->{$capture}) {
+                # Grouped flag: the captured text picks the bits.
+                $code ^= $flags->{$capture}{$capData->{$capture}};
+                push @$test, "$capture:$capData->{$capture}" if $test;
+            }
+            else {
+                # Plain flag: the capture's presence alone sets the bits.
+                $code ^= $flags->{$capture};
+                push @$test, $capture if $test;
+            }
+        }
+        elsif (!$isOperand && !$test) {
+            # Capture matched neither an operand encoder nor a flag.
+            warn "UNUSED: $op: $capture: $capData->{$capture}\n";
+            warn Dumper($flags);
+        }
+    }
+
+    return $code, $reuse;
+}
+
+
+# Line-level patterns: control code "T:G:D:S:NN" (each letter may be '-'),
+# optional @P / @!P predicate, "op rest;" instruction, and trailing comment.
+my $CtrlRe = qr'(?<ctrl>[T\-]:[G\-]:[D\-]:[S\-]:[0-9]{2})';
+my $PredRe = qr'(?<pred>@!?(?<predReg>P\d)\s+)';
+my $InstRe = qr"$PredRe?(?<op>\w+)(?<rest>[^;]*;)"o;
+my $CommRe = qr'(?<comment>.*)';
+
+# Parse one assembly source line of the form "CTRL  [@P] OP operands; comment".
+# Returns a hashref describing the line, or undef when the line does not match.
+sub processAsmLine {
+    my ($line, $lineNum) = @_;
+    return undef unless $line =~ m"^$CtrlRe(?<space>\s+)$InstRe$CommRe"o;
+    my %cap = %+;    # snapshot of the named captures
+    return {
+        lineNum => $lineNum,
+        pred    => $cap{pred},
+        predReg => $cap{predReg},
+        space   => $cap{space},
+        op      => $cap{op},
+        comment => $cap{comment},
+        inst    => normalizeSpacing($cap{pred} . $cap{op} . $cap{rest}),
+        ctrl    => readCtrl($cap{ctrl}, $line),
+    };
+}
+
+# Parse a disassembly line shaped like "  /*hexaddr*/  [@P] OP operands;  /* 0xhexcode".
+# Returns a hashref (num/code decoded from hex, ins = instruction without the
+# predicate, inst = with it), or undef when the line does not match.
+sub processSassLine {
+    my ($line) = @_;
+    return undef unless $line =~ m"^\s+/\*(?<num>[0-9a-f]+)\*/\s+$InstRe\s+/\* (?<code>0x[0-9a-f]+)"o;
+
+    my ($num, $pred, $op, $rest, $code) = @+{qw(num pred op rest code)};
+    return {
+        num  => hex($num),
+        pred => $pred,
+        op   => $op,
+        ins  => normalizeSpacing($op . $rest),
+        inst => normalizeSpacing($pred . $op . $rest),
+        code => hex($code),
+    };
+}
+
+# Decode one control-word line ("   /* 0x<hex> ...") of a disassembly listing.
+#
+#   $line - text to examine; must begin with whitespace, then "/* 0x<hex>"
+#   $ctrl - optional arrayref; gets seven 8-bit fields appended, read from
+#           bit offsets 2, 10, 18, 26, 34, 42 and 50 of the word
+#   $ruse - optional arrayref; gets seven 4-bit fields appended, read from
+#           bit offsets 17, 38, 59, 17, 38, 59 and 59 (in that order)
+#
+# Returns 1 if the line held a control word, 0 (touching nothing) otherwise.
+sub processSassCtrlLine {
+    my ($line, $ctrl, $ruse) = @_;
+
+    my @ctrlShifts = (2, 10, 18, 26, 34, 42, 50);
+    my @ruseShifts = (17, 38, 59, 17, 38, 59, 59);
+
+    return 0 unless $line =~ m'^\s+\/\* (0x[0-9a-f]+)';
+    my $word = hex($1);
+
+    # One 8-bit field per entry of @ctrlShifts.
+    if (ref $ctrl) {
+        push @$ctrl, ($word >> $_) & 0xff for @ctrlShifts;
+    }
+    # One 4-bit field per entry of @ruseShifts.
+    if (ref $ruse) {
+        push @$ruse, ($word >> $_) & 0x0f for @ruseShifts;
+    }
+
+    return 1;
+}
+# Rewrite the XMAD.LO / .LO2 / .LO2C pseudo-instructions in the source text $file into plain XMAD sequences; returns the text.
+sub replaceXMADs
+{
+    my $file = shift;
+    # XMAD.LO d, a, b, c, x  =>  XMAD.MRG x, a, b.H1, RZ;  XMAD d, a, b, c;  XMAD.PSL.CBCC d, a.H1, x.H1, d  (dies if d eq a)
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD\.LO\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<x>\w+)\s*;$CommRe/
+
+        die "XMAD.LO: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s
+%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s;
+%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;',
+                @+{qw(ctrl space pred d a b c x comment)}
+    /egmos;
+    # XMAD[mod].LO2 d, a, b, c  =>  XMAD[mod] d, a, b, c;  XMAD[mod].PSL d, a.H1, b, d  (dies if d eq a)
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>-?$immed|\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/
+
+        die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+    # XMAD[mod].LO2C d, a, b, c  =>  XMAD[mod] d, a, b, c;  XMAD[mod].PSL d, a, b.H1, d  (dies if d eq a)
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<c>\w+)\s*;$CommRe/
+
+        die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+    # Text that matched none of the three patterns is returned unchanged.
+    return $file;
+}
+# Return $inst with each tab replaced by a space and every run of two or
+# more whitespace characters collapsed into one space.
+sub normalizeSpacing {
+    my ($text) = @_;
+    $text =~ tr/\t/ /;
+    $text =~ s/\s\s+/ /g;
+    return $text;
+}
+
+# Render an 8-bit control code as the text form "T:G:D:S:NN".
+# For values 0..255 this is the inverse of readCtrl.
+#   bit 7 -> 'T', bit 6 -> 'G', bit 4 -> 'S' when set, '-' otherwise;
+#   bit 5 is inverted: 'D' when clear, '-' when set;
+#   bits 0-3 are the stall count, printed as two decimal digits.
+sub printCtrl {
+    my ($code) = @_;
+
+    my @fields = (
+        ($code & 0x80) ? 'T' : '-',
+        ($code & 0x40) ? 'G' : '-',
+        ($code & 0x20) ? '-' : 'D',
+        ($code & 0x10) ? 'S' : '-',
+        sprintf('%02d', $code & 0x0f),
+    );
+    return join ':', @fields;
+}
+# Convert a textual control code "T:G:D:S:NN" into its 8-bit value (the
+# inverse of printCtrl).  $context is only used to label error messages.
+#   Field 1 'T' sets bit 7, field 2 'G' bit 6, field 4 'S' bit 4.
+#   Field 3 is inverted: 'D' leaves bit 5 clear, anything else sets it.
+#   NN is the decimal stall count for bits 0-3.  Values above 15 used to
+#   spill silently into the flag bits; they are now rejected with die.
+sub readCtrl {
+    my ($ctrl, $context) = @_;
+    my ($texbar, $globalbar, $dual_issue, $sharedbar, $stall) = split ':', $ctrl;
+    $stall = sprintf("%d", $stall);
+    die sprintf("Control code error: stall %d does not fit in 4 bits (max 15) at: %s\n",
+                $stall, defined($context) ? $context : $ctrl)
+        if $stall != ($stall & 0x0f);
+
+    my $bits = ($texbar     eq 'T' ? 1 : 0) << 7
+             | ($globalbar  eq 'G' ? 1 : 0) << 6
+             | ($dual_issue eq 'D' ? 0 : 1) << 5
+             | ($sharedbar  eq 'S' ? 1 : 0) << 4;
+    return $bits | $stall;
+}
+
+# Return $regMap->{$regName} if that entry exists and is not a reference; otherwise $regName itself.
+sub getRegNum {
+    my ($regMap, $regName) = @_;
+    return $regName unless exists $regMap->{$regName};
+    return ref($regMap->{$regName}) ? $regName : $regMap->{$regName};
+}
+
+# Return the register(s) covered by operand r0 of a parsed instruction.
+#   $vectors - hashref: vector name => arrayref of its register names
+#   $capData - hashref of named captures; r0, type and i31w4 are read
+# Width is 2 registers for type '.64' or i31w4 '0x3', 4 registers for type
+# '.128' or i31w4 '0xf', otherwise 1.  A plain "Rn" expands to consecutive
+# registers; any other name must be a known vector (confess if it is not, or
+# if a 128-bit access names a vector that does not hold exactly 4 registers).
+# Returns nothing when r0 is absent or RZ.
+sub getVecRegisters {
+    my ($vectors, $capData) = @_;
+    my $regName = $capData->{r0} or return;
+    return if $regName eq 'RZ';
+
+    my $width = ($capData->{type} eq '.64'  || $capData->{i31w4} eq '0x3') ? 2
+              : ($capData->{type} eq '.128' || $capData->{i31w4} eq '0xf') ? 4
+              :                                                               1;
+    return $regName if $width == 1;
+
+    return map "R$_", ($1 .. $1 + $width - 1) if $regName =~ m'^R(\d+)$';
+
+    if ($width == 2) {
+        confess "$regName not a 64bit vector register" unless exists $vectors->{$regName};
+        return @{$vectors->{$regName}}[0,1];
+    }
+    confess "$regName not a 128bit vector register" unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4;
+    return @{$vectors->{$regName}};
+}
+
+# Return the register(s) named by the address operand r8 of a parsed instruction.
+#   $vectors - hashref: vector name => arrayref of its register names
+#   $capData - hashref of named captures (r8, and E when the E flag was given)
+# Returns nothing when r8 is absent or RZ, a register pair when E is present
+# (Rn,Rn+1 or the first two registers of the named vector), else just r8.
+# An unknown vector name is fatal; the known names go into the error message
+# instead of being dumped to the selected output handle.
+sub getAddrVecRegisters {
+    my ($vectors, $capData) = @_;
+    my $regName = $capData->{r8} or return;
+    return if $regName eq 'RZ';
+    return $regName unless exists $capData->{E};
+
+    return map "R$_", ($1 .. $1+1) if $regName =~ m'^R(\d+)$';
+    confess "$regName not a 64bit vector register (known vectors: "
+          . join(', ', sort keys %$vectors) . ")"
+        unless exists $vectors->{$regName};
+    return @{$vectors->{$regName}}[0,1];
+}
+
+__END__
+
+
+
diff --git a/Assembler/KeplerAs/blib/lib/auto/KeplerAs/KeplerAs/.exists b/Assembler/KeplerAs/blib/lib/auto/KeplerAs/KeplerAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/KeplerAs/blib/man1/.exists b/Assembler/KeplerAs/blib/man1/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/KeplerAs/blib/man3/.exists b/Assembler/KeplerAs/blib/man3/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/KeplerAs/blib/man3/KeplerAs::KeplerAs.3pm b/Assembler/KeplerAs/blib/man3/KeplerAs::KeplerAs.3pm
new file mode 100644
index 0000000..e7e96d8
--- /dev/null
+++ b/Assembler/KeplerAs/blib/man3/KeplerAs::KeplerAs.3pm
@@ -0,0 +1,117 @@
+.\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.29)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is turned on, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.nr rF 0
+.if \n(.g .if rF .nr rF 1
+.if (\n(rF:(\n(.g==0)) \{
+.    if \nF \{
+.        de IX
+.        tm Index:\\$1\t\\n%\t"\\$2"
+..
+.        if !\nF==2 \{
+.            nr % 0
+.            nr F 2
+.        \}
+.    \}
+.\}
+.rr rF
+.\" ========================================================================
+.\"
+.IX Title "KeplerAs::KeplerAs 3pm"
+.TH KeplerAs::KeplerAs 3pm "2018-11-05" "perl v5.22.1" "User Contributed Perl Documentation"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+KeplerAs::KeplerAs \- Assembler for NVIDIA Maxwell architecture
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\&    KeplerAs.pl [opts]
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+See the documentation at: https://github.com/NervanaSystems/KeplerAs
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+See the documentation at: https://github.com/NervanaSystems/KeplerAs
+.SH "AUTHOR"
+.IX Header "AUTHOR"
+Scott Gray, <sgray@nervanasys.com>
+.SH "COPYRIGHT AND LICENSE"
+.IX Header "COPYRIGHT AND LICENSE"
+The \s-1MIT\s0 License (\s-1MIT\s0)
+.PP
+Copyright (c) 2014 Scott Gray
+.PP
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the \*(L"Software\*(R"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+.PP
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+.PP
+\&\s-1THE SOFTWARE IS PROVIDED \*(L"AS IS\*(R", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.\s0
diff --git a/Assembler/KeplerAs/blib/script/.exists b/Assembler/KeplerAs/blib/script/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/KeplerAs/blib/script/KeplerAs.pl b/Assembler/KeplerAs/blib/script/KeplerAs.pl
new file mode 100755
index 0000000..268cc85
--- /dev/null
+++ b/Assembler/KeplerAs/blib/script/KeplerAs.pl
@@ -0,0 +1,275 @@
+#!/usr/bin/perl
+use strict;
+use KeplerAs::Cubin;
+use KeplerAs::KeplerAs;
+use Data::Dumper;
+use File::Spec;
+# Command-line front end for the KeplerAs assembler: the first argument selects the mode to run.
+require 5.10.0;
+
+$Data::Dumper::Sortkeys = 1;
+
+my $mode = shift;
+# --list: print the cubin summary line, then every kernel and every symbol it contains.
+if ($mode =~ /^\-?\-l/i)
+{
+    my $cubinFile = shift or usage();
+
+    my $cubin = KeplerAs::Cubin->new($cubinFile);
+
+    my $arch    = $cubin->arch;
+    my $class   = $cubin->class;
+    my $asize   = $cubin->address_size;
+    my $kernels = $cubin->listKernels;
+    my $symbols = $cubin->listSymbols;
+
+    printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $cubinFile, $arch, $class, $asize;
+
+    foreach my $ker (sort keys %$kernels)
+    {
+        printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $ker, @{$kernels->{$ker}}{qw(Linkage ParamCnt size RegCnt SharedSize BarCnt)};
+    }
+    foreach my $sym (sort keys %$symbols)
+    {
+        printf "Symbol: %s\n", $sym;
+    }
+}
+elsif ($mode =~ /^\-?\-t/i) # --test: run KeplerAs::KeplerAs::Test over a sass listing or a cubin
+{
+    my $reg  = $ARGV[0] =~ /^\-?\-r/i ? shift @ARGV : undef; # not "my ... if": that form is undefined behaviour
+    my $all  = $ARGV[0] =~ /^\-?\-a/i ? shift @ARGV : undef;
+    my $file = shift or usage();
+    my $fh;
+    if (-T $file)
+    {
+        open $fh, '<', $file or die "$file: $!";
+    }
+    else
+    {
+        my $cubin = KeplerAs::Cubin->new($file);
+        my $arch  = $cubin->arch;
+        # Not a text file: treat it as a cubin and read the sass listing from a cuobjdump pipe.
+        open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!";
+        my $first = <$fh>;
+        if ($first =~ /cuobjdump fatal/)
+        {
+            print $first;
+            exit(1);
+        }
+    }
+    exit(KeplerAs::KeplerAs::Test($fh, $reg, $all) ? 1 : 0);
+}
+elsif ($mode =~ /^\-?\-e/i) # --extract: dump one kernel of a cubin as re-assemblable asm
+{
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    my $cubinFile = shift or usage();
+    my $asmFile   = shift;
+    my $cubin     = KeplerAs::Cubin->new($cubinFile);
+    my $arch      = $cubin->arch;
+    my $kernels   = $cubin->listKernels;
+    # Without --kernel, fall back to the first kernel in sorted name order.
+    $kernelName ||= (sort keys %$kernels)[0];
+
+    my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName";
+
+    open my $in, "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile |" or die "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile: $!";
+    my $first = <$in>;
+    if ($first =~ /cuobjdump fatal/)
+    {
+        print $first;
+        exit(1);
+    }
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+    # Emit the '#'-prefixed header; insert mode reads the kernel name back from the "# Kernel:" line.
+    print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n";
+
+    print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt));
+
+    print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n";
+
+    print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params};
+
+    print $out "#\n# Instructions:\n\n";
+
+    KeplerAs::KeplerAs::Extract($in, $out, $kernel->{Params});
+
+    close $out if $asmFile;
+    close $in;
+}
+elsif ($mode =~ /^\-?\-s/i) # sass mode: run Extract on an existing sass listing with an empty parameter list
+{
+    my $sassFile  = shift or usage();
+    my $asmFile   = shift;
+
+    open my $in, '<', $sassFile or die "$sassFile: $!";
+
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+
+    KeplerAs::KeplerAs::Extract($in, $out, []);
+
+    close $out if $asmFile;
+    close $in;
+}
+elsif ($mode =~ /^\-?\-i/i) # --insert: assemble an asm file and put the result into a cubin
+{
+    my $nowarn;
+    if ($ARGV[0] =~ /^\-?\-w/i)
+    {
+        $nowarn = shift;
+    }
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    my $noReuse   = $ARGV[0] =~ /^\-?\-n/i ? shift @ARGV : undef;
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift;
+        { no strict 'refs'; ${"KeplerAs::KeplerAs::CODE::$name"} = defined $value ? $value : ''; } # no string eval: quotes in $value are data
+    }
+
+    my $asmFile   = shift or usage();
+    my $cubinFile = shift or usage();
+    my $newCubin  = shift || $cubinFile;
+
+    my $file;
+    if (open my $fh, '<', $asmFile)
+    {
+        local $/;
+        $file = <$fh>;
+        close $fh;
+    }
+    else { die "$asmFile: $!" }
+
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName;
+    die "asm file missing kernel name or is badly formatted" unless $kernelName;
+
+    my $kernel = KeplerAs::KeplerAs::Assemble($file, $include, !$noReuse, $nowarn);
+
+    my $cubin  = KeplerAs::Cubin->new($cubinFile);
+    $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName";
+
+    $cubin->modifyKernel(%$kernel);
+
+    $cubin->write($newCubin);
+    # The kernel name is passed as an argument and the literal percent sign is written as %%.
+    printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n",
+        $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)};
+
+}
+elsif ($mode =~ /^\-?\-p/i) # --pre: run only the preprocessor and print or save its output
+{
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift;
+        { no strict 'refs'; ${"KeplerAs::KeplerAs::CODE::$name"} = defined $value ? $value : ''; } # no string eval: quotes in $value are data
+    }
+    my $debug     = $ARGV[0] =~ /^\-?\-d/i ? shift @ARGV : undef;
+    my $asmFile   = shift or usage();
+    my $asmFile2  = shift;
+
+    die "source and destination probably shouldn't be the same file\n" if $asmFile eq $asmFile2;
+
+    open my $fh, '<', $asmFile or die "$asmFile: $!";
+    local $/;
+    my $file = <$fh>;
+    close $fh;
+
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    if ($asmFile2)
+    {
+        open $fh, '>', $asmFile2 or die "$asmFile2: $!";
+    }
+    else
+    {
+        $fh = \*STDOUT;
+    }
+    print $fh KeplerAs::KeplerAs::Preprocess($file, $include, $debug);
+    close $fh if $asmFile2;
+}
+elsif ($mode =~ /^\-?\-v/i)
+{
+    print "$KeplerAs::KeplerAs::VERSION\n";
+}
+else
+{
+    print "$mode\n";
+    usage();
+}
+
+exit(0);
+
+
+
+sub usage # Print the command-line help text to STDOUT and exit with status 1.
+{
+    print <<EOF;
+Usage:
+
+  List kernels and symbols:
+
+    KeplerAs.pl --list|-l <cubin_file>
+
+  Test a cubin or sass file to see if the assembler can reproduce all of the contained opcodes.
+  Also useful for extending the missing grammar rules.  Defaults to only showing failures without --all.
+  With the --reg flag it will show register bank conflicts not hidden by reuse flags.
+
+    KeplerAs.pl --test|-t [--reg|-r] [--all|-a] <cubin_file | cuobjdump_sass_file>
+
+  Extract a single kernel into an asm file from a cubin.
+  Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin.
+
+    KeplerAs.pl --extract|-e [--kernel|-k kernel_name] <cubin_file> [asm_file]
+
+  Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes.
+  Include the debug flag to print out detailed scheduler info.
+
+    KeplerAs.pl --pre|-p [-D<name> <value>]... [--debug|-d] <asm_file> [new_asm_file]
+
+  Insert the kernel asm back into the cubin.  Overwrite existing or create new cubin.
+  Optionally you can skip register reuse flag auto insertion.  This allows you to observe
+  performance without any reuse or you can use it to set the flags manually in your sass.
+
+    KeplerAs.pl --insert|-i [--nowarn|-w] [--kernel|-k kernel_name] [--noreuse|-n] [-D<name> <value>]... <asm_file> <cubin_file> [new_cubin_file]
+
+  Display version information and exit:
+
+    KeplerAs.pl --version|-v
+
+EOF
+    exit(1);
+}
+
+__END__
diff --git a/Assembler/KeplerAs/lib/KeplerAs/Cubin.pm b/Assembler/KeplerAs/lib/KeplerAs/Cubin.pm
new file mode 100644
index 0000000..867342d
--- /dev/null
+++ b/Assembler/KeplerAs/lib/KeplerAs/Cubin.pm
@@ -0,0 +1,604 @@
+package KeplerAs::Cubin;
+# Reads, modifies and rewrites cubin (ELF) files: headers, sections, symbols and per-kernel metadata.
+use strict;
+use Data::Dumper;
+# Each layout table below alternates a pack/unpack template code with the name of the field it decodes.
+my @Elf32_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    L   entry
+    L   phOffset
+    L   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+my @Elf64_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    Q   entry
+    Q   phOffset
+    Q   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+my @Elf32_PrgHdr = qw(
+    L   type
+    L   offset
+    L   vaddr
+    L   paddr
+    L   fileSize
+    L   memSize
+    L   flags
+    L   align
+);
+my @Elf64_PrgHdr = qw(
+    L   type
+    L   flags
+    Q   offset
+    Q   vaddr
+    Q   paddr
+    Q   fileSize
+    Q   memSize
+    Q   align
+);
+my @Elf32_SecHdr = qw(
+    L   name
+    L   type
+    L   flags
+    L   addr
+    L   offset
+    L   size
+    L   link
+    L   info
+    L   align
+    L   entSize
+);
+my @Elf64_SecHdr = qw(
+    L   name
+    L   type
+    Q   flags
+    Q   addr
+    Q   offset
+    Q   size
+    L   link
+    L   info
+    Q   align
+    Q   entSize
+);
+my @Elf32_SymEnt = qw(
+    L   name
+    L   value
+    L   size
+    C   info
+    C   other
+    S   shIndx
+);
+my @Elf64_SymEnt = qw(
+    L   name
+    C   info
+    C   other
+    S   shIndx
+    Q   value
+    Q   size
+);
+my @symBind = qw(LOCAL GLOBAL WEAK); # indexed by the high nibble of a symbol's info byte
+# Index 1 holds the 32-bit layouts and index 2 the 64-bit ones; the header's fileClass value is the index.
+my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC);
+# ...T arrays: template strings joined from the short entries (template codes are at most 3 characters).
+$elfHdrT[1] = join '', grep { length($_) <= 3} @Elf32_Hdr;
+$prgHdrT[1] = join '', grep { length($_) <= 3} @Elf32_PrgHdr;
+$secHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SecHdr;
+$symHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SymEnt;
+
+$elfHdrT[2] = join '', grep { length($_) <= 3} @Elf64_Hdr;
+$prgHdrT[2] = join '', grep { length($_) <= 3} @Elf64_PrgHdr;
+$secHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SecHdr;
+$symHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SymEnt;
+# ...C arrays: the field names (entries longer than 3 characters), in the same order as the template.
+$elfHdrC[1] = [ grep { length($_) > 3} @Elf32_Hdr    ];
+$prgHdrC[1] = [ grep { length($_) > 3} @Elf32_PrgHdr ];
+$secHdrC[1] = [ grep { length($_) > 3} @Elf32_SecHdr ];
+$symHdrC[1] = [ grep { length($_) > 3} @Elf32_SymEnt ];
+
+$elfHdrC[2] = [ grep { length($_) > 3} @Elf64_Hdr    ];
+$prgHdrC[2] = [ grep { length($_) > 3} @Elf64_PrgHdr ];
+$secHdrC[2] = [ grep { length($_) > 3} @Elf64_SecHdr ];
+$symHdrC[2] = [ grep { length($_) > 3} @Elf64_SymEnt ];
+
+sub new # KeplerAs::Cubin->new($file): parse the cubin $file and return the populated object; dies if it cannot be opened.
+{
+    my ($package, $file) = @_;
+    # Results are stored on the object: elfHdr, prgHdrs, secHdrs, Kernels, Symbols, Class, Arch, AddressSize.
+    my $cubin = bless { fileName => $file }, $package;
+    # Three-argument open so that special characters in $file are never taken as an open mode.
+    open my $fh, '<', $file or die "$file: $!";
+    binmode($fh);
+    # Decode with the 32-bit layout first: fileClass sits at the same position in both layouts.
+    my $data;
+    read $fh, $data, 0x34;
+    my $elfHdr = $cubin->{elfHdr} = {};
+    @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data;
+
+    my $class = $elfHdr->{fileClass};
+
+    if ($class == 2)
+    {
+        seek $fh, 0, 0;
+        read $fh, $data, 0x40; # the 64-bit header template covers exactly 64 bytes
+        @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data;
+
+        $cubin->{Class} = 64;
+    }
+    else
+    {
+        $cubin->{Class} = 32;
+    }
+    # NOTE(review): Arch is hard-coded, so the sm_35 check below can never fire; confirm this is intended.
+    $cubin->{Arch} = "35";
+    die "Cubin not in sm_35. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} != 35;
+    $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32;
+    # Program headers.
+    seek $fh, $elfHdr->{phOffset}, 0;
+    foreach (1 .. $elfHdr->{phNum})
+    {
+        read $fh, $data, $elfHdr->{phEntSize};
+
+        my %prgHdr = (Indx => $_ - 1);
+        @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data;
+        push @{$cubin->{prgHdrs}}, \%prgHdr;
+    }
+    # Section headers.
+    seek $fh, $elfHdr->{shOffset}, 0;
+    foreach (1 .. $elfHdr->{shNum})
+    {
+        read $fh, $data, $elfHdr->{shEntSize};
+
+        my %secHdr = (Indx => $_ - 1);
+        @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data;
+        push @{$cubin->{secHdrs}}, \%secHdr;
+    }
+    # Load each section's raw bytes (type 8 sections are not read) and decode string and symbol tables.
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $data = '';
+        if ($secHdr->{size} && $secHdr->{type} != 8)
+        {
+            seek $fh, $secHdr->{offset}, 0;
+            read $fh, $data, $secHdr->{size};
+        }
+        if ($secHdr->{type} == 3) # STRTAB
+        {
+            my $strTab = $secHdr->{StrTab} = {};
+            my $indx   = 0;
+            foreach my $str (split "\0", $data)
+            {
+                $strTab->{$indx} = $str;
+                $indx += 1 + length($str);
+            }
+        }
+        if ($secHdr->{type} == 2) # SYMTAB
+        {
+            my $offset = 0;
+            while ($offset < $secHdr->{size})
+            {
+                my $symEnt = {};
+                @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize});
+                $offset += $secHdr->{entSize};
+
+                push @{$secHdr->{SymTab}}, $symEnt;
+            }
+        }
+        $secHdr->{Data} = unpack 'H*', $data;
+    }
+    close $fh;
+    # Name the sections via the section-header string table and index them by name on the object.
+    my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab};
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $secHdr->{Name} = $shStrTab->{$secHdr->{name}};
+        $cubin->{$secHdr->{Name}} = $secHdr;
+    }
+    # Resolve symbol names, then collect kernels (low nibble of info == 2) and symbols with info bit 0x10 set.
+    my $strTab = $cubin->{'.strtab'}{StrTab};
+    foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}})
+    {
+        $symEnt->{Name} = $strTab->{$symEnt->{name}};
+
+        my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}];
+        $secHdr->{SymbolEnt} = $symEnt;
+
+        if (($symEnt->{info} & 0x0f) == 0x02)
+        {
+            my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr;
+
+            $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4];
+
+            $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ];
+
+            $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20;
+
+            $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24;
+
+            my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"};
+            $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0;
+
+            $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"};
+
+            my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"};
+            if ($paramSec)
+            {
+                my @data = unpack "L*", pack "H*", $paramSec->{Data};
+
+                $paramSec->{ParamData} = \@data;
+                $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ];
+                # Skip ahead to the 0x00080a04 record; the 0x000c1704 records parsed below start 4 words later.
+                my $idx = 0;
+                $idx++ while $idx < @data && $data[$idx] != 0x00080a04;
+
+                my $first = $data[$idx+2] & 0xFFFF;
+                $idx = $idx + 4 > @data ? scalar(@data) : $idx + 4; # never step past the end of the section data
+
+                my @params;
+                while ($idx < @data && $data[$idx] == 0x000c1704)
+                {
+                    my $ord    = $data[$idx+2] & 0xFFFF;
+                    my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16);
+                    my $psize  = $data[$idx+3] >> 18;
+                    my $align  = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0;
+                    unshift @params, "$ord:$offset:$psize:$align";
+                    $idx += 4;
+                }
+                my @staticParams = @data[0 .. ($idx-1)];
+                # Everything up to here is saved unchanged as StaticParams; the remaining records are decoded below.
+
+                my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize);
+                while ($idx < @data)
+                {
+                    my $code = $data[$idx] & 0xffff;
+                    my $size = $data[$idx] >> 16;
+                    $idx++;
+                    # $code selects the record kind; $size is the payload length in bytes (for 0x1b03, the value itself).
+
+                    if ($code == 0x1b03)
+                    {
+                        $maxregCount = $size;
+                    }
+                    elsif ($code == 0x1d04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @ctaidOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    elsif ($code == 0x1c04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @exitOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    elsif ($code == 0x0401)
+                    {
+                        $ctaidzUsed = 1;
+                    }
+                    elsif ($code == 0x1004)
+                    {
+                        while ($size > 0)
+                        {
+                            push @reqntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    elsif ($code == 0x0504)
+                    {
+                        while ($size > 0)
+                        {
+                            push @maxntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    elsif ($code == 0x1e04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @stackSize, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    else
+                    {
+                        printf "Unknown Code 0x%02x (size:%d)\n", $code, $size;
+                    }
+                }
+                $kernelSec->{Params}   = \@params;
+                $kernelSec->{ParamCnt} = scalar @params;
+
+                $paramSec->{StaticParams} = \@staticParams;
+                $paramSec->{MAXREG_COUNT} = $maxregCount;
+                $paramSec->{ExitOffsets}  = \@exitOffsets;
+                $paramSec->{CTAIDOffsets} = \@ctaidOffsets;
+                $paramSec->{CTAIDZUsed}   = $ctaidzUsed;
+                $paramSec->{REQNTID}      = \@reqntid;
+                $paramSec->{MAXNTID}      = \@maxntid;
+                $paramSec->{STACKSIZE}    = \@stackSize;
+            }
+        }
+        elsif (($symEnt->{info} & 0x10) == 0x10)
+        {
+            $cubin->{Symbols}{$symEnt->{Name}} = $symEnt;
+        }
+    }
+
+
+    return $cubin;
+}
+sub class # Accessor: the Class value stored by new() (32 or 64).
+{
+    return $_[0]{Class};
+}
+sub arch # Accessor: the Arch value stored by new().
+{
+    return $_[0]{Arch};
+}
+sub address_size # Accessor: the AddressSize value stored by new() (32 or 64).
+{
+    return $_[0]{AddressSize};
+}
+sub listKernels # Accessor: hash ref mapping kernel name to its section header (undef if none were found).
+{
+    return $_[0]{Kernels};
+}
+sub listSymbols # Accessor: hash ref mapping symbol name to its symbol entry (undef if none were found).
+{
+    return $_[0]{Symbols};
+}
+sub getKernel # $cubin->getKernel($name): section header of the named kernel, or undef when there is none.
+{
+    my $self = shift;
+    return $self->{Kernels}{ $_[0] };
+}
+
+sub modifyKernel # $cubin->modifyKernel(%params): install new code and metadata into one kernel of the cubin.
+{
+    # Keys read from %params:
+    #   Kernel       - the kernel's section header, as returned by getKernel
+    #   KernelData   - array ref holding the new code as 64-bit words
+    #   RegCnt       - register count (at most 255)
+    #   BarCnt       - barrier count (at most 16)
+    #   ExitOffsets  - array ref written out as the 0x1c04 record
+    #   CTAIDOffsets - array ref written out as the 0x1d04 record
+    #   CTAIDZUsed   - true to emit the 0x0401 record
+    # Prints a "Modified ..." line for each value that differs from what the cubin held,
+    # and calls updateSize when the parameter section or the kernel itself changes size.
+    # Dies if a limit is exceeded or the code size is not a multiple of 64 bytes.
+    my ($cubin, %params) = @_;
+
+    # Unpack the request.
+    my ($kernelSec, $newData) = @params{qw(Kernel KernelData)};
+    my ($newReg, $newBar)     = @params{qw(RegCnt BarCnt)};
+    my ($exitOffsets, $ctaidOffsets, $ctaidzUsed)
+        = @params{qw(ExitOffsets CTAIDOffsets CTAIDZUsed)};
+    my $newSize = 8 * @$newData;
+
+    # Reject impossible requests before anything is modified.
+    if ($newReg > 255) { die "255 register max" }
+    if ($newSize & 63) { die "new kernel size must be multiple of 8 instructions (64 bytes)" }
+    if ($newBar > 16)  { die "16 is max barrier count" }
+
+    # Existing metadata attached to the kernel section.
+    my $paramSec    = $kernelSec->{ParamSec};
+    my $kernelName  = $kernelSec->{SymbolEnt}{Name};
+    my $maxregCount = $paramSec->{MAXREG_COUNT};
+    my $stackSize   = $paramSec->{STACKSIZE};
+
+    # Store the code both as the word list and as the hex string form of the same bytes.
+    $kernelSec->{KernelData} = $newData;
+    $kernelSec->{Data}       = unpack "H*", pack "Q*", @$newData;
+
+    # The register count is kept in the top byte of the section header's info field.
+    unless ($newReg == $kernelSec->{RegCnt})
+    {
+        print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n";
+        $kernelSec->{RegCnt} = $newReg;
+        $kernelSec->{info}   = ($kernelSec->{info} & ~0xff000000) | ($newReg << 24);
+    }
+
+    # The barrier count is the five-bit field at bits 20..24 of the section flags.
+    unless ($newBar == $kernelSec->{BarCnt})
+    {
+        print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n";
+        $kernelSec->{BarCnt} = $newBar;
+        $kernelSec->{flags}  = ($kernelSec->{flags} & ~0x01f00000) | ($newBar << 20);
+    }
+
+    # Rebuild the parameter section: the saved leading words first, then one record per attribute.
+    # A record header holds its payload size in bytes in the high 16 bits;
+    # payload words are 4 bytes each, hence "count << 18".
+    my @paramData = @{$paramSec->{StaticParams}};
+    push @paramData, ($maxregCount << 16) | 0x1b03
+        if defined $maxregCount;
+
+    # CTAID offsets.
+    my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets;
+    my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}};
+    print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n"
+        if $newCTAIDs ne $oldCTAIDs;
+    push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04, @$ctaidOffsets
+        if @$ctaidOffsets;
+
+    # Exit offsets.
+    my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets;
+    my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}};
+    print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n"
+        if $newExits ne $oldExits;
+    push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04, @$exitOffsets
+        if @$exitOffsets;
+
+    # CTAID.Z usage marker (a record without payload words).
+    print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n"
+        if $ctaidzUsed != $paramSec->{CTAIDZUsed};
+    push @paramData, 0x0401
+        if $ctaidzUsed;
+
+    # These attributes are not part of %params; whatever the parameter section held is written back.
+    foreach my $attr ([REQNTID => 0x1004], [MAXNTID => 0x0504])
+    {
+        my ($key, $code) = @$attr;
+        next unless @{$paramSec->{$key}};
+        push @paramData,
+            (scalar(@{$paramSec->{$key}}) << 18) | $code,
+            @{$paramSec->{$key}};
+    }
+
+    # Stack size, likewise carried over unchanged.
+    push @paramData, (scalar(@$stackSize) << 18) | 0x1e04, @$stackSize
+        if @$stackSize;
+
+    # Serialise the rebuilt section and propagate a size change to the file layout.
+    my $newParamSize  = 4 * @paramData;
+    $paramSec->{Data} = unpack "H*", pack "L*", @paramData;
+    unless ($newParamSize == $paramSec->{size})
+    {
+        print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n";
+        $cubin->updateSize($paramSec, $newParamSize);
+    }
+
+    # A kernel size change additionally passes a true third argument to updateSize.
+    unless ($newSize == $kernelSec->{size})
+    {
+        print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n";
+        $cubin->updateSize($kernelSec, $newSize, 1);
+    }
+}
+
+sub updateSize # $cubin->updateSize($sec, $newSize, $updatePrgSize): resize one section and recompute file offsets.
+{
+    my ($cubin, $sec, $newSize, $updatePrgSize) = @_;
+    # Offsets of the sections, the section header table and the program header table are all recomputed.
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+
+    my $delta = $newSize - $sec->{size};
+    $sec->{size} = $newSize;
+    # A section with a symbol entry: store the new size there and re-serialise the whole symbol table.
+    if ($sec->{SymbolEnt})
+    {
+        $sec->{SymbolEnt}{size} = $newSize;
+        my $symSection = $cubin->{'.symtab'};
+        $symSection->{Data} = '';
+        foreach my $symEnt (@{$symSection->{SymTab}})
+        {
+            $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}};
+        }
+    }
+    # Lay the sections out again, starting right after the ELF header and honouring each alignment.
+    my $pos = $elfHdr->{ehSize};
+    my %sizeMap;
+    # %sizeMap records old offset => new offset so that program headers can be relocated further down.
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        next if $secHdr->{align} == 0;
+
+        my $size = $secHdr->{type} == 8 ? 0 : $secHdr->{size}; # type 8 sections take no room in the file
+
+        my $pad = $pos % $secHdr->{align};
+        if ($pad > 0)
+        {
+            $pos += $secHdr->{align} - $pad;
+        }
+        $sizeMap{$secHdr->{offset}} = $pos;
+
+        $secHdr->{offset} = $pos;
+
+        $pos += $size;
+    }
+    # The section header table follows the last section; the program headers keep their distance from it.
+    my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset};
+
+    $sizeMap{$elfHdr->{shOffset}} = $pos;
+    $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize;
+
+    $elfHdr->{shOffset} = $pos;
+    $elfHdr->{phOffset} = $pos + $shSize;
+    # NOTE(review): a program header whose old offset is not a key of %sizeMap ends up with an undef offset.
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}};
+        # With $updatePrgSize set, a type-1 program header that contains $sec changes size by the same delta.
+        if ($updatePrgSize && $prgHdr->{type} == 1 &&
+            $sec->{offset} >= $prgHdr->{offset} &&
+            $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta)
+        {
+            $prgHdr->{fileSize} += $delta;
+            $prgHdr->{memSize}  += $delta;
+        }
+    }
+}
+
+sub write # $cubin->write($file): serialise the cubin to $file; dies if the file cannot be opened or closed cleanly.
+{
+    my ($cubin, $file) = @_;
+    # Three-argument open: $file is always taken as a plain path, never as a mode or a pipe.
+    open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
+    binmode($fh);
+
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+    # ELF header first; $pos tracks the file position so that padding can be computed.
+    print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};
+    my $pos = $elfHdr->{ehSize};
+    # Section contents, each padded with NUL bytes up to its alignment; empty and type-8 sections are skipped.
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        next if $secHdr->{size} == 0 || $secHdr->{type} == 8;
+
+        my $pad = $secHdr->{align} ? $pos % $secHdr->{align} : 0; # align 0: no padding instead of "modulus zero"
+        if ($pad > 0)
+        {
+            $pad = $secHdr->{align} - $pad;
+            print {$fh} "\0" x $pad;
+            $pos += $pad;
+        }
+
+        print $fh pack 'H*', $secHdr->{Data};
+        $pos += $secHdr->{size};
+    }
+    # Section header table, then the program header table.
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
+    }
+
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
+    }
+    close $fh or die "Error: could not finish writing $file: $!"; # buffered write errors surface at close
+}
+
+__END__
+
diff --git a/Assembler/KeplerAs/lib/KeplerAs/KeplerAs.pm b/Assembler/KeplerAs/lib/KeplerAs/KeplerAs.pm
new file mode 100644
index 0000000..34dfbcd
--- /dev/null
+++ b/Assembler/KeplerAs/lib/KeplerAs/KeplerAs.pm
@@ -0,0 +1,1196 @@
+package KeplerAs::KeplerAs;
+# Assembler core: defines Assemble and Preprocess; KeplerAs.pl also calls Extract and Test from this package.
+require 5.10.0;
+
+use strict;
+use Data::Dumper;
+use KeplerAs::KeplerAsGrammar;
+use File::Spec;
+use Carp;
+
+our $VERSION = '1.06';
+# Jump ops whose label operand Assemble encodes relative to the instruction after the jump.
+my %relOffset  = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT);
+# Jump ops whose label operand Assemble encodes as an absolute offset.
+my %absOffset  = map { $_ => 1 } qw(JCAL);
+# Every op that takes a label operand.
+my %jumpOp     = (%relOffset, %absOffset);
+# Ops whose register operands are recorded by Assemble as uses rather than as a destination write.
+my %noDest     = map { $_ => 1 } qw(ST STG STS STL RED);
+# Bit assigned to each source-operand slot in an instruction's reuse flags.
+my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4);
+
+sub Assemble
+{
+    my ($file, $include, $doReuse, $nowarn) = @_;
+
+    my $regMap = {};
+    $file = Preprocess($file, $include, 0, $regMap);
+    my $vectors = delete $regMap->{__vectors};
+    my $regBank = delete $regMap->{__regbank};
+
+    my $regCnt = 0;
+    my $barCnt = 0;
+
+    my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse);
+
+    push @instructs, $ctrl = {};
+
+    foreach my $line (split "\n", $file)
+    {
+        $lineNum++;
+
+        next unless preProcessLine($line);
+
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+
+            push @branches, @instructs+0 if exists $jumpOp{$inst->{op}};
+
+            push @{$ctrl->{ctrl}}, $inst->{ctrl};
+
+            $inst->{ctrl} = $ctrl;
+
+            push @instructs, $inst;
+            push @instructs, $ctrl = {} if ((@instructs & 7) == 0);
+        }
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            $labels{$1} = @instructs+0;
+        }
+        else
+        {
+            die "badly formed line at $lineNum: $line\n";
+        }
+    }
+    push @{$ctrl->{ctrl}}, 0x00;
+    push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' };
+    while (@instructs & 7)
+    {
+        push @instructs, $ctrl = {} if ((@instructs & 7) == 0);
+        push @{$ctrl->{ctrl}}, 0x00;
+        push @instructs, { op => 'NOP', inst => 'NOP;' };
+    }
+
+    foreach my $i (@branches)
+    {
+        if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1})
+            { die "instruction has invalid label: $instructs[$i]{inst}"; }
+
+        $instructs[$i]{jump} = $labels{$1};
+
+        if (exists $relOffset{$instructs[$i]{op}})
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; }
+        else
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; }
+    }
+
+    foreach my $i (0 .. $#instructs)
+    {
+        next unless $i & 7;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            if ($doReuse)
+            {
+                my @r0 = getVecRegisters($vectors, $capData);
+
+
+                if (@r0 && !exists $noDest{$op})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        if (my $reuse = $reuse{$slot})
+                        {
+                            delete $reuse->{$_} foreach @r0;
+                        }
+                    }
+                }
+                %reuse = () if exists $jumpOp{$op};
+
+                if ($gram->{type}{reuse})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        next unless exists $capData->{$slot};
+
+                        my $r = $capData->{$slot};
+                        next if $r eq 'RZ';
+                        next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction
+
+                        my $reuse = $reuse{$slot} ||= {};
+
+                        if (my $p = $reuse->{$r})
+                        {
+                            $instructs[$p]{ctrl}{reuse}[($p & 7) - 1] |= $reuseSlots{$slot};
+
+                        }
+                        elsif (keys %$reuse > 2)
+                        {
+                            my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0];
+                            delete $reuse->{$oldest};
+                        }
+                        $reuse->{$r} = $i;
+                    }
+                }
+            }
+            elsif ($gram->{type}{reuse})
+            {
+                $ctrl->{reuse}[($i & 7) - 1] = genReuseCode($capData);
+            }
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    foreach my $r (sort keys %$regBank)
+    {
+        my $bank  = $regBank->{$r};
+        my $avail = $regMap->{$r};
+        foreach my $pos (0 .. $#$avail)
+        {
+            if ($bank == ($avail->[$pos] & 7))
+            {
+                $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+                last;
+            }
+        }
+    }
+
+    my (%liveTime, %pairedBanks, %reuseHistory);
+    foreach my $i (0 .. $#instructs)
+    {
+        next unless $i & 7;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            my $capData   = parseInstruct($inst, $gram) or next;
+            my $reuseType = $gram->{type}{reuse};
+
+            my (%addReuse, %delReuse);
+            foreach my $slot (qw(r8 r20 r39))
+            {
+                my $r = $capData->{$slot} or next;
+                next if $r eq 'RZ';
+
+                my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r};
+
+                if (my $liveTime = $liveTime{$liveR})
+                {
+                    push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                }
+                else
+                {
+                    warn "register used without initialization ($r): $inst\n" unless $nowarn;
+                    push @{$liveTime{$liveR}}, [$i,$i];
+                }
+
+                my $slotHist  = $reuseHistory{$slot} ||= {};
+                my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0;
+
+
+                if (!$selfReuse && ref $regMap->{$r})
+                {
+                    foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39))
+                    {
+                        my $r2 = $capData->{$slot2};
+                        next if $r2 eq 'RZ' || $r2 eq $r;
+
+                        my $slotHist2 = $reuseHistory{$slot2} ||= {};
+
+
+                        if (!$reuseType || !exists $slotHist2->{$r2})
+                        {
+                            if (ref $regMap->{$r2})
+                            {
+                                push @{$pairedBanks{$r}{pairs}}, $r2;
+                                $pairedBanks{$r}{banks} ||= [];
+                            }
+                            else
+                            {
+                                my $bank = substr($regMap->{$r2},1) & 7;
+
+                                $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++;
+                                $pairedBanks{$r}{pairs} ||= [];
+                            }
+                            $pairedBanks{$r}{useCnt}++;
+                        }
+                    }
+                }
+                if ($reuseType)
+                {
+                    if ($ctrl->{reuse}[($i & 7) - 1] & $reuseSlots{$slot})
+                        { $addReuse{$slot} = $r; }
+                    else
+                        { $delReuse{$slot} = $r; }
+                }
+            }
+            $reuseHistory{$_}{$addReuse{$_}} = 1    foreach keys %addReuse;
+            delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse;
+
+            foreach my $r0 (getVecRegisters($vectors, $capData))
+            {
+                my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0};
+
+                if (exists $noDest{$op})
+                {
+                    if (my $liveTime = $liveTime{$liveR})
+                    {
+                        push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                    }
+                    else
+                    {
+                        warn "register used without initialization ($r0): $inst\n" unless $nowarn;
+                        push @{$liveTime{$liveR}}, [$i,$i];
+                    }
+                }
+                elsif (my $liveTime = $liveTime{$liveR})
+                {
+                    if ($i > $liveTime->[$#$liveTime][1])
+                    {
+                        push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                    }
+                }
+                else
+                {
+                    push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                }
+            }
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    foreach my $r (sort {
+                    $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} ||
+                    $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} ||
+                    $a cmp $b
+                  } keys %pairedBanks)
+    {
+        my $banks = $pairedBanks{$r}{banks};
+        my $avail = $regMap->{$r};
+
+
+        BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..7))
+        {
+            foreach my $pos (0 .. $#$avail)
+            {
+                if ($bank == ($avail->[$pos] & 7))
+                {
+                    $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+
+                    $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}};
+                    last BANK;
+                }
+            }
+        }
+    }
+    foreach my $r (sort keys %$regMap)
+    {
+        if (ref($regMap->{$r}) eq 'ARRAY')
+        {
+            $regMap->{$r} = 'R' . shift @{$regMap->{$r}};
+        }
+    }
+
+    foreach my $i (0 .. $#instructs)
+    {
+        next unless $i & 7;
+
+        $instructs[$i]{orig} = $instructs[$i]{inst};
+        $instructs[$i]{inst} =~ s/(?<!\.)\b(\w+)\b(?!\[)/ exists($regMap->{$1}) ? $regMap->{$1} : $1 /ge;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            foreach my $r (qw(r0 r8 r20 r39))
+            {
+                next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ';
+
+                my $val = substr $capData->{$r}, 1;
+
+                my @r0 = getVecRegisters($vectors, $capData);
+                my @r8 = getAddrVecRegisters($vectors, $capData);
+
+                my $regInc = $r eq 'r0' ? scalar(@r0) || 1 : 1;
+                my $regInc = $r eq 'r8' ? scalar(@r8) || 1 : 1;
+
+                if ($val + $regInc > $regCnt)
+                {
+                    $regCnt = $val + $regInc;
+                }
+            }
+            if ($op eq 'BAR')
+            {
+                if (exists $capData->{i8w4})
+                {
+                    $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt;
+                }
+                elsif (exists $capData->{r8})
+                {
+                    $barCnt = 16;
+                }
+            }
+            my ($code, $reuse) = genCode($op, $gram, $capData);
+            $instructs[$i]{code} = $code;
+
+            if ($gram->{type}{reuse})
+                { $instructs[$i]{caps} = $capData; }
+            else
+                { $ctrl->{reuse}[($i & 7) - 1] = $reuse; }
+
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed);
+    foreach my $i (0 .. $#instructs)
+    {
+        if ($i & 7)
+        {
+            push @codes, $instructs[$i]{code};
+            my $code_dec= $instructs[$i]{code};
+            my $code_hex = sprintf("0x%x", $code_dec);
+
+            if ($instructs[$i]{caps})
+            {
+                registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 7) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn);
+            }
+            if ($instructs[$i]{inst} =~ m'EXIT')
+            {
+                push @exitOffsets, (scalar(@codes)-1)*8;
+            }
+            elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)')
+            {
+                push @ctaidOffsets, (scalar(@codes)-1)*8;
+                $ctaidzUsed = 1 if $1 eq 'Z';
+            }
+        }
+        else
+        {
+            my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)};
+            push @codes,
+                ($ctrl->[0] <<  2) | ($ctrl->[1] << 10) | ($ctrl->[2] << 18) | # ctrl codes
+                ($ctrl->[3] << 26) | ($ctrl->[4] << 34) | ($ctrl->[5] << 42) |
+                ($ctrl->[6] << 50) | (0x0800000000000000);  # reuse codes
+        }
+    }
+
+    return {
+        RegCnt       => $regCnt,
+        BarCnt       => $barCnt,
+        ExitOffsets  => \@exitOffsets,
+        CTAIDOffsets => \@ctaidOffsets,
+        CTAIDZUsed   => $ctaidzUsed,
+        ConflictCnt  => $reuseHistory{conflicts},
+        ReuseCnt     => $reuseHistory{reuse},
+        ReuseTot     => $reuseHistory{total},
+        ReusePct     => ($reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0),
+        KernelData   => \@codes,
+    };
+}
+
+# Self test: re-encodes every instruction of a SASS listing and checks the
+# result against the op code and reuse flags recorded in the listing.
+#
+# Parameters:
+#   $fh             - input handle; control lines are located with
+#                     processSassCtrlLine() and each following instruction
+#                     line is parsed with processSassLine()
+#   $printConflicts - when true the instruction text is handed to
+#                     registerHealth(), otherwise an empty string is passed
+#   $all            - when true passing instructions are reported as well
+#                     (failing ones are always reported)
+#
+# Prints one table per op code plus bank-conflict / reuse / pass / fail
+# totals and returns the number of failed instructions.
+sub Test
+{
+    my ($fh, $printConflicts, $all) = @_;
+
+    my @instructs;
+    my %reuseHistory;
+    my ($pass, $fail) = (0,0);
+
+    while (my $line = <$fh>)
+    {
+        my (@ctrl, @reuse);
+
+        # skip everything up to the next line recognised as a control line
+        next unless processSassCtrlLine($line, \@ctrl, \@reuse);
+
+        # one instruction line is consumed for every reuse entry of the control line
+        foreach my $fileReuse (@reuse)
+        {
+            $line = <$fh>;
+
+            my $inst = processSassLine($line) or next;
+
+            $inst->{reuse} = $fileReuse;
+            my $fileCode = $inst->{code};
+
+            # for ops listed in %relOffset the first hex literal is rewritten to
+            # (literal - $inst->{num} - 8), masked to 24 bits
+            if (exists $relOffset{$inst->{op}})
+            {
+                $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e;
+            }
+
+            # the first grammar rule that parses the instruction decides the grade
+            my $match = 0;
+            foreach my $gram (@{$grammar{$inst->{op}}})
+            {
+                my $capData = parseInstruct($inst->{inst}, $gram) or next;
+                my @caps;
+
+                my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps);
+
+                registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? $inst->{inst} : '') if $gram->{type}{reuse};
+
+                # the xor values show which bits differ from the listing
+                $inst->{caps}      = join ', ', sort @caps;
+                $inst->{codeDiff}  = $fileCode  ^ $code;
+                $inst->{reuseDiff} = $fileReuse ^ $reuse;
+
+                if ($code == $fileCode && $reuse == $fileReuse)
+                {
+                    $inst->{grade} = 'PASS';
+                    push @instructs, $inst if $all;
+                    $pass++;
+                }
+                else
+                {
+                    $inst->{grade} = 'FAIL';
+                    push @instructs, $inst;
+                    $fail++;
+                }
+                $match = 1;
+                last;
+            }
+            # no rule parsed the instruction: count it as a failure
+            unless ($match)
+            {
+                $inst->{grade}     = 'FAIL';
+                $inst->{codeDiff}  = $fileCode;
+                $inst->{reuseDiff} = $fileReuse;
+                push @instructs, $inst;
+                $fail++;
+            }
+        }
+    }
+    # widest instruction text per op code, used as column width of the report
+    my %maxLen;
+    foreach (@instructs)
+    {
+        $maxLen{$_->{op}} = length($_->{ins}) if length($_->{ins}) > $maxLen{$_->{op}};
+    }
+    my ($lastOp, $template);
+    foreach my $inst (sort {
+        $a->{op}        cmp $b->{op}        ||
+        $a->{codeDiff}  <=> $b->{codeDiff}  ||
+        $a->{reuseDiff} <=> $b->{reuseDiff} ||
+        $a->{ins}       cmp $b->{ins}
+        } @instructs)
+    {
+        # a new op code starts a new table with its own header and row template
+        if ($lastOp ne $inst->{op})
+        {
+            $lastOp   = $inst->{op};
+            $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s   %s\n";
+            printf "\n%s %-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s   %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures);
+        }
+        printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)};
+    }
+    my $reusePct = $reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0;
+
+    # a literal percent sign has to be written as %% inside a printf format
+    printf "\nRegister Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\nOp Code Coverage Totals: Pass: $pass Fail: $fail\n",
+        $reuseHistory{conflicts}, $reusePct, $reuseHistory{reuse}, $reuseHistory{total};
+
+    return $fail;
+}
+
+# Turns a SASS listing into assembler source text.
+#
+#   $in     - handle the listing is read from
+#   $out    - handle the generated text is printed to
+#   $params - array ref of "ordinal:offset:size:align" strings, one per
+#             kernel parameter (the align field is not used here)
+#
+# A CONSTANT_MAPPING section naming the launch dimensions and the kernel
+# parameters is written first, followed by the instructions with jump targets
+# replaced by TARGETn labels and known c[0x0][..] operands replaced by name.
+sub Extract
+{
+    my ($in, $out, $params) = @_;
+
+    # launch dimension constants, kept in the alphabetical order they are printed in
+    my @builtins =
+    (
+        [ blockDimX => 'c[0x0][0x28]' ],
+        [ blockDimY => 'c[0x0][0x2c]' ],
+        [ blockDimZ => 'c[0x0][0x30]' ],
+        [ gridDimX  => 'c[0x0][0x34]' ],
+        [ gridDimY  => 'c[0x0][0x38]' ],
+        [ gridDimZ  => 'c[0x0][0x3c]' ],
+    );
+
+    # constant bank operand => symbolic name
+    my %paramMap;
+
+    print $out "<CONSTANT_MAPPING>\n";
+    foreach my $pair (@builtins)
+    {
+        my ($name, $addr) = @$pair;
+        print $out "    $name : $addr\n";
+        $paramMap{$addr} = $name;
+    }
+    print $out "\n";
+
+    foreach my $spec (@$params)
+    {
+        my ($ord, $offset, $size, $align) = split ':', $spec;
+
+        # parameters of at most 4 bytes get a single name at their offset as given
+        unless ($size > 4)
+        {
+            my $name = sprintf 'param_%d', $ord;
+            my $addr = sprintf 'c[0x0][%s]', $offset;
+            $paramMap{$addr} = $name;
+            print $out "    $name : $addr\n";
+            next;
+        }
+
+        # larger parameters are split into 4 byte elements param_N[0], param_N[1], ...
+        my $addrNum = hex $offset;
+        for (my $idx = 0; $size > 0; $idx++, $size -= 4, $addrNum += 4)
+        {
+            my $name = sprintf 'param_%d[%d]', $ord, $idx;
+            my $addr = sprintf 'c[0x0][0x%x]', $addrNum;
+            $paramMap{$addr} = $name;
+            print $out "    $name : $addr\n";
+        }
+    }
+    print $out "</CONSTANT_MAPPING>\n\n";
+
+    my %labels;          # jump target value => label name
+    my $labelnum = 1;    # number used for the next new label
+    my @data;            # instructions kept for output, in listing order
+
+    FILE: while (my $line = <$in>)
+    {
+        my (@ctrl, @ruse);
+        processSassCtrlLine($line, \@ctrl, \@ruse) or next FILE;
+
+        # one instruction line follows for every control entry
+        CTRL: foreach my $ctrl (@ctrl)
+        {
+            $line = <$in>;
+
+            my $inst = processSassLine($line);
+            next CTRL unless $inst;
+
+            if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)')
+            {
+                my $target = hex($1);
+
+                # a BRA to its own {num} (or {num}-8) ends the extraction
+                if ($inst->{op} eq 'BRA')
+                {
+                    last FILE if $target == $inst->{num} || $target == $inst->{num}-8;
+                }
+
+                # labels are numbered in order of first use
+                my $label = ($labels{$target} ||= "TARGET" . $labelnum++);
+                $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/;
+            }
+
+            # use the symbolic name wherever a mapped constant operand appears
+            $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $paramMap{$1 . $2} || $1 . $2 /eg;
+
+            $inst->{ctrl} = printCtrl($ctrl);
+
+            push @data, $inst;
+        }
+    }
+
+    foreach my $inst (@data)
+    {
+        my $num = $inst->{num};
+        print $out "$labels{$num}:\n" if exists $labels{$num};
+        printf $out "%s %5s%s\n", $inst->{ctrl}, $inst->{pred}, $inst->{ins};
+    }
+}
+
+# Patterns for the tags recognised in the assembler source text. All are
+# compiled with /m and /s, so ^ matches at every line start and . also
+# matches newlines.
+# <COMMENT> ... </COMMENT> block
+my $CommentRe  = qr'^[\t ]*<COMMENT>.*?^\s*</COMMENT>\n?'ms;
+# <INCLUDE file="..."/> tag; $1 is the file name
+my $IncludeRe  = qr'^[\t ]*<INCLUDE\s+file="([^"]+)"\s*/?>\n?'ms;
+# <CODE> ... </CODE> block; $1 is an optional number that the closing tag must repeat, $2 the body
+my $CodeRe     = qr'^[\t ]*<CODE(\d*)>(.*?)^\s*<\/CODE\1>\n?'ms;
+# <CONSTANT_MAPPING> ... </CONSTANT_MAPPING> block; $1 is the body
+my $ConstMapRe = qr'^[\t ]*<CONSTANT_MAPPING>(.*?)^\s*</CONSTANT_MAPPING>\n?'ms;
+# <REGISTER_MAPPING> ... </REGISTER_MAPPING> block; $1 is the body
+my $RegMapRe   = qr'^[\t ]*<REGISTER_MAPPING>(.*?)^\s*</REGISTER_MAPPING>\n?'ms;
+# <SCHEDULE_BLOCK> ... </SCHEDULE_BLOCK> block; $1 is the body
+my $ScheduleRe = qr'^[\t ]*<SCHEDULE_BLOCK>(.*?)^\s*</SCHEDULE_BLOCK>\n?'ms;
+# inline code: [+ ... +] or [- ... -]; $1 is the sign, $2 the code
+my $InlineRe   = qr'\[(\+|\-)(.+?)\1\]'ms;
+
+# Returns the complete content of an included file as one string.
+#
+#   $file    - path as written in the INCLUDE tag
+#   $include - array ref holding the volume and directory that are combined
+#              with the bare file name when $file itself cannot be opened
+#
+# Dies when the file cannot be opened at either location.
+sub IncludeFile
+{
+    my ($file, $include) = @_;
+    my (undef, undef, $name) = File::Spec->splitpath($file);
+    # undefined record separator: a single read returns the whole file
+    local $/;
+    my $fh;
+    # three-argument open: the name comes from the text being assembled and
+    # must never be interpreted as an open mode or a piped command
+    if (!open $fh, '<', $file)
+    {
+        open $fh, '<', File::Spec->catpath(@$include, $name) or die "Could not open file for INCLUDE: $file ($!)\n";
+    }
+    my $content = <$fh>;
+    close $fh;
+    return $content;
+}
+
+# Runs all preprocessing steps on the text of an assembler source file.
+#
+#   $file    - complete source text (not a file name)
+#   $include - passed through to IncludeFile() for INCLUDE tags
+#   $debug   - passed through to Scheduler()
+#   $regMap  - optional hash ref filled with the register mapping; when it is
+#              supplied the REGISTER_MAPPING blocks are removed from the text,
+#              otherwise they are left in place
+#
+# Returns the processed text. Dies when a CODE block or inline code fails.
+sub Preprocess
+{
+    my ($file, $include, $debug, $regMap) = @_;
+
+    my $constMap = {};
+    my $removeRegMap;
+    if ($regMap)
+        { $removeRegMap = 1; }
+    else
+        { $regMap = {}; }
+
+    # expand INCLUDE tags, repeating until a pass replaces nothing (included text may include further files)
+    1 while $file =~ s|$IncludeRe| IncludeFile($1, $include) |eg;
+
+    # drop COMMENT blocks
+    $file =~ s|$CommentRe||g;
+
+    # replace every CODE block with the value of its body, evaluated as perl
+    # in package KeplerAs::KeplerAs::CODE; repeated until no block is left
+    1 while $file =~ s|$CodeRe|
+        my $out = eval "package KeplerAs::KeplerAs::CODE; $2";
+        $@ ? die("CODE:\n$2\n\nError: $@\n") : $out |eg;
+
+    # inline code: [+ code +] is replaced with the value of the code,
+    # [- code -] is evaluated too but replaced with an empty string
+    $file =~ s|$InlineRe|
+        my ($type, $code) = ($1, $2);
+        my $out = eval "package KeplerAs::KeplerAs::CODE; $code";
+        $@ ? die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg;
+
+    # collect the constant names; each block is replaced with setConstMap's return value
+    $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg;
+
+    # substitute constant names (optionally followed by [digits]) on every
+    # line that does not start with a # or // comment
+    my @newFile;
+    foreach my $line (split "\n", $file)
+    {
+        if ($line !~ m'^\s*(?:#|//).*')
+        {
+            $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg;
+        }
+        push @newFile, $line;
+    }
+    $file = join "\n", @newFile;
+
+    # fill the register map; the block text ($&) is kept unless the caller passed a map
+    $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg;
+
+    # bodies of all SCHEDULE_BLOCKs, in file order
+    my @schedBlocks = $file =~ /$ScheduleRe/g;
+
+    foreach my $i (0 .. $#schedBlocks)
+    {
+        $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]);
+
+        # blocks are numbered from 1
+        $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug);
+    }
+
+    # put each processed body back in place of its block, tags included
+    $file =~ s|$ScheduleRe| shift @schedBlocks |eg;
+
+    return $file;
+}
+
+# Capture names that Scheduler treats as operands. A %destReg capture goes to
+# the destination list (unless the op is listed in %noDest), every other
+# capture goes to the source list.
+my %srcReg   = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X);
+my %destReg  = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC);
+# union of both sets: every capture name that takes part in dependency tracking
+my %regops   = (%srcReg, %destReg);
+# fields of a grammar rule's {type} hash that Scheduler copies onto each instruction
+my @itypes   = qw(class lat rlat tput dual);
+
+# Reorders the instructions of one SCHEDULE_BLOCK and assigns stall counts.
+#
+# Parameters:
+#   $block    - text of the block, one instruction per line
+#   $blockNum - block number, only used in error messages
+#   $regMap   - register map; $regMap->{__vectors} is consulted for vector
+#               operands
+#   $debug    - when true the ready list and each scheduling step is printed
+#
+# Returns the scheduled instructions as text, one per line, each prefixed
+# with its control code (printCtrl). Comment-only lines are collected in
+# @comments but are not part of the returned text. Dies on labels, misplaced
+# <ORDERED> tags, malformed lines and instructions no grammar rule matches.
+sub Scheduler
+{
+    my ($block, $blockNum, $regMap, $debug) = @_;
+
+    my $vectors = $regMap->{__vectors};
+    my $lineNum = 0;
+
+    # pass 1: parse the block into instruction records
+    my (@instructs, @comments, $ordered, $first);
+    foreach my $line (split "\n", $block)
+    {
+        $lineNum++;
+
+        # blank and comment-only lines carry no instruction
+        unless (preProcessLine($line))
+        {
+            push @comments, $line if $line =~ m'\S';
+            next;
+        }
+
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # {first} is 0 only for the very first instruction of the block and only
+            # when its ctrl has one of the bits 0x1f800 set; it then sorts ahead of all others
+            $inst->{first}   = !$first++ && ($inst->{ctrl} & 0x1f800) ? 0 : 1;
+
+            $inst->{exeTime} = 0;
+            # inside <ORDERED> each instruction receives a running sequence number
+            $inst->{order}   = $ordered++ if $ordered;
+            push @instructs, $inst;
+        }
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n";
+        }
+        elsif ($line =~ m'^<ORDERED>')
+        {
+            die "you cannot use nested <ORDERED> tags" if $ordered;
+            $ordered = 1;
+        }
+        elsif ($line =~ m'^</ORDERED>')
+        {
+            die "missing opening <ORDERED> for closing </ORDERED> tag" if !$ordered;
+            $ordered = 0;
+        }
+        else
+        {
+            die "badly formed line at block: $blockNum line: $lineNum: $line\n";
+        }
+    }
+
+    # pass 2: build the dependency graph. %writes / %reads map an operand to
+    # the instructions that wrote / read it so far.
+    my (%writes, %reads, @ready, @schedule, $orderedParent);
+    foreach my $instruct (@instructs)
+    {
+        my $match = 0;
+        foreach my $gram (@{$grammar{$instruct->{op}}})
+        {
+            my $capData = parseInstruct($instruct->{inst}, $gram) or next;
+            my (@dest, @src);
+
+            # copy the scheduling properties of the matching rule onto the instruction
+            @{$instruct}{@itypes} = @{$gram->{type}}{@itypes};
+
+            # the guarding predicate is read by the instruction
+            push @src, $instruct->{predReg} if $instruct->{pred};
+
+            # P2R / R2P with a mask: predicates selected by the mask are sources of P2R
+            # and destinations of R2P; R2P additionally reads the predicates not selected
+            if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7})
+            {
+                my $list = $instruct->{op} eq 'R2P' ? \@dest : \@src;
+                my $mask = hex($capData->{i20w7});
+                foreach my $p (0..6)
+                {
+                    if ($mask & (1 << $p))
+                    {
+                        push @$list, "P$p";
+                    }
+                    elsif ($instruct->{op} eq 'R2P')
+                    {
+                        push @src, "P$p";
+                    }
+                }
+            }
+
+            foreach my $operand (grep { exists $regops{$_} } sort keys %$capData)
+            {
+                my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src;
+
+                # RZ and PT are skipped: they never create a dependency
+                my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT';
+
+                if ($capData->{$operand} ne $badVal)
+                {
+                    # r0 and r8 may stand for a whole register vector; CC and X both map to CC
+                    push @$list,
+                        $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) :
+                        $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) :
+                        $operand eq 'CC' ? 'CC' :
+                        $operand eq 'X'  ? 'CC' :
+                        getRegNum($regMap, $capData->{$operand});
+                }
+            }
+            $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39});
+
+            # read after write: this instruction becomes a child of the writers of its sources
+            foreach my $src (grep { exists $writes{$_} } @src)
+            {
+                my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat};
+
+                foreach my $parent (@{$writes{$src}})
+                {
+                    # a fixed latency of 13 is used for predicates, otherwise the writer's latency
+                    my $latency = $src =~ m'^P\d' ? 13 : $parent->{lat};
+                    push @{$parent->{children}}, [$instruct, $latency - $regLatency];
+                    $instruct->{parents}++;
+
+                    # writers are stored newest first; stop at the first unpredicated one
+                    last unless $parent->{pred};
+                }
+            }
+
+            # write after read: earlier readers of a destination must be issued first (latency 0)
+            foreach my $dest (grep { exists $reads{$_} } @dest)
+            {
+                foreach my $reader (@{$reads{$dest}})
+                {
+                    push @{$reader->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                delete $reads{$dest} unless $instruct->{pred};
+            }
+
+            # chain instructions carrying a true {order} so that they keep their relative order
+            if ($instruct->{order})
+            {
+                if ($orderedParent)
+                {
+                    push @{$orderedParent->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                $orderedParent = $instruct;
+            }
+            elsif ($orderedParent)
+                {  $orderedParent = 0; }
+
+            # newest writer goes to the front of the list
+            unshift @{$writes{$_}}, $instruct foreach @dest;
+
+            push @{$reads{$_}}, $instruct foreach @src;
+
+            # without parents the instruction can be issued right away
+            push @ready, $instruct if !exists $instruct->{parents};
+
+            $match = 1;
+            last;
+        }
+        # report the line of the offending instruction, not the last line of the block
+        die "Unable to recognize instruction at block: $blockNum line: $instruct->{lineNum}: $instruct->{inst}\n" unless $match;
+    }
+    %writes = ();
+    %reads  = ();
+
+    if (@ready)
+    {
+        # artificial root node so that the dependency counts cover the whole graph
+        my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" };
+
+        countUniqueDescendants($readyParent, {});
+        updateDepCounts($readyParent, {});
+
+        # initial order: {first}, then most dependants, then source order
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    # pass 3: repeatedly issue the head of the ready list
+    my $clock = 0;
+    while (my $instruct = shift @ready)
+    {
+        my $stall = $instruct->{stall};
+
+        if (@schedule && $stall < 16)
+        {
+            # the stall is written into the ctrl of the previously issued instruction:
+            # its low four bits are cleared first (five bits when the stall is above 4)
+            my $prev = $schedule[$#schedule];
+
+            $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0;
+            $prev->{ctrl} |= $stall;
+            $clock += $stall;
+        }
+        else
+        {
+            # otherwise the instruction's own ctrl gets a stall of 1
+            $instruct->{ctrl} &= 0x1fff0;
+            $instruct->{ctrl} |= 1;
+            $clock += 1;
+        }
+        print "$clock: $instruct->{inst}\n" if $debug;
+
+        push @schedule, $instruct;
+
+        if (my $children = $instruct->{children})
+        {
+            foreach (@$children)
+            {
+                my ($child, $latency) = @$_;
+
+                # a child cannot execute before every parent's result is available
+                my $earliest = $clock + $latency;
+                $child->{exeTime} = $earliest if $child->{exeTime} < $earliest;
+
+                print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug;
+
+                # ready once its last parent has been issued
+                push @ready, $child if --$child->{parents} < 1;
+            }
+            delete $instruct->{children};
+        }
+
+        # recompute the stall every ready instruction would need if it were issued next
+        foreach my $ready (@ready)
+        {
+            $stall = $ready->{exeTime} - $clock;
+            $stall = 1 if $stall < 1;
+
+            # same class as the instruction just issued: at least its throughput
+            if ($ready->{class} eq $instruct->{class})
+            {
+                $stall = $ready->{tput} if $stall < $ready->{tput};
+            }
+            # stall 0 pairs the candidate with the instruction just issued
+            elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 &&
+                   $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const}))
+            {
+                $stall = 0;
+            }
+            $ready->{stall} = $stall;
+
+            # 1 when the candidate's class differs from the one just issued
+            $ready->{mix} = $ready->{class} ne $instruct->{class} || 0;
+        }
+
+        # priority: {first}, smallest stall, class change, most dependants, source order
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $a->{stall}   <=> $b->{stall}  ||
+            $b->{mix}     <=> $a->{mix}    ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "\tf,ext,stl,mix,dep,lin, inst\n";
+            # the column is stored under the key 'first', as in the initial dump above
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    my $out;
+    $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule;
+    return $out;
+}
+
+# Adds the "name : value" lines of a CONSTANT_MAPPING body to $constMap.
+#
+#   $constMap     - hash ref that receives name => value
+#   $constMapText - text between the CONSTANT_MAPPING tags
+#
+# Everything from a # or // to the end of a line is ignored, as are lines
+# that are blank afterwards. Returns nothing.
+sub setConstMap
+{
+    my ($constMap, $constMapText) = @_;
+
+    for my $entry (split "\n", $constMapText)
+    {
+        # trim the indentation, cut the comment, trim what is left at the end
+        for ($entry)
+        {
+            s|^\s+||;
+            s{(?:#|//).*}{};
+            s|\s+$||;
+        }
+
+        if ($entry =~ m'\S')
+        {
+            my ($key, $mapped) = split '\s*:\s*', $entry;
+            $constMap->{$key} = $mapped;
+        }
+    }
+    return;
+}
+
+# Parses the body of a REGISTER_MAPPING block into $regMap.
+#
+#   $regMap     - hash ref that receives name => register; the entries
+#                 {__vectors} and {__regbank} are created when missing
+#   $regmapText - text between the REGISTER_MAPPING tags
+#
+# Each line has the form "numbers SEP names":
+#   numbers - comma separated register numbers or ranges (a-b)
+#   SEP     - ':' fixed mapping, the n-th name gets 'R' . the n-th number
+#             '~' automatic mapping, every name gets a reference to the
+#                 (shared) list of candidate numbers
+#             '=' shared mapping, every name gets 'R' . the single number
+#   names   - comma separated names; name<0-3|8>suffix expands to one name
+#             per index, and a trailing [0-3] requests a bank (honoured for
+#             '~' lines only)
+#
+# For fixed mappings a name on an even register number additionally starts a
+# vector of 2 or 4 names in {__vectors}, provided the registers involved are
+# numbered consecutively.
+#
+# Dies on bad numbers or names, on a count mismatch and on duplicate names.
+sub setRegisterMap
+{
+    my ($regMap, $regmapText) = @_;
+
+    my $vectors = $regMap->{__vectors} ||= {};
+    my $regBank = $regMap->{__regbank} ||= {};
+    my %aliases;
+
+    foreach my $line (split "\n", $regmapText)
+    {
+        # strip indentation, comments and trailing blanks; skip empty lines
+        $line =~ s|^\s+||;
+        $line =~ s{(?:#|//).*}{};
+        $line =~ s|\s+$||;
+        next unless $line =~ m'\S';
+
+        my $auto  = $line =~ /~/;
+        my $share = $line =~ /=/;
+
+        my ($regNums, $regNames) = split '\s*[:~=]\s*', $line;
+
+        my (@numList, @nameList, %vecAliases);
+        foreach my $num (split '\s*,\s*', $regNums)
+        {
+            my ($start, $stop) = split '\s*\-\s*', $num;
+            die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop;
+            # a single number is a range of one
+            push @numList, ($start .. $stop||$start);
+        }
+        foreach my $fullName (split '\s*,\s*', $regNames)
+        {
+            # name<indices>suffix[bank]: one register name per index
+            if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$')
+            {
+                my ($name1, $name2, $bank) = ($1, $3, $4);
+                foreach (split '\s*\|\s*', $2)
+                {
+                    my ($start, $stop) = split '\s*\-\s*';
+                    foreach my $r (map "$name1$_$name2", $start .. $stop||$start)
+                    {
+                        # remember the index-free name as a possible alias for the vector
+                        $aliases{$r} = "$name1$name2" unless exists $aliases{$r};
+                        push @nameList, $r;
+                        $regBank->{$r} = $bank if $auto && defined $bank;
+                        warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank;
+                    }
+                }
+            }
+            # plain name with optional [bank]
+            elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$')
+            {
+                push @nameList, $1;
+                $regBank->{$1} = $2 if $auto && defined $2;
+                warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2;
+            }
+            else
+            {
+                die "Bad register name: '$fullName' at: $line\n";
+            }
+        }
+        die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList;
+        die "Missmatched register mapping at: $line\n" if $share && @numList > 1;
+
+        # coarse check that the number list counts up in steps of one; the scan
+        # stops before the final pair, so every vector is verified again below
+        my $i = 0;
+        while ($i < $#numList-1)
+        {
+            last if $numList[$i] + 1 != $numList[$i+1];
+            $i++;
+        }
+        my $ascending = $i+1 == $#numList;
+
+        foreach my $n (0..$#nameList)
+        {
+            die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]};
+
+            if ($auto)
+            {
+                $regMap->{$nameList[$n]} = \@numList;
+            }
+            elsif ($share)
+            {
+                $regMap->{$nameList[$n]} = 'R' . $numList[0];
+            }
+            else
+            {
+                $regMap->{$nameList[$n]} = 'R' . $numList[$n];
+                if ($ascending && ($numList[$n] & 1) == 0)
+                {
+                    # pair when the number has bit 1 set or fewer than four names remain, else quad
+                    my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3);
+                    if ($end <= $#nameList)
+                    {
+                        # a vector is only valid when its registers are numbered consecutively
+                        my $gaps = grep { $numList[$_] + 1 != $numList[$_ + 1] } $n .. $end - 1;
+                        if (!$gaps)
+                        {
+                            $vectors->{$nameList[$n]} = [ @nameList[$n .. $end] ];
+                            # the first vector of an expanded name also answers to the index-free name
+                            if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}})
+                            {
+                                $regMap->{$aliases{$nameList[$n]}}  = $regMap->{$nameList[$n]};
+                                $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]};
+                                delete $aliases{$nameList[$n]};
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+# Strips leading whitespace from the caller's line (the argument is modified
+# in place) and reports whether anything other than whitespace remains once
+# a # or // comment has been cut off. The comment itself stays in the
+# caller's line.
+sub preProcessLine
+{
+    # $_[0] is an alias of the caller's variable
+    s|^\s+|| for $_[0];
+
+    # the comment is removed from a private copy only
+    (my $code = shift) =~ s{(?:#|//).*}{};
+
+    return $code =~ m'\S';
+}
+
+# Records in $node->{deps} (a hash keyed by lineNum) every node that can be
+# reached from $node through edges with a non-zero latency.
+#
+#   $node  - graph node; {children} holds [child, latency] pairs
+#   $edges - hash of "parentLine^childLine" keys for edges already followed
+#
+# Returns the node's own lineNum followed by the keys collected in {deps};
+# a node without a {children} entry returns just its lineNum.
+sub countUniqueDescendants
+{
+    my ($node, $edges) = @_;
+
+    my $kids = $node->{children};
+    return $node->{lineNum} unless $kids;
+
+    for my $edge (@$kids)
+    {
+        # edges with latency 0 do not count
+        next unless $edge->[1];
+
+        # follow every edge only once
+        my $target = $edge->[0];
+        next if $edges->{"$node->{lineNum}^$target->{lineNum}"}++;
+
+        $node->{deps}{$_}++ for countUniqueDescendants($target, $edges);
+    }
+    return ($node->{lineNum}, keys %{$node->{deps}});
+}
+# Walks the graph below $node and replaces every {deps} hash with the number
+# of keys it holds; a node without a {deps} hash ends up with its current
+# value forced to a number (0 when it had none).
+#
+#   $node  - graph node; {children} holds [child, latency] pairs
+#   $edges - hash of "parentLine^childLine" keys for edges already followed
+sub updateDepCounts
+{
+    my ($node, $edges) = @_;
+
+    for my $edge (@{ $node->{children} || [] })
+    {
+        my $kid = $edge->[0];
+        my $key = "$node->{lineNum}^$kid->{lineNum}";
+
+        # descend through every edge only once
+        updateDepCounts($kid, $edges) unless $edges->{$key}++;
+    }
+
+    my $deps = $node->{deps};
+    $node->{deps} = ref $deps ? scalar(keys %$deps) : $deps + 0;
+}
+
+# Updates the reuse / bank conflict statistics for one instruction.
+#
+#   $reuseHistory - hash ref carried across calls: counters {total}, {reuse}
+#                   and {conflicts}, plus one hash per operand slot holding
+#                   the registers currently flagged for reuse
+#   $reuseFlags   - reuse bits of the instruction, tested against %reuseSlots
+#   $capData      - captures of the instruction (slots r8, r20 and r39)
+#   $instAddr     - address printed in the conflict message
+#   $inst         - instruction text for the message; no message when empty
+#   $nowarn       - suppresses the message when true
+#
+# A conflict is counted when a register that is not served from the reuse
+# history falls into the same bank (register number & 7) as a different such
+# register of the same instruction. Returns the number of registers recorded
+# as conflicting.
+sub registerHealth
+{
+    my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_;
+
+    my (@banks, @conflicts);
+
+    foreach my $slot (qw(r8 r20 r39))
+    {
+        my $r = $capData->{$slot} or next;
+        next if $r eq 'RZ';
+
+        my $slotHist = $reuseHistory->{$slot} ||= {};
+
+        $reuseHistory->{total}++;
+
+        # a register flagged for reuse in this slot is not fetched from a bank
+        if (exists $slotHist->{$r})
+        {
+            $reuseHistory->{reuse}++;
+        }
+        else
+        {
+            my $bank = substr($r,1) & 7;
+
+            if ($banks[$bank] && $banks[$bank] ne $r)
+            {
+                # the register that occupied the bank first is listed once
+                push @conflicts, $banks[$bank] if !@conflicts;
+                push @conflicts, $r;
+
+                $reuseHistory->{conflicts}++;
+            }
+            $banks[$bank] = $r;
+        }
+
+        # remember or forget the register according to this instruction's reuse flag
+        if ($reuseFlags & $reuseSlots{$slot})
+            { $slotHist->{$r} = 1; }
+        else
+            { delete $slotHist->{$r};  }
+    }
+    if ($inst && @conflicts && !$nowarn)
+    {
+        # the instruction text is passed as an argument so that a '%' in it
+        # cannot be taken for a printf conversion
+        printf "CONFLICT at 0x%04x (%s): %s\n", $instAddr, join(',', @conflicts), $inst;
+    }
+    return scalar @conflicts;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+KeplerAs::KeplerAs - Assembler for NVIDIA Kepler architecture
+
+=head1 SYNOPSIS
+
+    KeplerAs.pl [opts]
+
+=head1 DESCRIPTION
+
+See the documentation at: https://github.com/NervanaSystems/KeplerAs
+
+=head1 SEE ALSO
+
+See the documentation at: https://github.com/NervanaSystems/KeplerAs
+
+
+=head1 AUTHOR
+
+Scott Gray, E<lt>sgray@nervanasys.comE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Scott Gray
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+=cut
diff --git a/Assembler/KeplerAs/lib/KeplerAs/KeplerAsGrammar.pm b/Assembler/KeplerAs/lib/KeplerAs/KeplerAsGrammar.pm
new file mode 100644
index 0000000..d372ea3
--- /dev/null
+++ b/Assembler/KeplerAs/lib/KeplerAs/KeplerAsGrammar.pm
@@ -0,0 +1,1659 @@
+package KeplerAs::KeplerAsGrammar;
+
+use strict;
+use Carp;
+use Exporter;
+use Data::Dumper;
+our @ISA = qw(Exporter);
+
+our @EXPORT = qw(
+    %grammar %flags
+    parseInstruct genCode genReuseCode
+    processAsmLine processSassLine processSassCtrlLine
+    replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters
+);
+
+require 5.10.0;
+
+# Encode an integer immediate operand into its instruction bit field.
+#
+#   $orig - operand text: decimal, hex (0x.. or 0X..), or "<mul>x<expr>" where
+#           <expr> is Perl code evaluated in package KeplerAs::KeplerAs::CODE;
+#           an optional leading '-' negates the value
+#   $pos  - number of bits the value is shifted left by
+#   $mask - mask of the field, applied before shifting
+#
+# Returns the value shifted into place. Dies when the expression cannot be
+# evaluated or when the value does not fit in $mask.
+sub getI
+{
+    my ($orig, $pos, $mask) = @_;
+    my $val = $orig;
+    my $neg = $val =~ s|^\-||;
+
+    if ($val =~ m'^(\d+)[xX]<([^>]+)>')
+    {
+        my ($mul, $exp) = ($1, $2);
+        # strip leading zeros so decimal literals are not read as octal by eval
+        $exp =~ s/(?<!\d)0+(?=[1-9])//g;
+        # declare every $name in the expression so it compiles under strict
+        my @globals = $exp =~ m'\$\w+'g;
+        my $our = @globals ? ' our (' . join(',',@globals) . ');' : '';
+        my $res = eval "package KeplerAs::KeplerAs::CODE;$our $exp";
+        # a broken expression used to be encoded silently as 0
+        die "Unable to evaluate immediate expression ($orig): $@" if $@;
+        $val = $mul * $res;
+    }
+    elsif ($val =~ m'^0[xX][0-9a-fA-F]+')
+    {
+        # same hex syntax the grammar's $hex pattern accepts (either prefix case)
+        $val = hex($val);
+    }
+
+    if ( $neg )
+    {
+        # two's complement within the field width
+        $val = -$val;
+        $val &= $mask;
+    }
+    if (($val & $mask) != $val)
+    {
+        # $orig is passed as an argument: it may contain '%' (e.g. a modulo expression)
+        die sprintf "Immediate value out of range(0x%x): 0x%x (%s)\n", $mask, $val, $orig;
+    }
+    return $val << $pos;
+}
+# Encode a floating point immediate operand into its instruction bit field.
+#
+#   $val   - operand text: raw hex bit pattern (0x.. or 0X..), INF (any case,
+#            optionally signed), or a decimal float literal
+#   $pos   - number of bits the value is shifted left by
+#   $type  - pack template for the value: 'f' (32-bit) or 'd' (64-bit)
+#   $trunc - optional; when set, the packed bits are shifted right by this
+#            amount and reduced to 19 bits, which also drops the sign bit
+#
+# Returns the encoded bits shifted into place.
+sub getF
+{
+    my ($val, $pos, $type, $trunc) = @_;
+    if ($val =~ m'^0[xX][0-9a-fA-F]+')
+    {
+        # raw bit pattern; accept the same prefix case the grammar patterns do
+        $val = hex($val);
+    }
+    elsif ($val =~ m'INF'i)
+    {
+        my $negInf = $val =~ m'^\-';
+        if ($trunc)
+        {
+            # sign is not encoded here; the $f20/$d20 patterns capture it separately as "neg"
+            $val = $type eq 'f' ? 0x7f800 : 0x7ff00;
+        }
+        else
+        {
+            # full-width form has no separate sign capture, so keep the sign bit
+            $val = 0x7f800000;
+            $val |= 0x80000000 if $negInf;
+        }
+    }
+    else
+    {
+        # reinterpret the packed float/double as an unsigned integer of the same width
+        $val = unpack(($type eq 'f' ? 'L' : 'Q'), pack $type, $val);
+
+        $val = ($val >> $trunc) & 0x7ffff if $trunc;
+    }
+    return $val << $pos;
+}
+# Encode a register operand ("R0".."R254" or "RZ") at bit position $pos.
+# RZ encodes as 0xff. Dies on any other name.
+sub getR
+{
+    my ($val, $pos) = @_;
+    my ($id) = $val =~ m'^R(\d+|Z)$';
+
+    # "Z" compares as 0 numerically, so it passes the range check like the digits do
+    my $valid = defined $id && $id < 255;
+    die "Bad register name found: $val\n" unless $valid;
+
+    my $num = $id eq 'Z' ? 0xff : $id;
+    return $num << $pos;
+}
+# Encode a predicate operand ("P0".."P6" or "PT") at bit position $pos.
+# PT encodes as 7. Dies on any other name.
+sub getP
+{
+    my ($val, $pos) = @_;
+    my ($id) = $val =~ m'^P(\d|T)$';
+
+    # "T" compares as 0 numerically, so it passes the range check like the digits do
+    my $valid = defined $id && $id < 7;
+    die "Bad predicate name found: $val\n" unless $valid;
+
+    my $num = $id eq 'T' ? 7 : $id;
+    return $num << $pos;
+}
+# Encode a hex constant operand: drop the low two bits, keep 14 bits,
+# and place the result at bit 23.
+sub getC
+{
+    my ($text) = @_;
+    my $word = (hex($text) >> 2) & 0x3fff;
+    return $word << 23;
+}
+
+my %operands = # operand name => closure that encodes the operand text via getP/getR/getC/getF/getI
+(
+    p0      => sub { getP($_[0], 2)  }, # keys match the named captures of the operand regexes defined below
+    p3      => sub { getP($_[0], 5)  },
+    p12     => sub { getP($_[0], 14) },
+    p29     => sub { getP($_[0], 32) },
+    p39     => sub { getP($_[0], 42) },
+    p45     => sub { getP($_[0], 48) },
+    p48     => sub { getP($_[0], 51) },
+    p58     => sub { getP($_[0], 58) },
+    r0      => sub { getR($_[0], 2)  },
+    r8      => sub { getR($_[0], 10)  },
+    r20     => sub { getR($_[0], 23) },
+    r28     => sub { getR($_[0], 28) },
+    r39s20  => sub { getR($_[0], 42) },
+    r39     => sub { getR($_[0], 42) },
+    r39a    => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to wipe it out, register must be in sequence with r20
+    c20     => sub { getC($_[0])     },
+    z20     => sub { getC($_[0])     },
+    c39     => sub { getC($_[0])     },
+    c34     => sub { hex($_[0]) << 37 },
+    c36     => sub { hex($_[0]) << 39 },
+    f20w32  => sub { getF($_[0], 23, 'f')        }, # full 32-bit float, no truncation
+    f20     => sub { getF($_[0], 23, 'f', 12)    }, # float bits shifted right by 12 before encoding
+    d20     => sub { getF($_[0], 23, 'd', 44)    }, # double bits shifted right by 44 before encoding
+    i8w4    => sub { getI($_[0], 10,  0xf)        },
+    i20     => sub { getI($_[0], 23, 0x7ffff)    },
+    i20w6   => sub { getI($_[0], 23, 0x3f)       },
+    i20w7   => sub { getI($_[0], 23, 0x7f)       },
+    i20w8   => sub { getI($_[0], 23, 0xff)       },
+    i20w12  => sub { getI($_[0], 23, 0xfff)      },
+    i20w24  => sub { getI($_[0], 23, 0xffffff)   },
+    i20w32  => sub { getI($_[0], 23, 0xffffffff) },
+    i31w4   => sub { getI($_[0], 34, 0xf)        },
+    i34w13  => sub { getI($_[0], 37, 0x1fff)     },
+    i36w20  => sub { getI($_[0], 36, 0xfffff)    },
+    i39w8   => sub { getI($_[0], 42, 0x1f)       }, # NOTE(review): name says w8 but the mask is 5 bits (0x1f) - confirm
+    i28w8   => sub { getI($_[0], 28, 0xff)       },
+    i28w20  => sub { getI($_[0], 31, 0xfffff)    },
+    i48w8   => sub { getI($_[0], 48, 0xff)       },
+    i51w5   => sub { getI($_[0], 51, 0x1f)       },
+    i53w5   => sub { getI($_[0], 53, 0x1f)       },
+    i23w6  => sub { getI($_[0], 23, 0x3f)      },
+);
+
+my $hex     = qr"0[xX][0-9a-fA-F]+"; # hex literal, either prefix case
+my $iAddr   = qr"\d+[xX]<[^>]+>"; # "<mul>x<expr>" form that getI evaluates
+my $immed   = qr"$hex|$iAddr|\d+"o; # any unsigned immediate: hex, <mul>x<expr>, or decimal
+my $reg     = qr"[a-zA-Z_]\w*"; # must start with letter or underscore
+my $p       = qr"P[0-6T]"; # predicate P0-P6 or PT
+my $noPred  = qr"(?<noPred>)"; # empty named capture; used in place of $pred by the SSY, CAL, JCAL, PBK, PCNT and BPT rules
+my $pred    = qr"\@(?<predNot>\!)?P(?<predNum>[0-6]) "; # "@P0 " / "@!P0 " prefix, trailing space included
+my $p0      = qr"(?<p0>$p)"o;
+my $p3      = qr"(?<p3>$p)"o;
+my $p12     = qr"(?<p12not>\!)?(?<p12>$p)"o;
+my $p29     = qr"(?<p29not>\!)?(?<p29>$p)"o;
+my $p39     = qr"(?<p39not>\!)?(?<p39>$p)"o;
+my $p45     = qr"(?<p45>$p)"o;
+my $p48     = qr"(?<p48>$p)"o;
+my $p58     = qr"(?<p58>$p)"o;
+my $r0      = qr"(?<r0>$reg)";
+my $r0cc    = qr"(?<r0>$reg)(?<CC>\.CC)?";
+my $r8      = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse1>\.reuse)?";
+my $r20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse2>\.reuse)?";
+my $r28     = qr"(?<r28>$reg)";
+my $r39s20  = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r39s20>(?<r20>$reg))\|?(?:\.(?<r39part>H0|H1))?(?<reuse2>\.reuse)?"; # register is captured under both r39s20 and r20
+my $r39     = qr"(?<r39neg>\-)?(?<r39>$reg)(?:\.(?<r39part>H0|H1))?(?<reuse3>\.reuse)?";
+my $r39a    = qr"(?<r39a>(?<r39>$reg))(?<reuse3>\.reuse)?"; # register is captured under both r39a and r39
+my $c20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3))?"o; # c[<c34>][<c20>] operand
+my $c20x    = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20partx>H0|H1|B0|B1|B2|B3))?"o; # same as $c20 except the part suffix is captured as r20partx
+my $c20s39  = qr"(?<r39neg>\-)?c\[(?<c34>$hex)\]\s*\[(?<c39>$hex)\]"o;
+my $f20w32  = qr"(?<f20w32>(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))"; # sign stays inside the capture; no separate neg group
+my $f20     = qr"(?<f20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o; # a leading '-' is also captured as neg
+my $d20     = qr"(?<d20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;
+my $i8w4    = qr"(?<i8w4>$immed)"o;
+my $i20     = qr"(?<i20>(?<neg>\-)?$immed)(?<r20neg>\.NEG)?"o;
+my $i20w6   = qr"(?<i20w6>$immed)"o;
+my $i20w7   = qr"(?<i20w7>$immed)"o;
+my $i20w8   = qr"(?<i20w8>$immed)"o;
+my $i20w12  = qr"(?<i20w12>$immed)"o;
+my $i20w24  = qr"(?<i20w24>\-?$immed)"o;
+my $i20w32  = qr"(?<i20w32>\-?$immed)"o;
+my $i39w8   = qr"(?<i39w8>\-?$immed)"o;
+my $i28w8   = qr"(?<i28w8>$immed)"o;
+my $i28w20  = qr"(?<i28w20>\-?$immed)"o;
+my $i31w4   = qr"(?<i31w4>$immed)"o;
+my $i34w13  = qr"(?<i34w13>$immed)"o;
+my $i36w20  = qr"(?<i36w20>$immed)"o;
+my $i48w8   = qr"(?<i48w8>$immed)"o;
+my $i51w5   = qr"(?<i51w5>$immed)"o;
+my $i53w5   = qr"(?<i53w5>$immed)"o;
+my $i23w6   = qr"(?<i23w6>$immed)"o;
+my $ir20    = qr"$i20|$r20"o; # combined forms: any one of the listed operand kinds
+my $cr20    = qr"$c20|$r20"o;
+my $icr20   = qr"$i20|$c20|$r20"o;
+my $fcr20   = qr"$f20|$c20|$r20"o;
+my $cr39    = qr"$c20s39|$r39"o;
+my $dr20    = qr"$d20|$r20"o;
+
+my $u32   = qr"(?<U32>\.U32)?"; # opcode suffix patterns; most are optional and capture the suffix under a named group
+my $REV2B = qr"(?<REV2B>\.REV2B)?";
+my $W     = qr"(?<W>\.W)?";
+my $pnot2d= qr"(?<PNOT2D>\.PNOT2D)?";
+my $ftz   = qr"(?<FTZ>\.FTZ)?";
+my $sat   = qr"(?<SAT>\.SAT)?";
+my $rnd   = qr"(?:\.(?<rnd>RN|RM|RP|RZ))?";
+my $mulf  = qr"(?:\.(?<mulf>D2|D4|D8|M8|M4|M2))?";
+my $condition  = qr"(?:(?<CON>F|LT|EQ|LE|GT|NE|GE|NUM|NAN|LTU|EQU|LEU|GTU|NEU|GEU|OFF|LO|SFF|LS|HI|SFT|HS|OFT))?"; # no leading dot: the rules write "CC\." in front of it
+my $lane2a= qr"(?:\.(?<lane2a>LNONE|L0|L1|L01|L2|L02|L12|L012|L3|L03|L13|L013|L23|L023|L123))?";
+my $lane0e= qr"(?:\.(?<lane0e>LNONE|L0|L1|L01|L2|L02|L12|L012|L3|L03|L13|L013|L23|L023|L123))?";
+
+
+my $round = qr"(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?";
+my $fcmp  = qr"(?<cmp>\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)"; # trailing empty alternative: the comparison may be omitted
+my $icmp  = qr"\.(?<cmp>LT|EQ|LE|GT|NE|GE)"; # mandatory; captured without the leading dot, unlike $fcmp
+my $bool  = qr"\.(?<bool>AND|OR|XOR|PASS_B)";
+my $bool2 = qr"\.(?<bool2>AND|OR|XOR)";
+my $func  = qr"\.(?<func>COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)";
+my $rro   = qr"\.(?<func>SINCOS|EX2)";
+my $add3  = qr"(?:\.(?<type>X|RS|LS))?";
+my $lopz  = qr"(?:\.(?<z>NZ|Z) $p48,|(?<noz>))"o; # either ".NZ Pn," / ".Z Pn," or the empty noz marker
+my $X     = qr"(?<X>\.X)?";
+my $PO     = qr"(?<PO>\.PO)?";
+my $bf     = qr"(?<BF>\.BF)?";
+my $S     = qr"(?<S>\.S)?";
+my $tld   = qr"(?<NODEP>NODEP\.)?(?:(?<reuse1>T)|(?<reuse2>P))";
+my $chnls = qr"(?<chnls>R|RGBA)";
+my $sr    = qr"SR_(?<sr>\S+)";
+my $shf   = qr"(?<W>\.W)?(?:\.(?<type>U64|S64))?(?<HI>\.HI)?";
+my $imad  = qr"(?:\.(?<type1>U32|S32))?(?:\.(?<type2>U32|S32))?(?:\.(?<mode>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?";
+my $imadc = qr"(?:\.(?<type1>U32|S32))?(?:\.(?<type2>U32|S32))?(?:\.(?<modec>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?"; # same as $imad except the mode is captured as modec
+my $imul  = qr"(?:\.(?<type1>U32|S32))?(?:\.(?<type2>U32|S32))?";
+my $vmad8 = qr"\.(?<sign1>[SU])(?<size1>8|16)\.(?<sign2>[SU])(?<size2>8|16)(?<PO>\.PO)?(?<SHR_7>\.SHR_7)?(?<SHR_15>\.SHR_15)?(?<SAT>\.SAT)?";
+my $vmad16= qr"\.(?<sign1>[SU])(?<size1>16)\.(?<sign2>[SU])(?<size2>16)";
+my $hilo  = qr"(?:\.(?<mode>XHI|XLO))?";
+my $hi  = qr"(?:\.(?<mode>HI))?";
+my $vaddType = qr"(?:\.(?<UD>UD))?(?:\.(?<SD>SD))?(?:\.(?<sign1>[SU])(?<size1>8|16|32))?(?:\.(?<sign2>[SU])(?<size2>8|16|32))?";
+my $vaddMode = qr"(?:\.(?<mode>MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?";
+my $vmnmx = qr"(?:\.(?<MX>MX))?";
+my $x2x   = qr"\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)"; # ".<dest type>.<src type>", e.g. ".F32.S32"
+my $prmt  = qr"(?:\.(?<mode>F4E|B4E|RC8|ECL|ECR|RC16))?";
+my $shfl  = qr"\.(?<mode>IDX|UP|DOWN|BFLY)";
+my $bar   = qr"\.(?<mode>SYNC|ARV|RED)(?:\.(?<red>POPC|AND|OR))? (?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?(<r20>)|(?<nor20>))(?(<red>), $p39|(?<nop39>))"o; # NOTE(review): the BAR rules in %grammar spell out each form and do not use this pattern
+my $b2r   = qr"\.RESULT $r0(?:, $p45|(?<nop45>))"o;
+# Operand patterns used by the DEPBAR and MEMBAR rules in %grammar.
+my $dbar  = qr"(?<SB>SB0|SB1|SB2|SB3|SB4|SB5)";
+# Brace-enclosed list form such as " {5,3,0}"; ids are matched in descending order only.
+# The braces are escaped because an unescaped literal "{" in a pattern is
+# deprecated or rejected by newer perls; the text matched is unchanged.
+my $dbar2 = qr" \{(?<db5>5)?,?(?<db4>4)?,?(?<db3>3)?,?(?<db2>2)?,?(?<db1>1)?,?(?<db0>0)?\}";
+my $mbar  = qr"\.(?<mode>CTA|GL|SYS)";
+my $addr  = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i20w24)?\]"o; # "[reg + imm]"; register and offset are each optional, nor8 marks a missing register
+my $addr2 = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i28w20)?\]"o; # same shape as $addr with the offset captured as i28w20
+my $ldc   = qr"c\[(?<c36>$hex)\]\s*$addr"o;
+my $atom  = qr"(?<E>\.E)?(?:\.(?<mode>ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?<type>|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)"; # leading empty alternative: the type suffix may be omitted
+my $vote  = qr"\.(?<mode>ALL|ANY|EQ)"o;
+my $memType  = qr"(?<type>\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)"; # the empty alternative (||) allows no type suffix
+my $memTypeX  = qr"(?<type>\.b32|\.b64|\.b96|\.b128)";
+my $memCache = qr"(?<E>\.E)?(?<U>\.U)?(?:\.(?<cache>CG|CI|CS|CV|IL|WT|LU))?";
+my $ldmemCache = qr"(?<E>\.E)?(?<U>\.U)?(?:\.(?<cache>CG|LU|CV))?"; # cache suffixes accepted by the LDL rule
+my $stmemCache = qr"(?<E>\.E)?(?<U>\.U)?(?:\.(?<cache>CG|CS|WT))?"; # cache suffixes accepted by the STL rule
+
+
+
+
+my $s2rT  = {class => 's2r',   lat => 2,   blat => 25,  rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0}; # per-class attribute tables shared by %grammar entries via "type"; this one: FLO, POPC, S2R
+my $smemT = {class => 'mem',   lat => 2,   blat => 30,  rlat => 2, rhold => 20, tput => 1,   dual => 1, reuse => 0}; # SHFL, LDS, STS
+my $gmemT = {class => 'mem',   lat => 2,   blat => 200, rlat => 4, rhold => 20, tput => 1,   dual => 1, reuse => 0}; # LD/ST/LDG/LDL/STL/LDC/ATOM/RED, the TEX/TLD entries, BAR, DEPBAR
+my $x32T  = {class => 'x32',   lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 1}; # most arithmetic and control-flow entries, and the TODO placeholders
+my $x64T  = {class => 'x64',   lat => 2,   blat => 128, rlat => 0, rhold => 0,  tput => 128, dual => 0, reuse => 1}; # DADD, DFMA, DMUL
+my $shftT = {class => 'shift', lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1}; # e.g. SHL, SHR, SHF, BFE, BFI, ISCADD, FMNMX, ISET
+my $cmpT  = {class => 'cmp',   lat => 13,  blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1}; # e.g. FCMP, ICMP, ISETP, FSETP, PSETP, R2P
+my $qtrT  = {class => 'qtr',   lat => 8,   blat => 0,   rlat => 4, rhold => 0,  tput => 1,   dual => 1, reuse => 0}; # MUFU, F2F, F2I, I2F, I2I, F2ITRUNC
+my $rroT  = {class => 'rro',   lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0}; # RRO
+my $voteT = {class => 'vote',  lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0}; # VOTE
+
+
+# Instruction grammar: opcode name => list of candidate encodings.
+# Each candidate is a hash with
+#   type - one of the per-class attribute tables ($x32T, $cmpT, ...)
+#   code - 64-bit base opcode value
+#   rule - regex for one operand form of the instruction; its named captures
+#          use the same names as the %operands keys and the flag names
+# Entries marked #TODO match any operand text and carry a placeholder code;
+# entries marked #Partial cover only some forms.
+our %grammar =
+(
+    FADD     => [
+    { type => $x32T,  code => 0xe2c0000000000002, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $cr20;"o,               },
+    { type => $x32T,  code => 0xc2c0000000000001, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $f20;"o,               },
+    ],
+    FADD32I  => [ { type => $x32T,  code => 0x4000000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FCHK     => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o,                     } ], #Partial?
+    FCMP     => [
+    { type => $cmpT,  code => 0xdd00000000000002, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $cr20, $r39;"o,            },
+    { type => $cmpT,  code => 0xdd00000000000002, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $r39s20, $c20s39;"o,            },
+    { type => $cmpT,  code => 0xb500000000000001, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $f20, $r39;"o,            },
+    ],
+    FFMA     => [
+                  { type => $x32T,  code => 0xcc00000000000002, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $cr20, $r39;"o,         },
+                  { type => $x32T,  code => 0xcc00000000000002, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o,     },
+                  { type => $x32T,  code => 0x9400000000000001, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $f20, $r39;"o,     },
+                ],
+    FMNMX    => [
+    { type => $shftT, code => 0xe300000000000002, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $cr20, $p39;"o,                },
+    { type => $shftT, code => 0xc300000000000001, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $f20, $p39;"o,                },
+    ],
+    FMUL     => [
+    { type => $x32T,  code => 0xe340000000000002, rule => qr"^$pred?FMUL$ftz$rnd$sat$mulf $r0, $r8, $cr20;"o,               },
+    { type => $x32T,  code => 0xc340000000000001, rule => qr"^$pred?FMUL$ftz$rnd$sat$mulf $r0, $r8, $f20;"o,               },
+    ],
+    FMUL32I  => [ { type => $x32T,  code => 0x2000000000000002, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FSET     => [
+    { type => $shftT, code => 0xc000000000000002, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $cr20, $p39;"o,       },
+    { type => $shftT, code => 0x8000000000000001, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $f20, $p39;"o,       },
+    ],
+    FSETP    => [ { type => $cmpT,  code => 0xdd80000000000002, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ],
+    MUFU     => [ { type => $qtrT,  code => 0x8400000000000002, rule => qr"^$pred?MUFU$func $r0, $r8;"o,                              } ],
+    RRO      => [ { type => $rroT,  code => 0xe480000000000002, rule => qr"^$pred?RRO$rro $r0, $r20;"o,                               } ],
+    DADD     => [
+    { type => $x64T,  code => 0xe380000000000002, rule => qr"^$pred?DADD$rnd $r0, $r8, $cr20;"o,                        },
+    { type => $x64T,  code => 0xc380000000000001, rule => qr"^$pred?DADD$rnd $r0, $r8, $d20;"o,                        },
+    ],
+    # NOTE(review): the $d20 forms of DFMA and DMNMX reuse the $cr20 opcode,
+    # unlike the other immediate forms in this table - confirm.
+    DFMA     => [
+    { type => $x64T,  code => 0xdb80000000000002, rule => qr"^$pred?DFMA$rnd $r0, $r8, $cr20, $r39;"o,                  },
+    { type => $x64T,  code => 0xdb80000000000002, rule => qr"^$pred?DFMA$rnd $r0, $r8, $d20, $r39;"o,                  },
+    ],
+    DMNMX    => [
+    { type => $cmpT,  code => 0xe280000000000002, rule => qr"^$pred?DMNMX $r0, $r8, $cr20, $p39;"o,                     },
+    { type => $cmpT,  code => 0xe280000000000002, rule => qr"^$pred?DMNMX $r0, $r8, $d20, $p39;"o,                     },
+    ],
+    DMUL     => [
+    { type => $x64T,  code => 0xe400000000000002, rule => qr"^$pred?DMUL$rnd $r0, $r8, $cr20;"o,                        },
+    { type => $x64T,  code => 0xc400000000000001, rule => qr"^$pred?DMUL$rnd $r0, $r8, $d20;"o,                        },
+    ],
+    DSET     => [ { type => $cmpT,  code => 0xc800000000000002, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o,            } ],
+    DSETP    => [ { type => $cmpT,  code => 0xdc00000000000002, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o,      } ],
+    FSWZADD  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o,                                    } ], #TODO
+
+    HADD2     => [ { type => $x32T,  code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o,               } ],
+    HMUL2     => [ { type => $x32T,  code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o,               } ],
+    HFMA2     => [ { type => $x32T,  code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o,         } ],
+    HSETP2    => [ { type => $cmpT,  code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial
+
+    BFE       => [
+    { type => $shftT,  code => 0xe008000000000002, rule => qr"^$pred?BFE$u32$REV2B $r0, $r8, $cr20;"o,                          },
+    { type => $shftT,  code => 0xc008000000000001, rule => qr"^$pred?BFE$u32$REV2B $r0, $r8, $ir20;"o,                          },
+    ],
+    BFI       => [
+    { type => $shftT,  code => 0xdf80000000000002, rule => qr"^$pred?BFI$S $r0, $r8, $r20, $cr39;"o,                        },
+    { type => $shftT,  code => 0xb780000000000001, rule => qr"^$pred?BFI$S $r0, $r8, $i20, $cr39;"o,                        },
+    ],
+    FLO       => [ { type => $s2rT,   code => 0xe180000000000002, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o,                              } ],
+    IADD      => [
+    { type => $x32T,   code => 0xe080000000000002, rule => qr"^$pred?IADD$S$PO$sat$X $r0cc, $r8, $cr20;"o,                         },
+    { type => $x32T,   code => 0xc080000000000001, rule => qr"^$pred?IADD$S$PO$sat$X $r0cc, $r8, $i20;"o,                         },
+    ],
+
+    ISUB      => [
+    { type => $x32T,   code => 0xe088000000000002, rule => qr"^$pred?ISUB$sat$X $r0cc, $r8, $cr20;"o,                         },
+    { type => $x32T,   code => 0xc088000000000001, rule => qr"^$pred?ISUB$sat$X $r0cc, $r8, $i20;"o,                         },
+    { type => $x32T,   code => 0xc090000000000001, rule => qr"^$pred?ISUB$sat$X $r0cc, $i20, $r8;"o,                         },
+    ],
+
+
+
+    IADD32I   => [ { type => $x32T,   code => 0x4000000000000001, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o,                         } ],
+    ICMP      => [
+    { type => $cmpT,   code => 0xda08000000000002, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $cr20, $r39;"o,              },
+    { type => $cmpT,   code => 0xda08000000000002, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $r39s20, $c20s39;"o,              },
+    { type => $cmpT,   code => 0xb208000000000001, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $i20, $r39;"o,              },
+    ],
+    IMNMX     => [
+    { type => $shftT,  code => 0xe108000000000002, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $cr20, $p39;"o,                  },
+    { type => $shftT,  code => 0xc108000000000001, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $i20, $p39;"o,                  },
+    ],
+    ISET      => [
+    { type => $shftT,  code => 0xda88000000000002, rule => qr"^$pred?ISET$bf$icmp$u32$X$bool$S $r0, $r8, $cr20, $p39;"o,       },
+    { type => $shftT,  code => 0xb288000000000001, rule => qr"^$pred?ISET$bf$icmp$u32$X$bool$S $r0, $r8, $i20, $p39;"o,       },
+    ],
+    ISETP     => [
+    { type => $cmpT,   code => 0xdb08000000000002, rule => qr"^$pred?ISETP$icmp$u32$X$bool$S $p3, $p0, $r8, $cr20, $p39;"o, },
+    { type => $cmpT,   code => 0xb308000000000001, rule => qr"^$pred?ISETP$icmp$u32$X$bool$S $p3, $p0, $r8, $i20, $p39;"o, },
+    ],
+    ISCADD    => [
+    { type => $shftT,  code => 0xe0c0000000000002, rule => qr"^$pred?ISCADD$X $r0cc, $r8, $cr20, $i39w8;"o,                   },
+    { type => $shftT,  code => 0xc0c0000000000001, rule => qr"^$pred?ISCADD$X $r0cc, $r8, $i20, $i39w8;"o,                   }
+    ],
+    ISCADD32I => [ { type => $shftT,  code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o,               } ],
+
+    LOP       => [
+    { type => $x32T,   code => 0xe200000000000002, rule => qr"^$pred?LOP$bool$S $r0, (?<INV1>~)?$r8, (?<INV>~)?$cr20(?<INV>\.INV)?;"o, },
+    { type => $x32T,   code => 0xc200000000000001, rule => qr"^$pred?LOP$bool$S $r0, (?<INV1>~)?$r8, (?<INV>~)?$i20(?<INV>\.INV)?;"o, },
+    ],
+    LOP32I    => [ { type => $x32T,   code => 0x2000000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o,                     } ],
+    LOP3      => [
+                   { type => $x32T,   code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o,            },
+                   { type => $x32T,   code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o,            },
+                 ],
+    POPC      => [
+    { type => $s2rT,   code => 0xe040000000000002, rule => qr"^$pred?POPC $r0, $r8, $cr20;"o,                                    },
+    { type => $s2rT,   code => 0xc040000000000001, rule => qr"^$pred?POPC $r0, $r8, $i20;"o,                                    },
+    ],
+    SHF       => [
+                   { type => $shftT,  code => 0xdfc0000000000002, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $r20, $r39;"o,                  },
+                   { type => $shftT,  code => 0xb7c0000000000001, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $i20, $r39;"o,                  },
+                   { type => $shftT,  code => 0xe7c0000000000002, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $r20, $r39;"o,                  },
+                   { type => $shftT,  code => 0xc7c0000000000001, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $i20, $r39;"o,                  },
+                 ],
+    SHL       => [
+    { type => $shftT,  code => 0xe240000000000002, rule => qr"^$pred?SHL(?<W>\.W)? $r0, $r8, $cr20;"o,                    },
+    { type => $shftT,  code => 0xc240000000000001, rule => qr"^$pred?SHL(?<W>\.W)? $r0, $r8, $i23w6;"o,                    },
+    ],
+    SHR       => [
+    { type => $shftT,  code => 0xe148000000000002, rule => qr"^$pred?SHR$u32$W $r0, $r8, $cr20;"o,                          },
+    { type => $shftT,  code => 0xc148000000000001, rule => qr"^$pred?SHR$u32$W $r0, $r8, $i23w6;"o,                          },
+    ],
+    IMAD      => [
+                   { type => $x32T,   code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $r20, $r39;"o,                 },
+                   { type => $x32T,   code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $r39s20, $c20s39;"o,            },
+                   { type => $x32T,   code => 0xd108000000000002, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $c20x, $r39;"o,                  },
+                   { type => $x32T,   code => 0xa108000000000001, rule => qr"^$pred?IMAD$imad$hi$X$S $r0cc, $r8, $i20, $r39;"o,                  },
+                 ],
+    IMADSP    => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO
+    IMUL      => [
+    { type => $x32T,   code => 0xe1c0180000000002, rule => qr"^$pred?IMUL$imul$hi $r0, $r8, $cr20;"o,   },
+    { type => $x32T,   code => 0xc1c0180000000001, rule => qr"^$pred?IMUL$imul$hi $r0, $r8, $i20;"o,   },
+    ],
+    IMUL32I      => [
+    { type => $x32T,   code => 0x2e00000000000002, rule => qr"^$pred?IMUL32I$imul$hi $r0, $r8, $i20w32;"o,   },
+    ],
+
+    F2F => [ { type => $qtrT,  code => 0xe540000000000002, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ],
+    F2I => [ { type => $qtrT,  code => 0xe580000000000002, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o,         } ],
+    I2F => [ { type => $qtrT,  code => 0xe5c0000000000002, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o,               } ],
+    I2I => [ { type => $qtrT,  code => 0xe600000000000002, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o,               } ],
+    F2ITRUNC => [ { type => $qtrT,  code => 0xe5800c00051ca846, rule => qr"^$pred?F2ITRUNC[^;]*;"o,               } ],
+
+    MOV    => [ { type => $x32T,  code => 0xe4c03c0000000002, rule => qr"^$pred?MOV$lane2a$S $r0, $cr20;"o,                   } ],
+    MOV32I => [ { type => $x32T,  code => 0x740000000003c002, rule => qr"^$pred?MOV32I$lane0e$S $r0, (?:$i20w32|$f20w32);"o,   } ],
+    PRMT   => [
+    { type => $x32T,  code => 0xde00000000000002, rule => qr"^$pred?PRMT$prmt $r0, $r8, $cr20, $cr39;"o, },
+    { type => $x32T,  code => 0xb600000000000001, rule => qr"^$pred?PRMT$prmt $r0, $r8, $i20, $r39;"o, },
+    ],
+    SEL    => [
+    { type => $x32T,  code => 0xe500000000000002, rule => qr"^$pred?SEL $r0, $r8, $cr20, $p39;"o,        },
+    { type => $x32T,  code => 0xc500000000000001, rule => qr"^$pred?SEL $r0, $r8, $i20, $p39;"o,        },
+    ],
+    SHFL   => [ { type => $smemT, code => 0x7880000000000002, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ],
+
+    PSET   => [ { type => $cmpT,  code => 0x8440000000000002, rule => qr"^$pred?PSET$bf$bool2$bool $r0, $p12, $p29, $p39;"o,       } ],
+    PSETP  => [ { type => $cmpT,  code => 0x8480000000000002, rule => qr"^$pred?PSETP$bool2$bool$S $p3, $p0, $p12, $p29, $p39;"o, } ],
+    CSET   => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o,  } ], #TODO
+    CSETP  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO
+    P2R    => [ { type => $x32T,  code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o,   } ],
+    R2P    => [ { type => $cmpT,  code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o,   } ],
+
+    TLD    => [ { type => $gmemT, code => 0x700a00067f9ffc02, rule => qr"^$pred?TLD[^;]*;"o, } ], #Partial
+    TLDzxx    => [ { type => $gmemT, code => 0x700a00057f9ffc02, rule => qr"^$pred?TLDzxx[^;]*;"o, } ], #Partial
+    TEXDEPBAR    => [ { type => $gmemT, code => 0x77000000001c0002, rule => qr"^$pred?TEXDEPBAR $i20w6;"o, } ], #Partial
+    TEX    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o,   } ], #TODO
+    TLD4   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o,  } ], #TODO
+    TXQ    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o,   } ], #TODO
+
+    LD     => [ { type => $gmemT, code => 0xc000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr;"o,      } ],
+    LDY     => [ { type => $gmemT, code => 0x7f80000000000002, rule => qr"^$pred?LDY $r0, $i20;"o,      } ],
+    LDX     => [ { type => $gmemT, code => 0x7ec0000000000002, rule => qr"^$pred?LDX$memTypeX $r0, $addr;"o,      } ],
+    ST     => [ { type => $gmemT, code => 0xe000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0;"o,      } ],
+    LDG    => [
+    { type => $gmemT, code => 0x600010047f800001, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o,           },
+    ],
+    LDS    => [ { type => $smemT, code => 0x7a40000000000002, rule => qr"^$pred?LDS$memCache$memType$S $r0, $addr;"o,           } ],
+    STS    => [ { type => $smemT, code => 0x7ac0000000000002, rule => qr"^$pred?STS$memCache$memType$S $addr, $r0;"o,           } ],
+    LDL    => [ { type => $gmemT, code => 0x7a00000000000002, rule => qr"^$pred?LDL$ldmemCache$memType$S $r0, $addr;"o,           } ],
+    STL    => [ { type => $gmemT, code => 0x7a80000000000002, rule => qr"^$pred?STL$stmemCache$memType$S $addr, $r0;"o,           } ],
+    LDC    => [ { type => $gmemT, code => 0x7c800000000ffc02, rule => qr"^$pred?LDC$memCache$memType$S $r0, $ldc;"o,            } ],
+    ATOM   => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o,   } ],
+    RED    => [ { type => $gmemT, code => 0x68000000000003fe, rule => qr"^$pred?RED$atom $addr2, $r20;"o,                      } ],
+    CCTL   => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o,  } ], #TODO
+    CCTLL  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO
+
+    SULD   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o,   } ], #TODO
+
+    BRA    => [
+                { type => $x32T, code => 0x120000000000003c, rule => qr"^$pred?BRA(?<U>\.U)? $i20w24;"o,         },
+                { type => $x32T, code => 0x1200000000000000, rule => qr"^$pred?BRA(?<U>\.U)? CC\.$condition, $i20w24;"o,         },
+              ],
+
+    BRX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o,                      } ], #TODO
+    JMP    => [
+    { type => $x32T, code => 0x108000000000003c, rule => qr"^$pred?JMP(?<U>\.U)? $i20w32;"o,         },
+    { type => $x32T, code => 0x1080000000000000, rule => qr"^$pred?JMP(?<U>\.U)? CC\.$condition, $i20w32;"o,         },
+    ],
+    JMX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o,                      } ], #TODO
+    SSY    => [ { type => $x32T, code => 0x1480000000000000, rule => qr"^$noPred?SSY $i20w24;"o,                 } ],
+
+    CAL    => [ { type => $x32T, code => 0x1300000000000100, rule => qr"^$noPred?CAL $i20w24;"o,                 } ],
+    JCAL   => [ { type => $x32T, code => 0x1100000000000100, rule => qr"^$noPred?JCAL $i20w32;"o,                } ],
+    PRET   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o,                     } ], #TODO
+    RET    => [
+    { type => $x32T, code => 0x190000000000003c, rule => qr"^$pred?RET;"o,                           },
+    { type => $x32T, code => 0x1900000000000000, rule => qr"^$pred?RET CC\.$condition;"o,                           },
+    ],
+    BRK    => [ { type => $x32T, code => 0x1a0000000000003c, rule => qr"^$pred?BRK;"o,                           } ],
+    PBK    => [ { type => $x32T, code => 0x1500000000000000, rule => qr"^$noPred?PBK $i20w24;"o,                 } ],
+    CONT   => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o,                          } ],
+    PCNT   => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o,                } ],
+    EXIT   => [
+    { type => $x32T, code => 0x18000000001c003c, rule => qr"^$pred?EXIT;"o,                          },
+    { type => $x32T, code => 0x18000000001c0000, rule => qr"^$pred?EXIT CC\.$condition;"o,                          },
+    ],
+    BPT    => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o,           } ],
+
+    NOP    => [ { type => $x32T,  code => 0x8580000000003c02, rule => qr"^$pred?NOP$S;"o,                                     } ],
+    S2R    => [ { type => $s2rT,  code => 0x8640000000000002, rule => qr"^$pred?S2R$S $r0, $sr;"o,                            } ],
+    B2R    => [ { type => $x32T,  code => 0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o,                                 } ],
+    # One rule per operand form; the "." is escaped so only a literal dot matches.
+    BAR    => [
+    { type => $gmemT, code => 0x8540dc0000000002, rule => qr"^$pred?BAR\.SYNC $i8w4;"o,                                 },
+    { type => $gmemT, code => 0x8540dc0000000002, rule => qr"^$pred?BAR\.SYNC $i8w4, $i20w12;"o,                                 },
+    { type => $gmemT, code => 0x85409c0000000002, rule => qr"^$pred?BAR\.SYNC $i8w4, $r20;"o,                                 },
+    { type => $gmemT, code => 0x85405c0000000002, rule => qr"^$pred?BAR\.SYNC $r8;"o,                                 },
+    { type => $gmemT, code => 0x85405c0000000002, rule => qr"^$pred?BAR\.SYNC $r8, $i20w12;"o,                                 },
+    { type => $gmemT, code => 0x85401c0000000002, rule => qr"^$pred?BAR\.SYNC $r8, $r20;"o,                                 },
+    { type => $gmemT, code => 0x8540dc0800000002, rule => qr"^$pred?BAR\.ARV $i8w4, $i20w12;"o,                                 },
+    { type => $gmemT, code => 0x85409c0800000002, rule => qr"^$pred?BAR\.ARV $i8w4, $r20;"o,                                 },
+    { type => $gmemT, code => 0x85405c0800000002, rule => qr"^$pred?BAR\.ARV $r8, $i20w12;"o,                                 },
+    { type => $gmemT, code => 0x85401c0800000002, rule => qr"^$pred?BAR\.ARV $r8, $r20;"o,                                 },
+    ],
+    DEPBAR => [
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, },
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o,              },
+              ],
+    MEMBAR => [ { type => $x32T,  code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o,                             } ],
+
+    VOTE   => [
+    { type => $voteT, code => 0x86c0000000000002, rule => qr"^$pred?VOTE$vote (?:$r0, |(?<nor0>))$p45, $p39;"o, } ],
+
+
+    VADD   => [   { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMAD   => [
+                  { type => $x32T,  code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $i20, $r39;"o, },
+                  { type => $x32T,  code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, },
+                  { type => $shftT, code => 0xf800000000000002, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, },
+              ],
+    VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMNMX    => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+
+    VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+);
+
+# Source text of the flag table, split into its non-blank lines.
+# Lines that do not start with "0x" are headers: a comma-separated list of op
+# names, optionally followed by ": <group>" naming a sub-table.
+# Lines of the form "0x<value> <name>" (the name may be empty) give the value
+# recorded for <name> under every op of the most recent header.
+my @flags = grep /\S/, split "\n", q{;
+
+BFE, BFI, FLO, IADD, ISUB, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD
+0x0800000000000000 neg
+
+FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP
+0x0800000000000000 neg
+
+PSET, PSETP
+0x0000000000020000 p12not
+0x0000000800000000 p29not
+
+FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE
+0x0000200000000000 p39not
+
+IADD32I
+0x0010000000000000 CC
+
+IMAD, PSET, FSET, DSET, ISET, IADD, ISUB, IMUL, ISCADD
+0x0004000000000000 CC
+
+IMAD: mode
+0x0200000000000000 HI
+
+IMAD
+0x0010000000000000 X
+
+IMUL: mode
+0x0000040000000000 HI
+
+IMUL32I: mode
+0x0100000000000000 HI
+
+FFMA, FADD, FCMP, FMUL, FMNMX,  FSWZ, FSET, FSETP,  FCHK, RRO,  MUFU, DFMA, DADD, DMUL, DMNMX,  DSET, DSETP,  IMAD, IMADSP, IMUL, IADD, ISCADD, ISAD, IMNMX,  BFE,  BFI,  SHR,  SHL,  SHF,  LOP,  FLO,  ISET, ISETP,  ICMP, POPC, F2F,  F2I,  I2F,  I2I,  MOV, MOV32I, SEL,  PRMT, SHFL, P2R,  R2P,  CSET, CSETP,  PSET, PSETP,  TEX,  TLD,  TLD4, TXQ,  LDC,  LD, LDG,  LDL,  LDS,  LDSLK,  ST, STL,  STS,  STSCUL, ATOM, RED,  CCTL, CCTLL,  MEMBAR, SUCLAMP,  SUBFM,  SUEAU,  SULDGA, SUSTGA, BRA,  BRX,  RET,  BRK,  CONT, NOP,  S2R,  B2R,  BAR,  VOTE, MOV
+0x0000000000400000 S
+
+SHF
+0x0020000000000000 W
+0x0001000000000000 HI
+
+SHF: type
+0x0000020000000000 U64
+0x0000010000000000 S64
+
+IMAD, ICMP, ISET, ISETP, ISAD, SHR, IMNMX, FLO, BFE
+0x0008000000000000 U32
+
+SHR, SHL
+0x0000040000000000 W
+
+SHFL
+0x0000000080000000 i20w8
+0x0000000100000000 i34w13
+
+SHFL: mode
+0x0000000000000000 IDX
+0x0000000200000000 UP
+0x0000000300000000 DOWN
+0x0000000600000000 BFLY
+
+IMNMX: mode
+0x0000080000000000 XLO
+0x0000180000000000 XHI
+
+ISETP, ISET, ICMP: cmp
+0x0010000000000000 LT
+0x0020000000000000 EQ
+0x0030000000000000 LE
+0x0040000000000000 GT
+0x0050000000000000 NE
+0x0060000000000000 GE
+
+ISETP, ISET, PSETP, PSET, FSET, FSETP, DSET, DSETP: bool
+0x0000000000000000 AND
+0x0001000000000000 OR
+0x0002000000000000 XOR
+
+PSETP, PSET: bool2
+0x0000000000000000 AND
+0x0000000008000000 OR
+0x0000000010000000 XOR
+
+ISETP, ISET, IADD, ISUB
+0x0000400000000000 X
+
+ISCADD
+0x0020000000000000 X
+
+ISET, PSET
+0x0000800000000000 BF
+
+LOP: bool
+0x0000000000000000 AND
+0x0000100000000000 OR
+0x0000200000000000 XOR
+0x0000300000000000 PASS_B
+
+LOP, POPC, FLO
+0x0000080000000000 INV
+
+LOP, POPC, IADD, ISUB
+0x0000040000000000 INV1
+
+LOP: z
+0x0000200000000000 Z
+0x0000300000000000 NZ
+
+LOP
+0x0000000000000000 noz
+
+LOP32I: bool
+0x0000000000000000 AND
+0x0020000000000000 OR
+0x0040000000000000 XOR
+
+PRMT: mode
+0x0008000000000000 F4E
+0x0010000000000000 B4E
+0x0018000000000000 RC8
+0x0020000000000000 ECL
+0x0028000000000000 ECR
+0x0030000000000000 RC16
+
+IMAD: type1
+0x0008000000000000 U32
+0x0008000000000000 S32
+
+IMAD: type2
+0x0100000000000000 U32
+0x0100000000000000 S32
+
+IMUL: type1
+0x0000080000000000 U32
+0x0000000000000000 S32
+
+IMUL: type2
+0x0000100000000000 U32
+0x0000000000000000 S32
+
+IMUL32I: type1
+0x0200000000000000 U32
+0x0000000000000000 S32
+
+IMUL32I: type2
+0x0400000000000000 U32
+0x0000000000000000 S32
+
+XMAD: type1
+0x0000000000000000 U16
+0x0001000000000000 S16
+
+XMAD: type2
+0x0000000000000000 U16
+0x0002000000000000 S16
+
+XMAD: mode
+0x0000002000000000 MRG
+0x0000001000000000 PSL
+0x0008000000000000 CHI
+0x0004000000000000 CLO
+0x000c000000000000 CSFU
+
+XMAD: modec
+0x0004000000000000 CLO
+0x0008000000000000 CHI
+0x000c000000000000 CSFU
+0x0040000000000000 X
+0x0080000000000000 PSL
+0x0100000000000000 MRG
+
+XMAD
+0x0010000000000000 CBCC
+
+XMAD: r8part
+0x0000000000000000 H0
+0x0020000000000000 H1
+
+XMAD: r20part
+0x0000000000000000 H0
+0x0000000800000000 H1
+
+XMAD: r20partx
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+XMAD: r39part
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part
+0x0000000000000000 B0
+0x0000001000000000 B1
+0x0000002000000000 B2
+0x0000003000000000 B3
+0x0000001000000000 H1
+0x0000000000000000 H0
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part
+0x0000000000000000 B0
+0x0000000010000000 B1
+0x0000000020000000 B2
+0x0000000030000000 B3
+0x0000000010000000 H1
+0x0000000000000000 H0
+
+VMAD
+0x0040000000000000 r8neg
+0x0020000000000000 r39neg
+0x0008000000000000 SHR_7
+0x0010000000000000 SHR_15
+0x0060000000000000 PO
+0x0080000000000000 SAT
+
+VMNMX
+0x0100000000000000 MX
+
+VADD, VABSDIFF, VMNMX
+0x0080000000000000 SAT
+0x0040000000000000 UD
+0x0040000000000000 SD
+
+VSET: cmp
+0x0040000000000000 LT
+0x0080000000000000 EQ
+0x00c0000000000000 LE
+0x0100000000000000 GT
+0x0140000000000000 NE
+0x0180000000000000 GE
+
+VADD, VSET: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VABSDIFF: mode
+0x0003000000000000 ACC
+0x000b000000000000 MIN
+0x0013000000000000 MAX
+0x0023000000000000 MRG_16H
+0x002b000000000000 MRG_16L
+0x0033000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x003b000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMNMX: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1
+0x0000000000000000 U
+0x0004000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign2
+0x0000000000000000 U
+0x0008000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size1
+0x0000000000000000 8
+0x0000004000000000 16
+0x0000006000000000 32
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size2
+0x0000000000000000 8
+0x0000000000000000 16
+0x0000000000000000 32
+
+IADD3: type
+0x0001000000000000 X
+0x0000002000000000 RS
+0x0000004000000000 LS
+
+IADD3: r8part
+0x0000000000000000 H0
+0x0000001000000000 H1
+
+IADD3: r20part
+0x0000000080000000 H0
+
+IADD3: r39part
+0x0000000200000000 H0
+
+IADD3
+0x0008000000000000 r8neg
+0x0004000000000000 r20neg
+0x0002000000000000 r39neg
+
+IADD, ISUB, ISCADD
+0x0010000000000000 r8neg
+0x0008000000000000 r20neg
+0x0018000000000000 PO
+
+IADD32I
+0x0100000000000000 X
+0x0800000000000000 r8neg
+
+IMAD
+0x0080000000000000 r8neg
+
+IMAD
+0x0040000000000000 r39neg
+
+DEPBAR: SB
+0x0000000000000000 SB0
+0x0000000004000000 SB1
+0x0000000008000000 SB2
+0x000000000c000000 SB3
+0x0000000010000000 SB4
+0x0000000014000000 SB5
+
+DEPBAR: cmp
+0x0000000020000000 LE
+
+DEPBAR
+0x0000000000000001 db0
+0x0000000000000002 db1
+0x0000000000000004 db2
+0x0000000000000008 db3
+0x0000000000000010 db4
+0x0000000000000020 db5
+
+F2F, F2I, I2F, I2I: destWidth
+0x0000000000000000 8
+0x0000000000000400 16
+0x0000000000000800 32
+0x0000000000000c00 64
+
+F2F, F2I, I2F, I2I: srcWidth
+0x0000000000000000 8
+0x0000000000001000 16
+0x0000000000002000 32
+0x0000000000003000 64
+
+F2F, F2I, I2F, I2I: destSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000008000 S
+
+F2F, F2I, I2F, I2I: srcSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000008000 S
+
+F2I, I2F, I2I: r20part
+0x0000000000000000 H0
+0x0000040000000000 H1
+0x0000000000000000 B0
+0x0000020000000000 B1
+0x0000040000000000 B2
+0x0000060000000000 B3
+
+F2F: r20part
+0x0000000000000000 H0
+0x0000020000000000 H1
+
+F2F: round
+0x0000040000000000 ROUND
+0x0000048000000000 FLOOR
+0x0000050000000000 CEIL
+0x0000058000000000 TRUNC
+
+F2I: round
+0x0000000000000000 ROUND
+0x0000040000000000 FLOOR
+0x0000080000000000 CEIL
+0x00000c0000000000 TRUNC
+
+HADD2, HMUL2: r8part
+0x0001000000000000 H0_H0
+0x0000000000000000 H1_H1
+
+HFMA2: r20part
+0x0000000020000000 H0_H0
+0x0000000030000000 H1_H1
+
+FADD, DADD, FMUL, DMUL, F2F, I2F: rnd
+0x0000000000000000 RN
+0x0000040000000000 RM
+0x0000080000000000 RP
+0x00000c0000000000 RZ
+
+FMUL: mulf
+0x0000100000000000 D2
+0x0000200000000000 D4
+0x0000300000000000 D8
+0x0000400000000000 M8
+0x0000500000000000 M4
+0x0000600000000000 M2
+
+BRA, JMP, RET, EXIT: CON
+0x0000000000000000 F
+0x0000000000000004 LT
+0x0000000000000008 EQ
+0x000000000000000c LE
+0x0000000000000010 GT
+0x0000000000000014 NE
+0x0000000000000018 GE
+0x000000000000001c NUM
+0x0000000000000020 NAN
+0x0000000000000024 LTU
+0x0000000000000028 EQU
+0x000000000000002c LEU
+0x0000000000000030 GTU
+0x0000000000000034 NEU
+0x0000000000000038 GEU
+0x0000000000000040 OFF
+0x0000000000000044 LO
+0x0000000000000048 SFF
+0x000000000000004c LS
+0x0000000000000050 HI
+0x0000000000000054 SFT
+0x0000000000000058 HS
+0x000000000000005c OFT
+
+MOV: lane2a
+0x0000380000000000 LNONE
+0x0000340000000000 L0
+0x0000300000000000 L1
+0x00002c0000000000 L01
+0x0000280000000000 L2
+0x0000240000000000 L02
+0x0000200000000000 L12
+0x00001c0000000000 L3
+0x0000180000000000 L03
+0x0000140000000000 L13
+0x0000100000000000 L013
+0x00000c0000000000 L23
+0x0000080000000000 L023
+0x0000040000000000 L123
+
+MOV32I: lane0e
+0x0000000000038000 LNONE
+0x0000000000034000 L0
+0x0000000000030000 L1
+0x000000000002c000 L01
+0x0000000000028000 L2
+0x0000000000024000 L02
+0x0000000000020000 L12
+0x000000000001c000 L3
+0x0000000000018000 L03
+0x0000000000014000 L13
+0x0000000000010000 L013
+0x000000000000c000 L23
+0x0000000000008000 L023
+0x0000000000004000 L123
+
+DFMA: rnd
+0x0000000000000000 RN
+0x0004000000000000 RM
+0x0008000000000000 RP
+0x000c000000000000 RZ
+
+FFMA: rnd
+0x0000000000000000 RN
+0x0040000000000000 RM
+0x0080000000000000 RP
+0x00c0000000000000 RZ
+
+FFMA, FMUL32I
+0x0100000000000000 FTZ
+
+F2F, F2I, FADD, FMUL, FMNMX
+0x0000800000000000 FTZ
+
+FADD32I
+0x0080000000000000 FTZ
+
+FMUL32I
+0x0020000000000000 FTZ
+
+FSET, FSETP, FCMP, DSET, DSETP
+0x0400000000000000 FTZ
+
+HADD2, HMUL2
+0x0000008000000000 FTZ
+
+HFMA2
+0x0000002000000000 FTZ
+
+FADD, FFMA, FMUL, F2F, I2I, MUFU, IMAD, IADD, ISUB
+0x0020000000000000 SAT
+
+FADD, DADD, FMNMX, DMNMX, MUFU, FFMA, DFMA, FMUL, DADD, DMUL
+0x0008000000000000 r8neg
+
+FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I
+0x0001000000000000 r20neg
+
+FMUL, DMUL, FFMA, DFMA
+0x0001000000000000 r20neg
+
+FFMA, DFMA
+0x0010000000000000 r39neg
+
+FADD, DADD, FMNMX, DMNMX, MUFU
+0x0002000000000000 r8abs
+
+FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I
+0x0010000000000000 r20abs
+
+FSETP, DSETP, FSET, DSET
+0x0000400000000000 r8neg
+0x0100000000000000 r20neg
+0x0200000000000000 r8abs
+0x0000800000000000 r20abs
+
+RRO: func
+0x0000000000000000 SINCOS
+0x0000040000000000 EX2
+
+MUFU: func
+0x0000000000000000 COS
+0x0000000000800000 SIN
+0x0000000001000000 EX2
+0x0000000001800000 LG2
+0x0000000002000000 RCP
+0x0000000002800000 RSQ
+0x0000000003000000 RCP64H
+0x0000000003800000 RSQ64H
+
+FSETP, DSETP, FSET, DSET, FCMP: cmp
+0x0008000000000000 .LT
+0x0010000000000000 .EQ
+0x0018000000000000 .LE
+0x0020000000000000 .GT
+0x0020000000000000
+0x0028000000000000 .NE
+0x0030000000000000 .GE
+0x0038000000000000 .NUM
+0x0040000000000000 .NAN
+0x0048000000000000 .LTU
+0x0050000000000000 .EQU
+0x0058000000000000 .LEU
+0x0060000000000000 .GTU
+0x0068000000000000 .NEU
+0x0070000000000000 .GEU
+
+FSETP, DSETP, FSET, DSET: bool
+0x0000000000000000 AND
+0x0001000000000000 OR
+0x0002000000000000 XOR
+
+HSETP2: cmp
+0x0000002800000000 .NE
+
+HSETP2: bool
+0x0000000000000000 AND
+
+S2R: sr
+0x0000000000000000  LANEID
+0x0000000001000000  VIRTCFG
+0x0000000001800000  VIRTID
+0x0000000002000000  PM0
+0x0000000002800000  PM1
+0x0000000003000000  PM2
+0x0000000003800000  PM3
+0x0000000004000000  PM4
+0x0000000004800000  PM5
+0x0000000005000000  PM6
+0x0000000005800000  PM7
+0x0000000008000000  PRIM_TYPE
+0x0000000008800000  INVOCATION_ID
+0x0000000009000000  Y_DIRECTION
+0x0000000010000000  TID
+0x0000000010800000  TID.X
+0x0000000011000000  TID.Y
+0x0000000011800000  TID.Z
+0x0000000012000000  CTA_PARAM
+0x0000000012800000  CTAID.X
+0x0000000013000000  CTAID.Y
+0x0000000013800000  CTAID.Z
+0x0000000014000000  NTID
+0x0000000014800000  CirQueueIncrMinusOne
+0x0000000015000000  NLATC
+0x0000000015800000  43
+0x0000000016000000  44
+0x0000000016800000  45
+0x0000000017000000  46
+0x0000000017800000  47
+0x0000000018000000  SWINLO
+0x0000000018800000  SWINSZ
+0x0000000019000000  SMEMSZ
+0x0000000019800000  SMEMBANKS
+0x000000001a000000  LWINLO
+0x000000001a800000  LWINSZ
+0x000000001b000000  LMEMLOSZ
+0x000000001b800000  LMEMHIOFF
+0x000000001c000000  EQMASK
+0x000000001c800000  LTMASK
+0x000000001d000000  LEMASK
+0x000000001d800000  GTMASK
+0x000000001e000000  GEMASK
+0x0000000020000000  GLOBALERRORSTATUS
+0x0000000021000000  WARPERRORSTATUS
+0x0000000028000000  CLOCKLO
+0x0000000029000000  GLOBALTIMERLO
+0x0000000029800000  GLOBALTIMERHI
+
+CS2R: sr
+0x0000000005000000 CLOCKLO
+0x0000000005100000 CLOCKHI
+0x0000000005200000 GLOBALTIMERLO
+0x0000000005300000 GLOBALTIMERHI
+
+B2R
+0x0000e00000000000 nop45
+
+BAR: red
+0x0000000000000000 POPC
+0x0000000800000000 AND
+0x0000001000000000 OR
+
+MEMBAR: mode
+0x0000000000000000 CTA
+0x0000000000000100 GL
+0x0000000000000200 SYS
+
+VOTE: mode
+0x0000000000000000 ALL
+0x0008000000000000 ANY
+0x0010000000000000 EQ
+
+VOTE
+0x00000000000003fc nor0
+
+BRA
+0x0000000000000200 U
+
+TLDS: chnls
+0x0010000000000000 RGBA
+
+TLDS
+0x0002000000000000 NODEP
+
+LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS
+0x0000000000000000 nor8
+
+LD, ST: type
+0x0000000000000000 .U8
+0x0100000000000000 .S8
+0x0200000000000000 .U16
+0x0300000000000000 .S16
+0x0400000000000000
+0x0400000000000000 .32
+0x0500000000000000 .64
+0x0600000000000000 .128
+
+LDX: type
+0x0000000000000000 .b32
+0x0004000000000000 .b64
+0x0008000000000000 .b96
+0x000c000000000000 .b128
+
+LD, ST: cache
+0x0000000000000000 CG
+0x1000000000000000 CS
+0x1800000000000000 CV
+0x1800000000000000 WT
+
+STG, LDS, STS, LDL, STL, LDC: type
+0x0000000000000000 .U8
+0x0008000000000000 .S8
+0x0010000000000000 .U16
+0x0018000000000000 .S16
+0x0020000000000000
+0x0020000000000000 .32
+0x0028000000000000 .64
+0x0030000000000000 .128
+
+LDG: type
+0x0000000000000000 .U8
+0x0000800000000000 .S8
+0x0001000000000000 .U16
+0x0001800000000000 .S16
+0x0002000000000000
+0x0002000000000000 .32
+0x0002800800000000 .64
+0x0003003800000000 .128
+
+LDG, STG: cache
+0x0000000000000000 CG
+0x0000000000000000 CI
+0x0000040000000000 CS
+0x0000000000000000 CV
+0x0000000000000000 WT
+
+LDG
+0x0000008000000000 E
+
+LDL: cache
+0x0000200000000000 CI
+
+LDL, STL: cache
+0x0000800000000000 CG
+0x0001000000000000 LU
+0x0001800000000000 CV
+0x0001800000000000 WT
+
+LDC: cache
+0x0000100000000000 IL
+
+STG, LDS, STS, LDL, STL, LDC
+0x0000200000000000 E
+
+LDS
+0x0008000000000000 U
+
+RED: type
+0x0000000000000000
+0x0010000000000000 .S32
+0x0020000000000000 .U64
+0x0030000000000000 .F32.FTZ.RN
+0x0040000000000000 .F16x2.FTZ.RN
+0x0050000000000000 .S64
+
+RED: mode
+0x0000000000000000 ADD
+0x0080000000000000 MIN
+0x0100000000000000 MAX
+0x0180000000000000 INC
+0x0200000000000000 DEC
+0x0280000000000000 AND
+0x0300000000000000 OR
+0x0380000000000000 XOR
+
+ATOM: type
+0x0000000000000000
+0x0002000000000000 .S32
+0x0004000000000000 .U64
+0x0006000000000000 .F32.FTZ.RN
+0x0008000000000000 .F16x2.FTZ.RN
+0x000a000000000000 .S64
+0x0002000000000000 .64
+
+ATOM, RED
+0x0008000000000000 E
+
+LD, ST
+0x0080000000000000 E
+
+ATOM: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x03f0000000000000 CAS
+
+ATOMS: type
+0x0000000000000000
+0x0000000010000000 .S32
+0x0000000020000000 .U64
+0x0000000030000000 .S64
+0x0010000000000000 .64
+
+ATOMS: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x0240000000000000 CAS
+
+BFE:REV2B
+0x0000080000000000 REV2B
+};
+
+# Build %flags from the @flags lines.
+#   header line : "OP1, OP2[: group]" -> remembers the op list and optional group
+#   value line  : "0x... name"        -> stored as $flags{op}{group}{name} when the
+#                                        header named a group, else $flags{op}{name}
+our %flags;
+my (@ops, $flag);
+foreach my $line (@flags)
+{
+    unless ($line =~ m'^(0x[0-9a-z]+)\s*(.*)')
+    {
+        # Header line: switch to a new op list / group.
+        my ($opList, $group) = split ':\s*', $line;
+        @ops  = split ',\s*', $opList;
+        $flag = $group;
+        next;
+    }
+
+    my ($val, $name) = (hex($1), $2);
+    foreach my $op (@ops)
+    {
+        if ($flag)
+        {
+            $flags{$op}{$flag}{$name} = $val;
+        }
+        else
+        {
+            $flags{$op}{$name} = $val;
+        }
+    }
+}
+
+# Match $inst against the rule of one grammar entry.
+# Returns a hash ref holding a copy of the named captures on success,
+# and nothing (empty list / undef) when the rule does not match.
+sub parseInstruct
+{
+    my ($inst, $grammar) = @_;
+
+    if ($inst =~ $grammar->{rule})
+    {
+        # Copy %+ now: it is only valid until the next successful match.
+        my %captures = %+;
+        return \%captures;
+    }
+    return;
+}
+
+# Set of the capture names i20, f20 and d20 (presumably the immediate-operand
+# captures; the table is not referenced in this part of the file).
+my %immedOps = (i20 => 1, f20 => 1, d20 => 1);
+
+# Numeric code translation table (not referenced in this part of the file).
+my %immedCodes = (0x5c => 0x64, 0x5b => 0x6d, 0x59 => 0x6b, 0x58 => 0x68);
+
+# Value genCode XORs into the opcode at bit 62 when a capture with this name is present.
+my %constCodes = (c20 => 0x2, c39 => 0x1);
+
+# Bit contributed to the reuse mask by each reuse capture.
+my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4);
+
+# Combine the reuse bits of every reuse capture that holds a true value in $capData.
+# Returns the resulting mask (0 when none is set).
+sub genReuseCode
+{
+    my $capData = shift;
+
+    my $mask = 0;
+    foreach my $name (keys %reuseCodes)
+    {
+        $mask |= $reuseCodes{$name} if $capData->{$name};
+    }
+    return $mask;
+}
+
+# Encode one parsed instruction.
+#   $op      - opcode name; selects the per-op table in %flags
+#   $grammar - grammar entry; its {code} value is the starting opcode
+#   $capData - named captures of the matching rule; the predicate and reuse
+#              keys are deleted from it here
+#   $test    - optional array ref; when given, the name of every consumed
+#              capture is appended to it and the "UNUSED" diagnostics are skipped
+# Returns the list ($code, $reuse).
+sub genCode
+{
+    my ($op, $grammar, $capData, $test) = @_;
+
+    my $opFlags = $flags{$op};
+    my $code    = $grammar->{code};
+    my $reuse   = 0;
+
+    if (!exists $capData->{noPred})
+    {
+        # Predicate field at bit 18: register number (7 when none was
+        # captured), with 8 added in for a negated predicate.
+        my $predBits = $capData->{predNum} // 7;
+        push @$test, 'predNum' if $test;
+        if (exists $capData->{predNot})
+        {
+            $predBits |= 8;
+            push @$test, 'predNot' if $test;
+        }
+        delete @{$capData}{qw(predNum predNot)};
+        $code |= $predBits << 18;
+    }
+    else
+    {
+        delete $capData->{noPred};
+        push @$test, 'noPred' if $test;
+    }
+
+    # Reuse captures are removed from $capData and folded into the reuse mask.
+    foreach my $slot (1 .. 3)
+    {
+        my $rcode = "reuse$slot";
+        next unless delete $capData->{$rcode};
+        $reuse |= $reuseCodes{$rcode};
+        push @$test, $rcode if $test;
+    }
+
+    # Every remaining capture contributes through up to three tables:
+    # constant-bank marker, operand encoder and per-op flag value.
+    foreach my $capture (keys %$capData)
+    {
+        my $isOperand = exists $operands{$capture};
+
+        $code ^= $constCodes{$capture} << 62 if exists $constCodes{$capture};
+
+        # r20 is not encoded when an r39s20 capture is also present.
+        if ($isOperand && !($capture eq 'r20' && exists $capData->{r39s20}))
+        {
+            $code ^= $operands{$capture}->($capData->{$capture});
+            push @$test, $capture if $test;
+        }
+
+        if (!exists $opFlags->{$capture})
+        {
+            # Neither a flag nor an operand: report it (outside test mode).
+            if (!$isOperand && !$test)
+            {
+                warn "UNUSED: $op: $capture: $capData->{$capture}\n";
+                warn Dumper($opFlags);
+            }
+        }
+        elsif (ref $opFlags->{$capture})
+        {
+            # Grouped flag: the captured text selects the value.
+            $code ^= $opFlags->{$capture}{$capData->{$capture}};
+            push @$test, "$capture:$capData->{$capture}" if $test;
+        }
+        else
+        {
+            # Simple flag: presence of the capture selects the value.
+            $code ^= $opFlags->{$capture};
+            push @$test, $capture if $test;
+        }
+    }
+
+    return $code, $reuse;
+}
+
+
+# Control prefix: four colon-separated flag positions (T, G, D, S - each either
+# the letter or '-') followed by a two-digit number; captured as "ctrl".
+my $CtrlRe = qr'(?<ctrl>[T\-]:[G\-]:[D\-]:[S\-]:[0-9]{2})';
+# Predicate guard "@Pn" or "@!Pn" plus its trailing whitespace ("pred");
+# "predReg" is the "Pn" part alone.
+my $PredRe = qr'(?<pred>@!?(?<predReg>P\d)\s+)';
+# Instruction: optional predicate, opcode word ("op"), then everything up to
+# and including the first ';' ("rest").
+my $InstRe = qr"$PredRe?(?<op>\w+)(?<rest>[^;]*;)"o;
+# Whatever follows on the line, captured as "comment".
+my $CommRe = qr'(?<comment>.*)';
+
+# Parse one assembly source line of the form "<ctrl> <instruction>; <comment>".
+# Returns a hash ref (lineNum, pred, predReg, space, op, comment, inst, ctrl)
+# or undef when the line does not have that shape.
+sub processAsmLine
+{
+    my ($line, $lineNum) = @_;
+
+    return undef unless $line =~ m"^$CtrlRe(?<space>\s+)$InstRe$CommRe"o;
+
+    # Snapshot the captures before calling helpers that run their own matches.
+    my %cap = %+;
+
+    my %parsed = (
+        lineNum => $lineNum,
+        pred    => $cap{pred},
+        predReg => $cap{predReg},
+        space   => $cap{space},
+        op      => $cap{op},
+        comment => $cap{comment},
+    );
+    $parsed{inst} = normalizeSpacing($cap{pred} . $cap{op} . $cap{rest});
+    $parsed{ctrl} = readCtrl($cap{ctrl}, $line);
+
+    return \%parsed;
+}
+
+# Parse one disassembly line of the form
+#   "  /*<hex num>*/  <instruction>;  /* 0x<hex code> ..."
+# Returns a hash ref (num, pred, op, ins, inst, code) with num and code
+# converted from hex, or undef when the line does not have that shape.
+sub processSassLine
+{
+    my $line = shift;
+
+    return undef unless $line =~ m"^\s+/\*(?<num>[0-9a-f]+)\*/\s+$InstRe\s+/\* (?<code>0x[0-9a-f]+)"o;
+
+    # Snapshot the captures before calling helpers that run their own matches.
+    my %cap = %+;
+
+    my %parsed = (
+        num  => hex($cap{num}),
+        pred => $cap{pred},
+        op   => $cap{op},
+        code => hex($cap{code}),
+    );
+    # "ins" is the instruction without its predicate, "inst" includes it.
+    $parsed{ins}  = normalizeSpacing($cap{op} . $cap{rest});
+    $parsed{inst} = normalizeSpacing($cap{pred} . $cap{op} . $cap{rest});
+
+    return \%parsed;
+}
+
+# Decode a disassembly line that carries only a 64-bit word ("  /* 0x... */").
+#   $ctrl - optional array ref; receives seven 8-bit fields taken from bits
+#           2-9, 10-17, ... 50-57 of the word
+#   $ruse - optional array ref; receives seven 4-bit fields taken at bit
+#           offsets 17, 38, 59, 17, 38, 59, 59
+# Returns 1 when the line matched, 0 otherwise (nothing is pushed then).
+sub processSassCtrlLine
+{
+    my ($line, $ctrl, $ruse) = @_;
+
+    my ($word) = $line =~ m'^\s+\/\* (0x[0-9a-f]+)';
+    return 0 unless defined $word;
+
+    my $code = hex($word);
+
+    if (ref $ctrl)
+    {
+        push @$ctrl, map { ($code >> (2 + 8 * $_)) & 0xff } 0 .. 6;
+    }
+    if (ref $ruse)
+    {
+        push @$ruse, map { ($code >> $_) & 0xf } 17, 38, 59, 17, 38, 59, 59;
+    }
+    return 1;
+}
+
+# Expand the XMAD.LO, XMAD.LO2 and XMAD.LO2C pseudo-instructions found in the
+# source text $file into sequences of plain XMAD instructions and return the
+# rewritten text. Every emitted instruction repeats the control prefix,
+# spacing and predicate of the line it replaces; the original trailing comment
+# is attached to the first emitted line. Dies when the destination register
+# is the same as the first operand. $hex and $immed are patterns defined
+# elsewhere in this file.
+sub replaceXMADs
+{
+    my $file = shift;
+
+    # XMAD.LO d, a, b, c, x  ->  XMAD.MRG x, a, b.H1, RZ
+    #                            XMAD d, a, b, c
+    #                            XMAD.PSL.CBCC d, a.H1, x.H1, d
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD\.LO\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<x>\w+)\s*;$CommRe/
+
+        die "XMAD.LO: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s
+%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s;
+%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;',
+                @+{qw(ctrl space pred d a b c x comment)}
+    /egmos;
+
+    # XMAD[.t16.t16].LO2 d, a, b, c  ->  XMAD[mod] d, a, b, c
+    #                                    XMAD[mod].PSL d, a.H1, b, d
+    # (b may be an immediate; c may be a constant-bank reference)
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>-?$immed|\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/
+
+        die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    # XMAD[.t16.t16].LO2C d, a, b, c  ->  XMAD[mod] d, a, b, c
+    #                                     XMAD[mod].PSL d, a, b.H1, d
+    # (here b may be a constant-bank reference)
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<c>\w+)\s*;$CommRe/
+
+        die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    return $file;
+}
+# Return a copy of the instruction text with each tab turned into a space
+# and every run of two or more whitespace characters collapsed to one space.
+sub normalizeSpacing
+{
+    my ($text) = @_;
+
+    for ($text)
+    {
+        s/\t/ /g;
+        s/\s{2,}/ /g;
+    }
+    return $text;
+}
+
+
+# Render a control byte as text "T:G:D:S:NN".
+#   bit 7 -> 'T', bit 6 -> 'G', bit 4 -> 'S' when set ('-' when clear)
+#   bit 5 -> 'D' when CLEAR, '-' when set
+#   bits 0-3 -> two-digit decimal number
+sub printCtrl
+{
+    my $code = shift;
+
+    return sprintf '%s:%s:%s:%s:%02d',
+        ($code & 0x80 ? 'T' : '-'),
+        ($code & 0x40 ? 'G' : '-'),
+        ($code & 0x20 ? '-' : 'D'),
+        ($code & 0x10 ? 'S' : '-'),
+        $code & 0x0f;
+}
+# Pack a textual control string "T:G:D:S:NN" into its control byte.
+#   $ctrl    - five colon-separated fields; 'T', 'G' and 'S' set bits 7, 6
+#              and 4, 'D' leaves bit 5 clear (anything else sets it), and the
+#              last field is the decimal value for bits 0-3
+#   $context - optional text (the caller passes the source line) quoted in
+#              the error message
+# Dies when the number does not fit in bits 0-3: it would otherwise be ORed
+# over the flag bits and silently change them.
+sub readCtrl
+{
+    my ($ctrl, $context) = @_;
+    my ($texbar, $globalbar, $dual_issue, $sharedbar, $stall) = split ':', $ctrl;
+
+    $texbar     = $texbar     eq 'T' ? 1 : 0;
+    $globalbar  = $globalbar  eq 'G' ? 1 : 0;
+    $dual_issue = $dual_issue eq 'D' ? 0 : 1;
+    $sharedbar  = $sharedbar  eq 'S' ? 1 : 0;
+    $stall      = sprintf("%d", $stall) + 0;
+
+    die sprintf('stall count out of range (0-15): %d at %s', $stall, defined $context ? $context : $ctrl)
+        if $stall < 0 || $stall > 0x0f;
+
+    return
+        $texbar << 7 |
+        $globalbar << 6 |
+        $dual_issue << 5 |
+        $sharedbar << 4 |
+        $stall;
+}
+
+# Translate a register name through $regMap.
+# Returns the mapped value when the name has a plain (non-reference) entry;
+# otherwise the name itself is returned unchanged.
+sub getRegNum
+{
+    my ($regMap, $regName) = @_;
+
+    if (exists $regMap->{$regName})
+    {
+        my $mapped = $regMap->{$regName};
+        return $mapped unless ref $mapped;
+    }
+    return $regName;
+}
+
+# List the registers written through the r0 capture.
+#   type '.64'  or i31w4 '0x3' -> two registers
+#   type '.128' or i31w4 '0xf' -> four registers
+#   anything else              -> just the r0 name
+# A literal "R<n>" expands to consecutive numbered registers; any other name
+# is looked up in $vectors (confess when it is not a suitable vector).
+# Returns nothing when there is no r0 capture or it is RZ.
+sub getVecRegisters
+{
+    my ($vectors, $capData) = @_;
+
+    my $regName = $capData->{r0};
+    return unless $regName;
+    return if $regName eq 'RZ';
+
+    # The 64-bit test takes precedence when both would apply.
+    my $is64  = $capData->{type} eq '.64'  || $capData->{i31w4} eq '0x3';
+    my $is128 = $capData->{type} eq '.128' || $capData->{i31w4} eq '0xf';
+    return $regName unless $is64 || $is128;
+
+    my $count = $is64 ? 2 : 4;
+    if ($regName =~ m'^R(\d+)$')
+    {
+        return map "R$_", ($1 .. $1 + $count - 1);
+    }
+
+    if ($is64)
+    {
+        confess "$regName not a 64bit vector register" unless exists $vectors->{$regName};
+        return @{$vectors->{$regName}}[0,1];
+    }
+
+    confess "$regName not a 128bit vector register" unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4;
+    return @{$vectors->{$regName}};
+}
+
+# List the registers read through the r8 (address) capture.
+# Without an E capture the address is the single r8 register; with E it is a
+# register pair: "R<n>" expands to R<n>, R<n+1>, any other name is looked up
+# in $vectors (the vector map is dumped and confess is called when the name
+# is missing). Returns nothing when there is no r8 capture or it is RZ.
+sub getAddrVecRegisters
+{
+    my ($vectors, $capData) = @_;
+
+    my $regName = $capData->{r8};
+    return unless $regName;
+    return if $regName eq 'RZ';
+    return $regName unless exists $capData->{E};
+
+    return map "R$_", ($1 .. $1+1) if $regName =~ m'^R(\d+)$';
+
+    unless (exists $vectors->{$regName})
+    {
+        print Dumper($vectors);
+        confess "$regName not a 64bit vector register";
+    }
+    return @{$vectors->{$regName}}[0,1];
+}
+
+__END__
+
+
+
diff --git a/Assembler/KeplerAs/pm_to_blib b/Assembler/KeplerAs/pm_to_blib
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/MaxAs/Changes b/Assembler/MaxAs/Changes
new file mode 100644
index 0000000..a6d8a13
--- /dev/null
+++ b/Assembler/MaxAs/Changes
@@ -0,0 +1,4 @@
+Revision history for Perl extension MaxAs::MaxAs.
+
+1.01  Thu Mar 26 17:09:57 2015
+	- original Perl packaged version
diff --git a/Assembler/MaxAs/Install.sh b/Assembler/MaxAs/Install.sh
new file mode 100755
index 0000000..57c8d24
--- /dev/null
+++ b/Assembler/MaxAs/Install.sh
@@ -0,0 +1,3 @@
+perl Makefile.PL &&   # generate the Makefile; stop here if this fails
+make &&               # build; do not install a failed build
+sudo make install     # install system-wide (asks for sudo)
diff --git a/Assembler/MaxAs/LICENSE b/Assembler/MaxAs/LICENSE
new file mode 100644
index 0000000..6c28fad
--- /dev/null
+++ b/Assembler/MaxAs/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Scott Gray
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Assembler/MaxAs/MANIFEST b/Assembler/MaxAs/MANIFEST
new file mode 100644
index 0000000..a25084c
--- /dev/null
+++ b/Assembler/MaxAs/MANIFEST
@@ -0,0 +1,38 @@
+bin/maxas.pl
+Changes
+lib/MaxAs/Cubin.pm
+lib/MaxAs/MaxAs.pm
+lib/MaxAs/MaxAsGrammar.pm
+LICENSE
+Makefile.PL
+MANIFEST
+microbench/microbench.cpp
+microbench/microbench.cu
+microbench/microbench.sass
+microbench/shared.pl
+microbench/shared_lds.sass
+microbench/shared_sts16.sass
+microbench/throughput.pl
+microbench/throughput.sass
+microbench/throughput2.pl
+microbench/throughput2.sass
+microbench/throughput3.pl
+microbench/throughput4.pl
+microbench/throughput5.pl
+microbench/xmad.pl
+microbench/xmad2.sass
+README.md
+sgemm/batched_gemm.xlsx
+sgemm/cublas_sgemm.ptx
+sgemm/sgemm.cpp
+sgemm/sgemm.cu
+sgemm/sgemm.pl
+sgemm/sgemm.sln
+sgemm/sgemm.vcxproj
+sgemm/sgemm128.sass
+sgemm/sgemm64.sass
+sgemm/sgemm_final_128.sass
+sgemm/sgemm_final_64.sass
+sgemm/sgemm_pre_128.sass
+sgemm/sgemm_pre_64.sass
+t/MaxAs-MaxAs.t
diff --git a/Assembler/MaxAs/Makefile b/Assembler/MaxAs/Makefile
new file mode 100644
index 0000000..79e0de9
--- /dev/null
+++ b/Assembler/MaxAs/Makefile
@@ -0,0 +1,840 @@
+# This Makefile is for the MaxAs::MaxAs extension to perl.
+#
+# It was generated automatically by MakeMaker version
+# 6.55_02 (Revision: 65502) from the contents of
+# Makefile.PL. Don't edit this file, edit Makefile.PL instead.
+#
+#       ANY CHANGES MADE HERE WILL BE LOST!
+#
+#   MakeMaker ARGV: ()
+#
+
+#   MakeMaker Parameters:
+
+#     ABSTRACT_FROM => q[lib/MaxAs/MaxAs.pm]
+#     AUTHOR => q[Scott Gray <sgray@nervanasys.com>]
+#     BUILD_REQUIRES => {  }
+#     EXE_FILES => [q[bin/maxas.pl]]
+#     LICENSE => q[MIT]
+#     NAME => q[MaxAs::MaxAs]
+#     PREREQ_PM => { Data::Dumper=>q[2.145], Carp=>q[1.29] }
+#     VERSION_FROM => q[lib/MaxAs/MaxAs.pm]
+
+# --- MakeMaker post_initialize section:
+
+
+# --- MakeMaker const_config section:
+
+# These definitions are from config.sh (via /usr/lib64/perl5/Config.pm).
+# They may have been overridden via Makefile.PL or on the command line.
+AR = ar
+CC = gcc
+CCCDLFLAGS = -fPIC
+CCDLFLAGS = -Wl,-E -Wl,-rpath,/usr/lib64/perl5/CORE
+DLEXT = so
+DLSRC = dl_dlopen.xs
+EXE_EXT = 
+FULL_AR = /usr/bin/ar
+LD = gcc
+LDDLFLAGS = -shared -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic
+LDFLAGS =  -fstack-protector
+LIBC = 
+LIB_EXT = .a
+OBJ_EXT = .o
+OSNAME = linux
+OSVERS = 2.6.18-308.1.1.el5
+RANLIB = :
+SITELIBEXP = /usr/local/share/perl5
+SITEARCHEXP = /usr/local/lib64/perl5
+SO = so
+VENDORARCHEXP = /usr/lib64/perl5/vendor_perl
+VENDORLIBEXP = /usr/share/perl5/vendor_perl
+
+
+# --- MakeMaker constants section:
+AR_STATIC_ARGS = cr
+DIRFILESEP = /
+DFSEP = $(DIRFILESEP)
+NAME = MaxAs::MaxAs
+NAME_SYM = MaxAs_MaxAs
+VERSION = 1.06
+VERSION_MACRO = VERSION
+VERSION_SYM = 1_06
+DEFINE_VERSION = -D$(VERSION_MACRO)=\"$(VERSION)\"
+XS_VERSION = 1.06
+XS_VERSION_MACRO = XS_VERSION
+XS_DEFINE_VERSION = -D$(XS_VERSION_MACRO)=\"$(XS_VERSION)\"
+INST_ARCHLIB = blib/arch
+INST_SCRIPT = blib/script
+INST_BIN = blib/bin
+INST_LIB = blib/lib
+INST_MAN1DIR = blib/man1
+INST_MAN3DIR = blib/man3
+MAN1EXT = 1
+MAN3EXT = 3pm
+INSTALLDIRS = site
+DESTDIR = 
+PREFIX = $(SITEPREFIX)
+PERLPREFIX = /usr
+SITEPREFIX = /usr/local
+VENDORPREFIX = /usr
+INSTALLPRIVLIB = /usr/share/perl5
+DESTINSTALLPRIVLIB = $(DESTDIR)$(INSTALLPRIVLIB)
+INSTALLSITELIB = /usr/local/share/perl5
+DESTINSTALLSITELIB = $(DESTDIR)$(INSTALLSITELIB)
+INSTALLVENDORLIB = /usr/share/perl5/vendor_perl
+DESTINSTALLVENDORLIB = $(DESTDIR)$(INSTALLVENDORLIB)
+INSTALLARCHLIB = /usr/lib64/perl5
+DESTINSTALLARCHLIB = $(DESTDIR)$(INSTALLARCHLIB)
+INSTALLSITEARCH = /usr/local/lib64/perl5
+DESTINSTALLSITEARCH = $(DESTDIR)$(INSTALLSITEARCH)
+INSTALLVENDORARCH = /usr/lib64/perl5/vendor_perl
+DESTINSTALLVENDORARCH = $(DESTDIR)$(INSTALLVENDORARCH)
+INSTALLBIN = /usr/bin
+DESTINSTALLBIN = $(DESTDIR)$(INSTALLBIN)
+INSTALLSITEBIN = /usr/local/bin
+DESTINSTALLSITEBIN = $(DESTDIR)$(INSTALLSITEBIN)
+INSTALLVENDORBIN = /usr/bin
+DESTINSTALLVENDORBIN = $(DESTDIR)$(INSTALLVENDORBIN)
+INSTALLSCRIPT = /usr/bin
+DESTINSTALLSCRIPT = $(DESTDIR)$(INSTALLSCRIPT)
+INSTALLSITESCRIPT = /usr/local/bin
+DESTINSTALLSITESCRIPT = $(DESTDIR)$(INSTALLSITESCRIPT)
+INSTALLVENDORSCRIPT = /usr/bin
+DESTINSTALLVENDORSCRIPT = $(DESTDIR)$(INSTALLVENDORSCRIPT)
+INSTALLMAN1DIR = /usr/share/man/man1
+DESTINSTALLMAN1DIR = $(DESTDIR)$(INSTALLMAN1DIR)
+INSTALLSITEMAN1DIR = /usr/local/share/man/man1
+DESTINSTALLSITEMAN1DIR = $(DESTDIR)$(INSTALLSITEMAN1DIR)
+INSTALLVENDORMAN1DIR = /usr/share/man/man1
+DESTINSTALLVENDORMAN1DIR = $(DESTDIR)$(INSTALLVENDORMAN1DIR)
+INSTALLMAN3DIR = /usr/share/man/man3
+DESTINSTALLMAN3DIR = $(DESTDIR)$(INSTALLMAN3DIR)
+INSTALLSITEMAN3DIR = /usr/local/share/man/man3
+DESTINSTALLSITEMAN3DIR = $(DESTDIR)$(INSTALLSITEMAN3DIR)
+INSTALLVENDORMAN3DIR = /usr/share/man/man3
+DESTINSTALLVENDORMAN3DIR = $(DESTDIR)$(INSTALLVENDORMAN3DIR)
+PERL_LIB = /usr/share/perl5
+PERL_ARCHLIB = /usr/lib64/perl5
+LIBPERL_A = libperl.a
+FIRST_MAKEFILE = Makefile
+MAKEFILE_OLD = Makefile.old
+MAKE_APERL_FILE = Makefile.aperl
+PERLMAINCC = $(CC)
+PERL_INC = /usr/lib64/perl5/CORE
+PERL = /usr/bin/perl
+FULLPERL = /usr/bin/perl
+ABSPERL = $(PERL)
+PERLRUN = $(PERL)
+FULLPERLRUN = $(FULLPERL)
+ABSPERLRUN = $(ABSPERL)
+PERLRUNINST = $(PERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
+FULLPERLRUNINST = $(FULLPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
+ABSPERLRUNINST = $(ABSPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
+PERL_CORE = 0
+PERM_DIR = 755
+PERM_RW = 644
+PERM_RWX = 755
+
+MAKEMAKER   = /usr/share/perl5/ExtUtils/MakeMaker.pm
+MM_VERSION  = 6.55_02
+MM_REVISION = 65502
+
+# FULLEXT = Pathname for extension directory (eg Foo/Bar/Oracle).
+# BASEEXT = Basename part of FULLEXT. May be just equal FULLEXT. (eg Oracle)
+# PARENT_NAME = NAME without BASEEXT and no trailing :: (eg Foo::Bar)
+# DLBASE  = Basename part of dynamic library. May be just equal BASEEXT.
+MAKE = make
+FULLEXT = MaxAs/MaxAs
+BASEEXT = MaxAs
+PARENT_NAME = MaxAs
+DLBASE = $(BASEEXT)
+VERSION_FROM = lib/MaxAs/MaxAs.pm
+OBJECT = 
+LDFROM = $(OBJECT)
+LINKTYPE = dynamic
+BOOTDEP = 
+
+# Handy lists of source code files:
+XS_FILES = 
+C_FILES  = 
+O_FILES  = 
+H_FILES  = 
+MAN1PODS = 
+MAN3PODS = lib/MaxAs/MaxAs.pm
+
+# Where is the Config information that we are using/depend on
+CONFIGDEP = $(PERL_ARCHLIB)$(DFSEP)Config.pm $(PERL_INC)$(DFSEP)config.h
+
+# Where to build things
+INST_LIBDIR      = $(INST_LIB)/MaxAs
+INST_ARCHLIBDIR  = $(INST_ARCHLIB)/MaxAs
+
+INST_AUTODIR     = $(INST_LIB)/auto/$(FULLEXT)
+INST_ARCHAUTODIR = $(INST_ARCHLIB)/auto/$(FULLEXT)
+
+INST_STATIC      = 
+INST_DYNAMIC     = 
+INST_BOOT        = 
+
+# Extra linker info
+EXPORT_LIST        = 
+PERL_ARCHIVE       = 
+PERL_ARCHIVE_AFTER = 
+
+
+TO_INST_PM = lib/MaxAs/Cubin.pm \
+	lib/MaxAs/MaxAs.pm \
+	lib/MaxAs/MaxAsGrammar.pm
+
+PM_TO_BLIB = lib/MaxAs/MaxAs.pm \
+	blib/lib/MaxAs/MaxAs.pm \
+	lib/MaxAs/Cubin.pm \
+	blib/lib/MaxAs/Cubin.pm \
+	lib/MaxAs/MaxAsGrammar.pm \
+	blib/lib/MaxAs/MaxAsGrammar.pm
+
+
+# --- MakeMaker platform_constants section:
+MM_Unix_VERSION = 6.55_02
+PERL_MALLOC_DEF = -DPERL_EXTMALLOC_DEF -Dmalloc=Perl_malloc -Dfree=Perl_mfree -Drealloc=Perl_realloc -Dcalloc=Perl_calloc
+
+
+# --- MakeMaker tool_autosplit section:
+# Usage: $(AUTOSPLITFILE) FileToSplit AutoDirToSplitInto
+AUTOSPLITFILE = $(ABSPERLRUN)  -e 'use AutoSplit;  autosplit($$ARGV[0], $$ARGV[1], 0, 1, 1)' --
+
+
+
+# --- MakeMaker tool_xsubpp section:
+
+
+# --- MakeMaker tools_other section:
+SHELL = /bin/sh
+CHMOD = chmod
+CP = cp
+MV = mv
+NOOP = $(TRUE)
+NOECHO = @
+RM_F = rm -f
+RM_RF = rm -rf
+TEST_F = test -f
+TOUCH = touch
+UMASK_NULL = umask 0
+DEV_NULL = > /dev/null 2>&1
+MKPATH = $(ABSPERLRUN) -MExtUtils::Command -e 'mkpath' --
+EQUALIZE_TIMESTAMP = $(ABSPERLRUN) -MExtUtils::Command -e 'eqtime' --
+FALSE = false
+TRUE = true
+ECHO = echo
+ECHO_N = echo -n
+UNINST = 0
+VERBINST = 0
+MOD_INSTALL = $(ABSPERLRUN) -MExtUtils::Install -e 'install([ from_to => {@ARGV}, verbose => '\''$(VERBINST)'\'', uninstall_shadows => '\''$(UNINST)'\'', dir_mode => '\''$(PERM_DIR)'\'' ]);' --
+DOC_INSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'perllocal_install' --
+UNINSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'uninstall' --
+WARN_IF_OLD_PACKLIST = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'warn_if_old_packlist' --
+MACROSTART = 
+MACROEND = 
+USEMAKEFILE = -f
+FIXIN = $(ABSPERLRUN) -MExtUtils::MY -e 'MY->fixin(shift)' --
+
+
+# --- MakeMaker makemakerdflt section:
+makemakerdflt : all
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker dist section:
+TAR = tar
+TARFLAGS = cvf
+ZIP = zip
+ZIPFLAGS = -r
+COMPRESS = gzip --best
+SUFFIX = .gz
+SHAR = shar
+PREOP = $(NOECHO) $(NOOP)
+POSTOP = $(NOECHO) $(NOOP)
+TO_UNIX = $(NOECHO) $(NOOP)
+CI = ci -u
+RCS_LABEL = rcs -Nv$(VERSION_SYM): -q
+DIST_CP = best
+DIST_DEFAULT = tardist
+DISTNAME = MaxAs-MaxAs
+DISTVNAME = MaxAs-MaxAs-1.06
+
+
+# --- MakeMaker macro section:
+
+
+# --- MakeMaker depend section:
+
+
+# --- MakeMaker cflags section:
+
+
+# --- MakeMaker const_loadlibs section:
+
+
+# --- MakeMaker const_cccmd section:
+
+
+# --- MakeMaker post_constants section:
+
+
+# --- MakeMaker pasthru section:
+
+PASTHRU = LIBPERL_A="$(LIBPERL_A)"\
+	LINKTYPE="$(LINKTYPE)"\
+	PREFIX="$(PREFIX)"
+
+
+# --- MakeMaker special_targets section:
+.SUFFIXES : .xs .c .C .cpp .i .s .cxx .cc $(OBJ_EXT)
+
+.PHONY: all config static dynamic test linkext manifest blibdirs clean realclean disttest distdir
+
+
+
+# --- MakeMaker c_o section:
+
+
+# --- MakeMaker xs_c section:
+
+
+# --- MakeMaker xs_o section:
+
+
+# --- MakeMaker top_targets section:
+all :: pure_all manifypods
+	$(NOECHO) $(NOOP)
+
+
+pure_all :: config pm_to_blib subdirs linkext
+	$(NOECHO) $(NOOP)
+
+subdirs :: $(MYEXTLIB)
+	$(NOECHO) $(NOOP)
+
+config :: $(FIRST_MAKEFILE) blibdirs
+	$(NOECHO) $(NOOP)
+
+help :
+	perldoc ExtUtils::MakeMaker
+
+
+# --- MakeMaker blibdirs section:
+blibdirs : $(INST_LIBDIR)$(DFSEP).exists $(INST_ARCHLIB)$(DFSEP).exists $(INST_AUTODIR)$(DFSEP).exists $(INST_ARCHAUTODIR)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists $(INST_SCRIPT)$(DFSEP).exists $(INST_MAN1DIR)$(DFSEP).exists $(INST_MAN3DIR)$(DFSEP).exists
+	$(NOECHO) $(NOOP)
+
+# Backwards compat with 6.18 through 6.25
+blibdirs.ts : blibdirs
+	$(NOECHO) $(NOOP)
+
+$(INST_LIBDIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_LIBDIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_LIBDIR)
+	$(NOECHO) $(TOUCH) $(INST_LIBDIR)$(DFSEP).exists
+
+$(INST_ARCHLIB)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_ARCHLIB)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHLIB)
+	$(NOECHO) $(TOUCH) $(INST_ARCHLIB)$(DFSEP).exists
+
+$(INST_AUTODIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_AUTODIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_AUTODIR)
+	$(NOECHO) $(TOUCH) $(INST_AUTODIR)$(DFSEP).exists
+
+$(INST_ARCHAUTODIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_ARCHAUTODIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHAUTODIR)
+	$(NOECHO) $(TOUCH) $(INST_ARCHAUTODIR)$(DFSEP).exists
+
+$(INST_BIN)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_BIN)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_BIN)
+	$(NOECHO) $(TOUCH) $(INST_BIN)$(DFSEP).exists
+
+$(INST_SCRIPT)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_SCRIPT)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_SCRIPT)
+	$(NOECHO) $(TOUCH) $(INST_SCRIPT)$(DFSEP).exists
+
+$(INST_MAN1DIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_MAN1DIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN1DIR)
+	$(NOECHO) $(TOUCH) $(INST_MAN1DIR)$(DFSEP).exists
+
+$(INST_MAN3DIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_MAN3DIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN3DIR)
+	$(NOECHO) $(TOUCH) $(INST_MAN3DIR)$(DFSEP).exists
+
+
+
+# --- MakeMaker linkext section:
+
+linkext :: $(LINKTYPE)
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker dlsyms section:
+
+
+# --- MakeMaker dynamic section:
+
+dynamic :: $(FIRST_MAKEFILE) $(INST_DYNAMIC) $(INST_BOOT)
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker dynamic_bs section:
+
+BOOTSTRAP =
+
+
+# --- MakeMaker dynamic_lib section:
+
+
+# --- MakeMaker static section:
+
+## $(INST_PM) has been moved to the all: target.
+## It remains here for awhile to allow for old usage: "make static"
+static :: $(FIRST_MAKEFILE) $(INST_STATIC)
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker static_lib section:
+
+
+# --- MakeMaker manifypods section:
+
+POD2MAN_EXE = $(PERLRUN) "-MExtUtils::Command::MM" -e pod2man "--"
+POD2MAN = $(POD2MAN_EXE)
+
+
+manifypods : pure_all  \
+	lib/MaxAs/MaxAs.pm
+	$(NOECHO) $(POD2MAN) --section=3 --perm_rw=$(PERM_RW) \
+	  lib/MaxAs/MaxAs.pm $(INST_MAN3DIR)/MaxAs::MaxAs.$(MAN3EXT) 
+
+
+
+
+# --- MakeMaker processPL section:
+
+
+# --- MakeMaker installbin section:
+
+EXE_FILES = bin/maxas.pl
+
+pure_all :: $(INST_SCRIPT)/maxas.pl
+	$(NOECHO) $(NOOP)
+
+realclean ::
+	$(RM_F) \
+	  $(INST_SCRIPT)/maxas.pl 
+
+$(INST_SCRIPT)/maxas.pl : bin/maxas.pl $(FIRST_MAKEFILE) $(INST_SCRIPT)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists
+	$(NOECHO) $(RM_F) $(INST_SCRIPT)/maxas.pl
+	$(CP) bin/maxas.pl $(INST_SCRIPT)/maxas.pl
+	$(FIXIN) $(INST_SCRIPT)/maxas.pl
+	-$(NOECHO) $(CHMOD) $(PERM_RWX) $(INST_SCRIPT)/maxas.pl
+
+
+
+# --- MakeMaker subdirs section:
+
+# none
+
+# --- MakeMaker clean_subdirs section:
+clean_subdirs :
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker clean section:
+
+# Delete temporary files but do not touch installed files. We don't delete
+# the Makefile here so a later make realclean still has a makefile to use.
+
+clean :: clean_subdirs
+	- $(RM_F) \
+	  *$(LIB_EXT) core \
+	  core.[0-9] $(INST_ARCHAUTODIR)/extralibs.all \
+	  core.[0-9][0-9] $(BASEEXT).bso \
+	  pm_to_blib.ts core.[0-9][0-9][0-9][0-9] \
+	  $(BASEEXT).x $(BOOTSTRAP) \
+	  perl$(EXE_EXT) tmon.out \
+	  *$(OBJ_EXT) pm_to_blib \
+	  $(INST_ARCHAUTODIR)/extralibs.ld blibdirs.ts \
+	  core.[0-9][0-9][0-9][0-9][0-9] *perl.core \
+	  core.*perl.*.? $(MAKE_APERL_FILE) \
+	  perl $(BASEEXT).def \
+	  core.[0-9][0-9][0-9] mon.out \
+	  lib$(BASEEXT).def perlmain.c \
+	  perl.exe so_locations \
+	  $(BASEEXT).exp 
+	- $(RM_RF) \
+	  blib 
+	- $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) $(DEV_NULL)
+
+
+# --- MakeMaker realclean_subdirs section:
+realclean_subdirs :
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker realclean section:
+# Delete temporary files (via clean) and also delete dist files
+realclean purge ::  clean realclean_subdirs
+	- $(RM_F) \
+	  $(MAKEFILE_OLD) $(FIRST_MAKEFILE) 
+	- $(RM_RF) \
+	  $(DISTVNAME) 
+
+
+# --- MakeMaker metafile section:
+metafile : create_distdir
+	$(NOECHO) $(ECHO) Generating META.yml
+	$(NOECHO) $(ECHO) '--- #YAML:1.0' > META_new.yml
+	$(NOECHO) $(ECHO) 'name:               MaxAs-MaxAs' >> META_new.yml
+	$(NOECHO) $(ECHO) 'version:            1.06' >> META_new.yml
+	$(NOECHO) $(ECHO) 'abstract:           Assembler for NVIDIA Maxwell architecture' >> META_new.yml
+	$(NOECHO) $(ECHO) 'author:' >> META_new.yml
+	$(NOECHO) $(ECHO) '    - Scott Gray <sgray@nervanasys.com>' >> META_new.yml
+	$(NOECHO) $(ECHO) 'license:            MIT' >> META_new.yml
+	$(NOECHO) $(ECHO) 'distribution_type:  module' >> META_new.yml
+	$(NOECHO) $(ECHO) 'configure_requires:' >> META_new.yml
+	$(NOECHO) $(ECHO) '    ExtUtils::MakeMaker:  0' >> META_new.yml
+	$(NOECHO) $(ECHO) 'build_requires:' >> META_new.yml
+	$(NOECHO) $(ECHO) '    ExtUtils::MakeMaker:  0' >> META_new.yml
+	$(NOECHO) $(ECHO) 'requires:' >> META_new.yml
+	$(NOECHO) $(ECHO) '    Carp:          1.29' >> META_new.yml
+	$(NOECHO) $(ECHO) '    Data::Dumper:  2.145' >> META_new.yml
+	$(NOECHO) $(ECHO) 'no_index:' >> META_new.yml
+	$(NOECHO) $(ECHO) '    directory:' >> META_new.yml
+	$(NOECHO) $(ECHO) '        - t' >> META_new.yml
+	$(NOECHO) $(ECHO) '        - inc' >> META_new.yml
+	$(NOECHO) $(ECHO) 'generated_by:       ExtUtils::MakeMaker version 6.55_02' >> META_new.yml
+	$(NOECHO) $(ECHO) 'meta-spec:' >> META_new.yml
+	$(NOECHO) $(ECHO) '    url:      http://module-build.sourceforge.net/META-spec-v1.4.html' >> META_new.yml
+	$(NOECHO) $(ECHO) '    version:  1.4' >> META_new.yml
+	-$(NOECHO) $(MV) META_new.yml $(DISTVNAME)/META.yml
+
+
+# --- MakeMaker signature section:
+signature :
+	cpansign -s
+
+
+# --- MakeMaker dist_basics section:
+distclean :: realclean distcheck
+	$(NOECHO) $(NOOP)
+
+distcheck :
+	$(PERLRUN) "-MExtUtils::Manifest=fullcheck" -e fullcheck
+
+skipcheck :
+	$(PERLRUN) "-MExtUtils::Manifest=skipcheck" -e skipcheck
+
+manifest :
+	$(PERLRUN) "-MExtUtils::Manifest=mkmanifest" -e mkmanifest
+
+veryclean : realclean
+	$(RM_F) *~ */*~ *.orig */*.orig *.bak */*.bak *.old */*.old 
+
+
+
+# --- MakeMaker dist_core section:
+
+dist : $(DIST_DEFAULT) $(FIRST_MAKEFILE)
+	$(NOECHO) $(ABSPERLRUN) -l -e 'print '\''Warning: Makefile possibly out of date with $(VERSION_FROM)'\''' \
+	  -e '    if -e '\''$(VERSION_FROM)'\'' and -M '\''$(VERSION_FROM)'\'' < -M '\''$(FIRST_MAKEFILE)'\'';' --
+
+tardist : $(DISTVNAME).tar$(SUFFIX)
+	$(NOECHO) $(NOOP)
+
+uutardist : $(DISTVNAME).tar$(SUFFIX)
+	uuencode $(DISTVNAME).tar$(SUFFIX) $(DISTVNAME).tar$(SUFFIX) > $(DISTVNAME).tar$(SUFFIX)_uu
+
+$(DISTVNAME).tar$(SUFFIX) : distdir
+	$(PREOP)
+	$(TO_UNIX)
+	$(TAR) $(TARFLAGS) $(DISTVNAME).tar $(DISTVNAME)
+	$(RM_RF) $(DISTVNAME)
+	$(COMPRESS) $(DISTVNAME).tar
+	$(POSTOP)
+
+zipdist : $(DISTVNAME).zip
+	$(NOECHO) $(NOOP)
+
+$(DISTVNAME).zip : distdir
+	$(PREOP)
+	$(ZIP) $(ZIPFLAGS) $(DISTVNAME).zip $(DISTVNAME)
+	$(RM_RF) $(DISTVNAME)
+	$(POSTOP)
+
+shdist : distdir
+	$(PREOP)
+	$(SHAR) $(DISTVNAME) > $(DISTVNAME).shar
+	$(RM_RF) $(DISTVNAME)
+	$(POSTOP)
+
+
+# --- MakeMaker distdir section:
+create_distdir :
+	$(RM_RF) $(DISTVNAME)
+	$(PERLRUN) "-MExtUtils::Manifest=manicopy,maniread" \
+		-e "manicopy(maniread(),'$(DISTVNAME)', '$(DIST_CP)');"
+
+distdir : create_distdir distmeta 
+	$(NOECHO) $(NOOP)
+
+
+
+# --- MakeMaker dist_test section:
+disttest : distdir
+	cd $(DISTVNAME) && $(ABSPERLRUN) Makefile.PL 
+	cd $(DISTVNAME) && $(MAKE) $(PASTHRU)
+	cd $(DISTVNAME) && $(MAKE) test $(PASTHRU)
+
+
+
+# --- MakeMaker dist_ci section:
+
+ci :
+	$(PERLRUN) "-MExtUtils::Manifest=maniread" \
+	  -e "@all = keys %{ maniread() };" \
+	  -e "print(qq{Executing $(CI) @all\n}); system(qq{$(CI) @all});" \
+	  -e "print(qq{Executing $(RCS_LABEL) ...\n}); system(qq{$(RCS_LABEL) @all});"
+
+
+# --- MakeMaker distmeta section:
+distmeta : create_distdir metafile
+	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{META.yml} => q{Module meta-data (added by MakeMaker)}}) } ' \
+	  -e '    or print "Could not add META.yml to MANIFEST: $${'\''@'\''}\n"' --
+
+
+
+# --- MakeMaker distsignature section:
+distsignature : create_distdir
+	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{SIGNATURE} => q{Public-key signature (added by MakeMaker)}}) } ' \
+	  -e '    or print "Could not add SIGNATURE to MANIFEST: $${'\''@'\''}\n"' --
+	$(NOECHO) cd $(DISTVNAME) && $(TOUCH) SIGNATURE
+	cd $(DISTVNAME) && cpansign -s
+
+
+
+# --- MakeMaker install section:
+
+install :: pure_install doc_install
+	$(NOECHO) $(NOOP)
+
+install_perl :: pure_perl_install doc_perl_install
+	$(NOECHO) $(NOOP)
+
+install_site :: pure_site_install doc_site_install
+	$(NOECHO) $(NOOP)
+
+install_vendor :: pure_vendor_install doc_vendor_install
+	$(NOECHO) $(NOOP)
+
+pure_install :: pure_$(INSTALLDIRS)_install
+	$(NOECHO) $(NOOP)
+
+doc_install :: doc_$(INSTALLDIRS)_install
+	$(NOECHO) $(NOOP)
+
+pure__install : pure_site_install
+	$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site
+
+doc__install : doc_site_install
+	$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site
+
+pure_perl_install :: all
+	$(NOECHO) $(MOD_INSTALL) \
+		read $(PERL_ARCHLIB)/auto/$(FULLEXT)/.packlist \
+		write $(DESTINSTALLARCHLIB)/auto/$(FULLEXT)/.packlist \
+		$(INST_LIB) $(DESTINSTALLPRIVLIB) \
+		$(INST_ARCHLIB) $(DESTINSTALLARCHLIB) \
+		$(INST_BIN) $(DESTINSTALLBIN) \
+		$(INST_SCRIPT) $(DESTINSTALLSCRIPT) \
+		$(INST_MAN1DIR) $(DESTINSTALLMAN1DIR) \
+		$(INST_MAN3DIR) $(DESTINSTALLMAN3DIR)
+	$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
+		$(SITEARCHEXP)/auto/$(FULLEXT)
+
+
+pure_site_install :: all
+	$(NOECHO) $(MOD_INSTALL) \
+		read $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist \
+		write $(DESTINSTALLSITEARCH)/auto/$(FULLEXT)/.packlist \
+		$(INST_LIB) $(DESTINSTALLSITELIB) \
+		$(INST_ARCHLIB) $(DESTINSTALLSITEARCH) \
+		$(INST_BIN) $(DESTINSTALLSITEBIN) \
+		$(INST_SCRIPT) $(DESTINSTALLSITESCRIPT) \
+		$(INST_MAN1DIR) $(DESTINSTALLSITEMAN1DIR) \
+		$(INST_MAN3DIR) $(DESTINSTALLSITEMAN3DIR)
+	$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
+		$(PERL_ARCHLIB)/auto/$(FULLEXT)
+
+pure_vendor_install :: all
+	$(NOECHO) $(MOD_INSTALL) \
+		read $(VENDORARCHEXP)/auto/$(FULLEXT)/.packlist \
+		write $(DESTINSTALLVENDORARCH)/auto/$(FULLEXT)/.packlist \
+		$(INST_LIB) $(DESTINSTALLVENDORLIB) \
+		$(INST_ARCHLIB) $(DESTINSTALLVENDORARCH) \
+		$(INST_BIN) $(DESTINSTALLVENDORBIN) \
+		$(INST_SCRIPT) $(DESTINSTALLVENDORSCRIPT) \
+		$(INST_MAN1DIR) $(DESTINSTALLVENDORMAN1DIR) \
+		$(INST_MAN3DIR) $(DESTINSTALLVENDORMAN3DIR)
+
+doc_perl_install :: all
+	$(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLARCHLIB)/perllocal.pod
+	-$(NOECHO) $(MKPATH) $(DESTINSTALLARCHLIB)
+	-$(NOECHO) $(DOC_INSTALL) \
+		"Module" "$(NAME)" \
+		"installed into" "$(INSTALLPRIVLIB)" \
+		LINKTYPE "$(LINKTYPE)" \
+		VERSION "$(VERSION)" \
+		EXE_FILES "$(EXE_FILES)" \
+		>> $(DESTINSTALLARCHLIB)/perllocal.pod
+
+doc_site_install :: all
+	$(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLARCHLIB)/perllocal.pod
+	-$(NOECHO) $(MKPATH) $(DESTINSTALLARCHLIB)
+	-$(NOECHO) $(DOC_INSTALL) \
+		"Module" "$(NAME)" \
+		"installed into" "$(INSTALLSITELIB)" \
+		LINKTYPE "$(LINKTYPE)" \
+		VERSION "$(VERSION)" \
+		EXE_FILES "$(EXE_FILES)" \
+		>> $(DESTINSTALLARCHLIB)/perllocal.pod
+
+doc_vendor_install :: all
+	$(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLARCHLIB)/perllocal.pod
+	-$(NOECHO) $(MKPATH) $(DESTINSTALLARCHLIB)
+	-$(NOECHO) $(DOC_INSTALL) \
+		"Module" "$(NAME)" \
+		"installed into" "$(INSTALLVENDORLIB)" \
+		LINKTYPE "$(LINKTYPE)" \
+		VERSION "$(VERSION)" \
+		EXE_FILES "$(EXE_FILES)" \
+		>> $(DESTINSTALLARCHLIB)/perllocal.pod
+
+
+uninstall :: uninstall_from_$(INSTALLDIRS)dirs
+	$(NOECHO) $(NOOP)
+
+uninstall_from_perldirs ::
+	$(NOECHO) $(UNINSTALL) $(PERL_ARCHLIB)/auto/$(FULLEXT)/.packlist
+
+uninstall_from_sitedirs ::
+	$(NOECHO) $(UNINSTALL) $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist
+
+uninstall_from_vendordirs ::
+	$(NOECHO) $(UNINSTALL) $(VENDORARCHEXP)/auto/$(FULLEXT)/.packlist
+
+
+# --- MakeMaker force section:
+# Phony target to force checking subdirectories.
+FORCE :
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker perldepend section:
+
+
+# --- MakeMaker makefile section:
+# We take a very conservative approach here, but it's worth it.
+# We move Makefile to Makefile.old here to avoid gnu make looping.
+$(FIRST_MAKEFILE) : Makefile.PL $(CONFIGDEP)
+	$(NOECHO) $(ECHO) "Makefile out-of-date with respect to $?"
+	$(NOECHO) $(ECHO) "Cleaning current config before rebuilding Makefile..."
+	-$(NOECHO) $(RM_F) $(MAKEFILE_OLD)
+	-$(NOECHO) $(MV)   $(FIRST_MAKEFILE) $(MAKEFILE_OLD)
+	- $(MAKE) $(USEMAKEFILE) $(MAKEFILE_OLD) clean $(DEV_NULL)
+	$(PERLRUN) Makefile.PL 
+	$(NOECHO) $(ECHO) "==> Your Makefile has been rebuilt. <=="
+	$(NOECHO) $(ECHO) "==> Please rerun the $(MAKE) command.  <=="
+	$(FALSE)
+
+
+
+# --- MakeMaker staticmake section:
+
+# --- MakeMaker makeaperl section ---
+MAP_TARGET    = perl
+FULLPERL      = /usr/bin/perl
+
+$(MAP_TARGET) :: static $(MAKE_APERL_FILE)
+	$(MAKE) $(USEMAKEFILE) $(MAKE_APERL_FILE) $@
+
+$(MAKE_APERL_FILE) : $(FIRST_MAKEFILE) pm_to_blib
+	$(NOECHO) $(ECHO) Writing \"$(MAKE_APERL_FILE)\" for this $(MAP_TARGET)
+	$(NOECHO) $(PERLRUNINST) \
+		Makefile.PL DIR= \
+		MAKEFILE=$(MAKE_APERL_FILE) LINKTYPE=static \
+		MAKEAPERL=1 NORECURS=1 CCCDLFLAGS=
+
+
+# --- MakeMaker test section:
+
+TEST_VERBOSE=0
+TEST_TYPE=test_$(LINKTYPE)
+TEST_FILE = test.pl
+TEST_FILES = t/*.t
+TESTDB_SW = -d
+
+testdb :: testdb_$(LINKTYPE)
+
+test :: $(TEST_TYPE) subdirs-test
+
+subdirs-test ::
+	$(NOECHO) $(NOOP)
+
+
+test_dynamic :: pure_all
+	PERL_DL_NONLAZY=1 $(FULLPERLRUN) "-MExtUtils::Command::MM" "-e" "test_harness($(TEST_VERBOSE), '$(INST_LIB)', '$(INST_ARCHLIB)')" $(TEST_FILES)
+
+testdb_dynamic :: pure_all
+	PERL_DL_NONLAZY=1 $(FULLPERLRUN) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)
+
+test_ : test_dynamic
+
+test_static :: test_dynamic
+testdb_static :: testdb_dynamic
+
+
+# --- MakeMaker ppd section:
+# Creates a PPD (Perl Package Description) for a binary distribution.
+ppd :
+	$(NOECHO) $(ECHO) '<SOFTPKG NAME="$(DISTNAME)" VERSION="1.06">' > $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    <ABSTRACT>Assembler for NVIDIA Maxwell architecture</ABSTRACT>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    <AUTHOR>Scott Gray &lt;sgray@nervanasys.com&gt;</AUTHOR>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    <IMPLEMENTATION>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <REQUIRE NAME="Carp::" VERSION="1.29" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <REQUIRE NAME="Data::Dumper" VERSION="2.145" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <ARCHITECTURE NAME="x86_64-linux-thread-multi-5.10" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <CODEBASE HREF="" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    </IMPLEMENTATION>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '</SOFTPKG>' >> $(DISTNAME).ppd
+
+
+# --- MakeMaker pm_to_blib section:
+
+pm_to_blib : $(FIRST_MAKEFILE) $(TO_INST_PM)
+	$(NOECHO) $(ABSPERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', q[$(PM_FILTER)], '\''$(PERM_DIR)'\'')' -- \
+	  lib/MaxAs/MaxAs.pm blib/lib/MaxAs/MaxAs.pm \
+	  lib/MaxAs/Cubin.pm blib/lib/MaxAs/Cubin.pm \
+	  lib/MaxAs/MaxAsGrammar.pm blib/lib/MaxAs/MaxAsGrammar.pm 
+	$(NOECHO) $(TOUCH) pm_to_blib
+
+
+# --- MakeMaker selfdocument section:
+
+
+# --- MakeMaker postamble section:
+
+
+# End.
diff --git a/Assembler/MaxAs/Makefile.PL b/Assembler/MaxAs/Makefile.PL
new file mode 100644
index 0000000..4be8ccf
--- /dev/null
+++ b/Assembler/MaxAs/Makefile.PL
@@ -0,0 +1,14 @@
+require 5.10.0;
+use ExtUtils::MakeMaker;
+# Build settings for MaxAs::MaxAs (see ExtUtils::MakeMaker for the available keys).
+# ABSTRACT_FROM and AUTHOR need no 5.005 guard: the require above already demands 5.10.
+my @settings = (
+    NAME          => 'MaxAs::MaxAs',
+    VERSION_FROM  => 'lib/MaxAs/MaxAs.pm',    # finds $VERSION
+    EXE_FILES     => ['bin/maxas.pl'],
+    PREREQ_PM     => {Carp => 1.29, Data::Dumper => 2.145},
+    LICENSE       => 'MIT',
+    ABSTRACT_FROM => 'lib/MaxAs/MaxAs.pm',    # retrieve abstract from module
+    AUTHOR        => 'Scott Gray <sgray@nervanasys.com>',
+);
+WriteMakefile(@settings);
diff --git a/Assembler/MaxAs/README.md b/Assembler/MaxAs/README.md
new file mode 100644
index 0000000..318aba8
--- /dev/null
+++ b/Assembler/MaxAs/README.md
@@ -0,0 +1,28 @@
+# MaxAs
+Assembler for NVIDIA Maxwell architecture
+
+To install (system-wide):
+
+    sudo cpanm git://github.com/NervanaSystems/maxas.git
+
+or
+
+    perl Makefile.PL
+    make
+    sudo make install
+
+
+See wiki pages for more information:
+
+- [Introduction](https://github.com/NervanaSystems/maxas/wiki/Introduction)
+- [Getting Started](https://github.com/NervanaSystems/maxas/wiki/Getting-Started)
+- [Control Codes](https://github.com/NervanaSystems/maxas/wiki/Control-Codes)
+- [SGEMM walkthrough](https://github.com/NervanaSystems/maxas/wiki/SGEMM)
+
+Related work with lots of additional shader assembly (sass) examples:
+
+- [NervanaGPU](https://github.com/NervanaSystems/nervanagpu)
+
+This project is released under the [MIT License](http://opensource.org/licenses/MIT).
+
+-- Scott Gray
diff --git a/Assembler/MaxAs/bin/maxas.pl b/Assembler/MaxAs/bin/maxas.pl
new file mode 100755
index 0000000..55e4241
--- /dev/null
+++ b/Assembler/MaxAs/bin/maxas.pl
@@ -0,0 +1,314 @@
+#!/usr/bin/perl
+use strict;
+use MaxAs::Cubin;
+use MaxAs::MaxAs;
+use Data::Dumper;
+use File::Spec;
+
+require 5.10.0;
+
+$Data::Dumper::Sortkeys = 1;
+
+my $mode = shift;
+
+# Mode -l / --list: print one summary line for the cubin, then one line per
+# kernel and one line per symbol, both in sorted name order.
+if ($mode =~ /^\-?\-l/i)
+{
+    my $path  = shift or usage();
+    my $image = MaxAs::Cubin->new($path);
+
+    # Query everything up front (each call in scalar context) before printing.
+    my $smVersion   = $image->arch;
+    my $machineBits = $image->class;
+    my $addrBits    = $image->address_size;
+    my $kernelTable = $image->listKernels;
+    my $symbolTable = $image->listSymbols;
+    my @kernelFields = qw(Linkage ParamCnt size RegCnt SharedSize BarCnt);
+
+    printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $path, $smVersion, $machineBits, $addrBits;
+
+    for my $name (sort keys %$kernelTable)
+    {
+        my $info = $kernelTable->{$name};
+        printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $name, @{$info}{@kernelFields};
+    }
+    printf "Symbol: %s\n", $_ foreach sort keys %$symbolTable;
+}
+# Test that the assembler can reproduce the op codes this cubin or sass contains
+elsif ($mode =~ /^\-?\-t/i)
+{
+    # Optional -r then -a flags; ternaries avoid "my $x = shift if COND" (undefined behavior per perlsyn).
+    my $reg  = $ARGV[0] =~ /^\-?\-r/i ? shift(@ARGV) : undef;
+    my $all  = $ARGV[0] =~ /^\-?\-a/i ? shift(@ARGV) : undef;
+    my $file = shift or usage();
+    my $fh;
+    # sass file (text): 3-arg open so the name is never parsed as a mode or a pipe
+    if (-T $file)
+    {
+        open $fh, '<', $file or die "$file: $!";
+    }
+    # cubin file: disassemble with cuobjdump and read its output through a pipe
+    else
+    {
+        my $cubin = MaxAs::Cubin->new($file);
+        my $arch  = $cubin->arch;
+        open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!";
+        my $first = <$fh>;
+        if ($first =~ /cuobjdump fatal/)
+        {
+            print $first;
+            exit(1);
+        }
+    }
+    exit(MaxAs::MaxAs::Test($fh, $reg, $all) ? 1 : 0);
+}
+# Extract an asm file containing the desired kernel
+elsif ($mode =~ /^\-?\-e/i)
+{
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    my $cubinFile = shift or usage();
+    my $asmFile   = shift;
+    my $cubin     = MaxAs::Cubin->new($cubinFile);
+    my $arch      = $cubin->arch;
+    my $kernels   = $cubin->listKernels;
+
+    #default the kernel name if not specified.
+    $kernelName ||= (sort keys %$kernels)[0];
+
+    my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName";
+
+    # One command string shared by open and die, so the error reports the real -arch value.
+    my $cmd = "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile";
+    open my $in, "$cmd |" or die "$cmd: $!";
+    my $first = <$in>;
+    if ($first =~ /cuobjdump fatal/)
+    {
+        print $first;
+        exit(1);
+    }
+    # Write to the named asm file if one was given, otherwise to STDOUT.
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+
+    # Header comments describing the kernel, followed by the extracted instructions.
+    print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n";
+    print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt));
+    print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n";
+    print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params};
+    print $out "#\n# Instructions:\n\n";
+
+    MaxAs::MaxAs::Extract($in, $out, $kernel->{Params});
+
+    close $out if $asmFile;
+    close $in;
+}
+# Extract a kernel from a sass dump
+elsif ($mode =~ /^\-?\-s/i)
+{
+    my $sassFile  = shift or usage();
+    my $asmFile   = shift;
+
+    # 3-arg opens: the file names are used verbatim, never parsed as a mode or a pipe.
+    open my $in, '<', $sassFile or die "$sassFile: $!";
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+
+    # The third argument is an empty list here (the cubin extract mode passes the kernel's Params).
+    MaxAs::MaxAs::Extract($in, $out, []);
+    close $out if $asmFile;
+    close $in;
+}
+# Insert the kernel asm back into the cubin:
+elsif ($mode =~ /^\-?\-i/i)
+{
+    my $nowarn;
+    if ($ARGV[0] =~ /^\-?\-w/i)
+    {
+        $nowarn = shift;
+    }
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    # Ternary rather than "my $x = shift if COND", whose behavior perlsyn documents as undefined.
+    my $noReuse = $ARGV[0] =~ /^\-?\-n/i ? shift(@ARGV) : undef;
+    # Each "-DNAME VALUE" pair sets $MaxAs::MaxAs::CODE::NAME to VALUE.
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift;
+        # NOTE(review): string eval of a command-line value; a quote in VALUE breaks or alters the code.
+        eval "package MaxAs::MaxAs::CODE; our \$$name = '$value';";
+        warn "-D$name: $@" if $@;
+    }
+
+    my $asmFile   = shift or usage();
+    my $cubinFile = shift or usage();
+    my $newCubin  = shift || $cubinFile;
+
+    my $file;
+    if (open my $fh, '<', $asmFile)
+    {
+        local $/;
+        $file = <$fh>;
+        close $fh;
+    }
+    else { die "$asmFile: $!" }
+
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    # extract the kernel name from the file
+    ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName;
+    die "asm file missing kernel name or is badly formatted" unless $kernelName;
+
+    my $kernel = MaxAs::MaxAs::Assemble($file, $include, !$noReuse, $nowarn);
+    my $cubin  = MaxAs::Cubin->new($cubinFile);
+    $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName";
+    $cubin->modifyKernel(%$kernel);
+    $cubin->write($newCubin);
+    # The kernel name goes through %s and the literal percent sign is written %%, so the format is always valid.
+    printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n",
+        $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)};
+}
+# Preprocessing:
+elsif ($mode =~ /^\-?\-p/i)
+{
+    # Any number of -D<name> <value> pairs may precede the other arguments.
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift;
+        $value = '' unless defined $value;
+
+        # Set $MaxAs::MaxAs::CODE::<name> through a symbolic reference instead of a
+        # string eval, so a value containing quotes or Perl code is stored verbatim
+        # rather than being executed.
+        no strict 'refs';
+        ${"MaxAs::MaxAs::CODE::$name"} = $value;
+    }
+    # Declare first, assign conditionally: "my $x = ... if COND" has undefined
+    # behaviour in Perl and must not be relied upon.
+    my $debug;
+    $debug = shift if $ARGV[0] =~ /^\-?\-d/i;
+    my $asmFile   = shift or usage();
+    my $asmFile2  = shift;
+
+    die "source and destination probably shouldn't be the same file\n"
+        if defined $asmFile2 && $asmFile eq $asmFile2;
+
+    # Three-argument open: the file name comes from the command line and must not
+    # be able to select the open mode.
+    open my $fh, '<', $asmFile or die "$asmFile: $!";
+    # Slurp mode.  As before, $/ stays undefined for the remainder of this branch,
+    # including the call to Preprocess below.
+    local $/;
+    my $file = <$fh>;
+    close $fh;
+
+    # Volume and directory of the asm file are handed to the preprocessor as $include.
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    # Write to the second file when one was given, otherwise to STDOUT.
+    if ($asmFile2)
+    {
+        open $fh, '>', $asmFile2 or die "$asmFile2: $!";
+    }
+    else
+    {
+        $fh = \*STDOUT;
+    }
+    print $fh MaxAs::MaxAs::Preprocess($file, $include, $debug);
+    close $fh;
+}
+#Analyzing
+elsif ($mode =~ /^\-?\-a/i)
+{
+    # Any number of -D<name> <value> pairs may precede the other arguments.
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift;
+        $value = '' unless defined $value;
+
+        # Set $MaxAs::MaxAs::CODE::<name> through a symbolic reference instead of a
+        # string eval, so a value containing quotes or Perl code is stored verbatim
+        # rather than being executed.
+        no strict 'refs';
+        ${"MaxAs::MaxAs::CODE::$name"} = $value;
+    }
+    # A repeated --analyze switch is consumed and ignored; its value was never used.
+    shift if $ARGV[0] =~ /^\-?\-analyze/i;
+    my $config    = shift or usage();
+    my $asmFile   = shift or usage();
+
+    # Three-argument open: the file name comes from the command line and must not
+    # be able to select the open mode.
+    open my $fh, '<', $asmFile or die "$asmFile: $!";
+    # Slurp mode.  As before, $/ stays undefined for the remainder of this branch,
+    # including the Occupancy and Analyze calls below.
+    local $/;
+    my $file = <$fh>;
+    close $fh;
+
+    # Volume and directory of the asm file are handed to the analyzer as $include.
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    MaxAs::MaxAs::Occupancy($config);
+    MaxAs::MaxAs::Analyze($file, $include);
+}
+# get version information
+elsif ($mode =~ /^\-?\-v/i)
+{
+    # Report the version string declared by the MaxAs::MaxAs module.
+    print "$MaxAs::MaxAs::VERSION\n";
+}
+else
+{
+    # Unrecognised mode: echo what was given, then show the help text.
+    # usage() terminates the process with exit status 1.
+    print "$mode\n";
+    usage();
+}
+
+exit(0);
+
+# Print the command-line help text to STDOUT and terminate with exit status 1.
+# Takes no arguments and never returns.
+sub usage
+{
+    print <<EOF;
+Usage:
+
+  List kernels and symbols:
+
+    maxas.pl --list|-l <cubin_file>
+
+  Test a cubin or sass file to see if the assembler can reproduce all of the contained opcodes.
+  Also useful for extending the missing grammar rules.  Defaults to only showing failures without --all.
+  With the --reg flag it will show register bank conflicts not hidden by reuse flags.
+
+    maxas.pl --test|-t [--reg|-r] [--all|-a] <cubin_file | cuobjdump_sass_file>
+
+  Extract a single kernel into an asm file from a cubin.
+  Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin.
+
+    maxas.pl --extract|-e [--kernel|-k kernel_name] <cubin_file> [asm_file]
+
+  Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes.
+  Include the debug flag to print out detailed scheduler info.
+
+    maxas.pl --pre|-p [--debug|-d] <asm_file> [new_asm_file]
+
+  Insert the kernel asm back into the cubin.  Overwrite existing or create new cubin.
+  Optionally you can skip register reuse flag auto insertion.  This allows you to observe
+  performance without any reuse or you can use it to set the flags manually in your sass.
+
+    maxas.pl --insert|-i [--noreuse|-n] <asm_file> <cubin_file> [new_cubin_file]
+
+  Analyze each block in the assembly code. Specify each instruction's efficiency, predict a block's
+  running cycles, and point out code bottlenecks.
+
+    maxas.pl --analyze|-a <config_file> <asm_file> [result_file]
+
+  Display version information and exit:
+
+    maxas.pl --version|-v
+
+EOF
+    exit(1);
+}
+
+__END__
diff --git a/Assembler/MaxAs/blib/arch/.exists b/Assembler/MaxAs/blib/arch/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/MaxAs/blib/arch/auto/MaxAs/MaxAs/.exists b/Assembler/MaxAs/blib/arch/auto/MaxAs/MaxAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/MaxAs/blib/bin/.exists b/Assembler/MaxAs/blib/bin/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/MaxAs/blib/lib/MaxAs/.exists b/Assembler/MaxAs/blib/lib/MaxAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/MaxAs/blib/lib/MaxAs/Cubin.pm b/Assembler/MaxAs/blib/lib/MaxAs/Cubin.pm
new file mode 100644
index 0000000..5900958
--- /dev/null
+++ b/Assembler/MaxAs/blib/lib/MaxAs/Cubin.pm
@@ -0,0 +1,684 @@
+package MaxAs::Cubin;
+
+use strict;
+use Data::Dumper;
+
+# ELF structure layouts.  Each list below alternates a pack/unpack template code
+# with the hash key the unpacked value is stored under.  Template codes used here
+# (see perlfunc pack): H<n> = hex string of n nybbles, C = unsigned 8 bit,
+# S = unsigned 16 bit, L = unsigned 32 bit, Q = unsigned 64 bit.
+# Comments must stay outside the qw() lists, since qw would treat them as data.
+
+# ELF file header, 32-bit class.
+my @Elf32_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    L   entry
+    L   phOffset
+    L   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+# ELF file header, 64-bit class: entry, phOffset and shOffset widen to 64 bits.
+my @Elf64_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    Q   entry
+    Q   phOffset
+    Q   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+# Program header entry, 32-bit class.
+my @Elf32_PrgHdr = qw(
+    L   type
+    L   offset
+    L   vaddr
+    L   paddr
+    L   fileSize
+    L   memSize
+    L   flags
+    L   align
+);
+# Program header entry, 64-bit class.  Note that flags moves up to second position.
+my @Elf64_PrgHdr = qw(
+    L   type
+    L   flags
+    Q   offset
+    Q   vaddr
+    Q   paddr
+    Q   fileSize
+    Q   memSize
+    Q   align
+);
+# Section header entry, 32-bit class.
+my @Elf32_SecHdr = qw(
+    L   name
+    L   type
+    L   flags
+    L   addr
+    L   offset
+    L   size
+    L   link
+    L   info
+    L   align
+    L   entSize
+);
+# Section header entry, 64-bit class.
+my @Elf64_SecHdr = qw(
+    L   name
+    L   type
+    Q   flags
+    Q   addr
+    Q   offset
+    Q   size
+    L   link
+    L   info
+    Q   align
+    Q   entSize
+);
+# Symbol table entry, 32-bit class.
+my @Elf32_SymEnt = qw(
+    L   name
+    L   value
+    L   size
+    C   info
+    C   other
+    S   shIndx
+);
+# Symbol table entry, 64-bit class.  Field order differs from the 32-bit entry.
+my @Elf64_SymEnt = qw(
+    L   name
+    C   info
+    C   other
+    S   shIndx
+    Q   value
+    Q   size
+);
+# Names for symbol binding values 0, 1 and 2.
+my @symBind = qw(LOCAL GLOBAL WEAK);
+
+# Derive, for every ELF structure, a pack/unpack template string (T) and the matching
+# list of hash keys (C).  Index 1 holds the 32-bit layout and index 2 the 64-bit layout,
+# so the arrays can be indexed directly by the ELF header's fileClass value.
+my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC);
+
+foreach my $layout (
+    [ \@elfHdrT, \@elfHdrC, \@Elf32_Hdr,    \@Elf64_Hdr    ],
+    [ \@prgHdrT, \@prgHdrC, \@Elf32_PrgHdr, \@Elf64_PrgHdr ],
+    [ \@secHdrT, \@secHdrC, \@Elf32_SecHdr, \@Elf64_SecHdr ],
+    [ \@symHdrT, \@symHdrC, \@Elf32_SymEnt, \@Elf64_SymEnt ],
+)
+{
+    my ($templates, $columns, @defs) = @$layout;
+
+    foreach my $class (1, 2)
+    {
+        my @def = @{ $defs[$class - 1] };
+
+        # Template codes are at most three characters long; field names are longer.
+        $templates->[$class] = join '', grep { length($_) <= 3 } @def;
+        $columns->[$class]   = [ grep { length($_) > 3 } @def ];
+    }
+}
+
+# Load a cubin ELF file
+# Load a cubin ELF file.
+#
+# Arguments:
+#   $package - class name the new object is blessed into
+#   $file    - path of the cubin to read
+#
+# Returns a blessed hash holding, among others:
+#   fileName, Class (32|64), Arch, AddressSize
+#   elfHdr   - the unpacked ELF header
+#   prgHdrs  - array of unpacked program headers
+#   secHdrs  - array of unpacked section headers (each also stored under its section name)
+#   Kernels  - section headers of FUNC symbols, keyed by symbol name
+#   Symbols  - remaining symbol entries with the GLOBAL binding bit set, keyed by name
+#
+# Dies if the file cannot be opened or its flags report an architecture below sm_50.
+sub new
+{
+    my ($package, $file) = @_;
+
+    my $cubin = bless { fileName => $file }, $package;
+
+    # Three-argument open so that characters in the caller-supplied path can never
+    # be interpreted as an open mode.
+    open my $fh, '<', $file or die "$file: $!";
+    binmode($fh);
+
+    # Read in assuming 32 bit header
+    my $data;
+    read $fh, $data, 0x34;
+    my $elfHdr = $cubin->{elfHdr} = {};
+    @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data;
+
+    # 1: 32bit, 2: 64bit
+    my $class = $elfHdr->{fileClass};
+
+    # re-read in with 64 bit header if needed
+    if ($class == 2)
+    {
+        seek $fh, 0, 0;
+        # The 64-bit template consumes 0x40 bytes; the extra bytes read here are
+        # simply ignored by unpack.
+        read $fh, $data, 0x46;
+        @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data;
+
+        $cubin->{Class} = 64;
+    }
+    else
+    {
+        $cubin->{Class} = 32;
+    }
+
+    # verify sm_50 cubin
+    $cubin->{Arch} = $elfHdr->{flags} & 0xFF;
+    die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50;
+
+    $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32;
+
+    # Read in Program Headers
+    seek $fh, $elfHdr->{phOffset}, 0;
+    foreach (1 .. $elfHdr->{phNum})
+    {
+        read $fh, $data, $elfHdr->{phEntSize};
+
+        my %prgHdr = (Indx => $_ - 1);
+        @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data;
+        push @{$cubin->{prgHdrs}}, \%prgHdr;
+    }
+
+    # Read in Section Headers
+    seek $fh, $elfHdr->{shOffset}, 0;
+    foreach (1 .. $elfHdr->{shNum})
+    {
+        read $fh, $data, $elfHdr->{shEntSize};
+
+        my %secHdr = (Indx => $_ - 1);
+        @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data;
+        push @{$cubin->{secHdrs}}, \%secHdr;
+    }
+
+    # Read in Section data
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $data = '';
+        # Skip sections with no data (type NULL or NOBITS)
+        if ($secHdr->{size} && $secHdr->{type} != 8)
+        {
+            seek $fh, $secHdr->{offset}, 0;
+            read $fh, $data, $secHdr->{size};
+        }
+        # Convert string tables to maps of byte offset => string
+        if ($secHdr->{type} == 3) # STRTAB
+        {
+            my $strTab = $secHdr->{StrTab} = {};
+            my $indx   = 0;
+            foreach my $str (split "\0", $data)
+            {
+                $strTab->{$indx} = $str;
+                # skip over the string and its terminating NUL
+                $indx += 1 + length($str);
+            }
+        }
+        # Read in Symbol data
+        if ($secHdr->{type} == 2) # SYMTAB
+        {
+            my $offset = 0;
+            while ($offset < $secHdr->{size})
+            {
+                my $symEnt = {};
+                @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize});
+                $offset += $secHdr->{entSize};
+
+                push @{$secHdr->{SymTab}}, $symEnt;
+            }
+        }
+        # Cache raw data (as a hex string) for further processing and writing
+        $secHdr->{Data} = unpack 'H*', $data;
+    }
+    close $fh;
+
+    # Update section headers with their names.  Map names directly to headers.
+    my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab};
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $secHdr->{Name} = $shStrTab->{$secHdr->{name}};
+        $cubin->{$secHdr->{Name}} = $secHdr;
+    }
+
+    # Update symbols with their names
+    # For the Global functions, extract kernel meta data
+    # Populate the kernel hash
+    my $strTab = $cubin->{'.strtab'}{StrTab};
+    foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}})
+    {
+        $symEnt->{Name} = $strTab->{$symEnt->{name}};
+
+        # Attach symbol to section
+        my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}];
+        $secHdr->{SymbolEnt} = $symEnt;
+
+        # Look for symbols tagged FUNC
+        if (($symEnt->{info} & 0x0f) == 0x02)
+        {
+            # Create a hash of kernels for output
+            my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr;
+
+            # Extract local/global/weak binding info
+            $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4];
+
+            # Extract the kernel instructions as 64-bit words
+            $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ];
+
+            # Extract the max barrier resource identifier used and add 1. Should be 0-16.
+            # If a register is used as a barrier resource id, then this value is the max of 16.
+            $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20;
+
+            # Extract the number of allocated registers for this kernel.
+            $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24;
+
+            # Extract the size of shared memory this kernel uses.
+            my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"};
+            $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0;
+
+            # Attach constant0 section
+            $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"};
+
+            # Extract the kernel parameter data.
+            my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"};
+            if ($paramSec)
+            {
+                # Extract raw param data as 32-bit words
+                my @data = unpack "L*", pack "H*", $paramSec->{Data};
+
+                $paramSec->{ParamData} = \@data;
+                $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ];
+
+                # Find the first param delimiter
+                # NOTE(review): if the delimiter is absent, $idx runs to the end of
+                # @data and the reads below go past it - confirm this cannot happen.
+                my $idx = 0;
+                $idx++ while $idx < @data && $data[$idx] != 0x00080a04;
+
+                my $first = $data[$idx+2] & 0xFFFF;
+                #my $size  = $data[$idx+2] >> 16;
+                $idx += 4;
+
+                my @params;
+                while ($idx < @data && $data[$idx] == 0x000c1704)
+                {
+                    # Get the ordinal, offset, size and pointer alignment for each param
+                    my $ord    = $data[$idx+2] & 0xFFFF;
+                    my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16);
+                    my $psize  = $data[$idx+3] >> 18;
+                    my $align  = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0;
+                    # entries are prepended, so @params ends up in reverse file order
+                    unshift @params, "$ord:$offset:$psize:$align";
+                    $idx += 4;
+                }
+                # Everything up to here is kept verbatim and written back unchanged.
+                my @staticParams = @data[0 .. ($idx-1)];
+
+                # The remaining words are attributes: low 16 bits are the code,
+                # high 16 bits a size (in bytes) or an inline value.
+                my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize);
+                while ($idx < @data)
+                {
+                    my $code = $data[$idx] & 0xffff;
+                    my $size = $data[$idx] >> 16;
+                    $idx++;
+
+                    # EIATTR_MAXREG_COUNT
+                    if ($code == 0x1b03)
+                    {
+                        $maxregCount = $size;
+                    }
+                    # EIATTR_S2RCTAID_INSTR_OFFSETS
+                    elsif ($code == 0x1d04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @ctaidOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_EXIT_INSTR_OFFSETS
+                    elsif ($code == 0x1c04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @exitOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_CTAIDZ_USED
+                    elsif ($code == 0x0401)
+                    {
+                        $ctaidzUsed = 1;
+                    }
+                    # EIATTR_REQNTID
+                    elsif ($code == 0x1004)
+                    {
+                        while ($size > 0)
+                        {
+                            push @reqntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_MAX_THREADS
+                    elsif ($code == 0x0504)
+                    {
+                        while ($size > 0)
+                        {
+                            push @maxntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_CRS_STACK_SIZE
+                    elsif ($code == 0x1e04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @stackSize, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    else
+                    {
+                        # NOTE(review): the payload of an unknown attribute is not
+                        # skipped, so following words may be misread as codes.
+                        printf STDERR "Unknown Code 0x%02x (size:%d)\n", $code, $size;
+                    }
+                }
+                $kernelSec->{Params}   = \@params;
+                $kernelSec->{ParamCnt} = scalar @params;
+
+                $paramSec->{StaticParams} = \@staticParams;
+                $paramSec->{MAXREG_COUNT} = $maxregCount;
+                $paramSec->{ExitOffsets}  = \@exitOffsets;
+                $paramSec->{CTAIDOffsets} = \@ctaidOffsets;
+                $paramSec->{CTAIDZUsed}   = $ctaidzUsed;
+                $paramSec->{REQNTID}      = \@reqntid;
+                $paramSec->{MAXNTID}      = \@maxntid;
+                $paramSec->{STACKSIZE}    = \@stackSize;
+            }
+        }
+        # Note GLOBALs found in this cubin
+        elsif (($symEnt->{info} & 0x10) == 0x10)
+        {
+            $cubin->{Symbols}{$symEnt->{Name}} = $symEnt;
+        }
+    }
+
+    return $cubin;
+}
+# Accessor: ELF class of the loaded cubin as stored in the Class field.
+sub class
+{
+    my ($cubin) = @_;
+    return $cubin->{Class};
+}
+# Accessor: value stored in the object's Arch field.
+sub arch
+{
+    my ($cubin) = @_;
+    return $cubin->{Arch};
+}
+# Accessor: value stored in the object's AddressSize field.
+sub address_size
+{
+    my ($cubin) = @_;
+    return $cubin->{AddressSize};
+}
+# Accessor: the hash ref stored in the Kernels field (kernel name => section header).
+sub listKernels
+{
+    my ($cubin) = @_;
+    return $cubin->{Kernels};
+}
+# Accessor: the hash ref stored in the Symbols field (symbol name => symbol entry).
+sub listSymbols
+{
+    my ($cubin) = @_;
+    return $cubin->{Symbols};
+}
+# Look up one kernel's section header by name.
+# Returns the entry stored under Kernels for $kernel, or undef if there is none.
+sub getKernel
+{
+    my $self = shift;
+    my $name = shift;
+    return $self->{Kernels}{$name};
+}
+
+# Replace a kernel's instructions and refresh the metadata that depends on them.
+#
+# Named arguments (passed as a flat hash after the object):
+#   Kernel       - the kernel's section header, as returned by getKernel
+#   RegCnt       - new register count (at most 255)
+#   BarCnt       - new barrier count (at most 16)
+#   ExitOffsets  - array ref of exit instruction offsets
+#   CTAIDOffsets - array ref of CTAID instruction offsets
+#   CTAIDZUsed   - true if CTAID.Z is used
+#   KernelData   - array ref of 64-bit instruction words
+#
+# Rebuilds the kernel's param section data, prints a line for every value that
+# changed, and calls updateSize when the param section or kernel size differs.
+sub modifyKernel
+{
+    my ($cubin, %params) = @_;
+
+    # Pull the named arguments out of the parameter hash in one slice.
+    my ($kernelSec, $newReg, $newBar, $exitOffsets, $ctaidOffsets, $ctaidzUsed, $newData) =
+        @params{qw(Kernel RegCnt BarCnt ExitOffsets CTAIDOffsets CTAIDZUsed KernelData)};
+
+    # each instruction word occupies 8 bytes
+    my $newSize = 8 * @$newData;
+
+    die "255 register max" if $newReg > 255;
+    die "new kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63;
+    die "16 is max barrier count" if $newBar > 16;
+
+    my $paramSec    = $kernelSec->{ParamSec};
+    my $kernelName  = $kernelSec->{SymbolEnt}{Name};
+    my $maxregCount = $paramSec->{MAXREG_COUNT};
+    my $stackSize   = $paramSec->{STACKSIZE};
+
+    # install the new instructions, both as words and as the cached hex string
+    $kernelSec->{KernelData} = $newData;
+    $kernelSec->{Data}       = unpack "H*", pack "Q*", @$newData;
+
+    # register count lives in the top byte of the section's info field
+    unless ($newReg == $kernelSec->{RegCnt})
+    {
+        print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n";
+        $kernelSec->{RegCnt} = $newReg;
+        $kernelSec->{info}   = ($kernelSec->{info} & ~0xff000000) | ($newReg << 24);
+    }
+    # barrier count lives in bits 20-24 of the section's flags field
+    unless ($newBar == $kernelSec->{BarCnt})
+    {
+        print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n";
+        $kernelSec->{BarCnt} = $newBar;
+        $kernelSec->{flags}  = ($kernelSec->{flags} & ~0x01f00000) | ($newBar << 20);
+    }
+
+    # Start from the untouched leading words, then re-append each attribute.
+    # A list attribute is a header word (byte size << 16 | code) followed by its values;
+    # count << 18 is the same as (count * 4) << 16.
+    my @paramData = @{$paramSec->{StaticParams}};
+
+    push @paramData, ($maxregCount << 16) | 0x1b03 if defined $maxregCount;
+
+    my $newCTAIDs = join(',', map(sprintf('%04x', $_), @$ctaidOffsets));
+    my $oldCTAIDs = join(',', map(sprintf('%04x', $_), @{$paramSec->{CTAIDOffsets}}));
+
+    print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n"
+        if $newCTAIDs ne $oldCTAIDs;
+
+    push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04, @$ctaidOffsets
+        if @$ctaidOffsets;
+
+    my $newExits = join(',', map(sprintf('%04x', $_), @$exitOffsets));
+    my $oldExits = join(',', map(sprintf('%04x', $_), @{$paramSec->{ExitOffsets}}));
+
+    print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n"
+        if $newExits ne $oldExits;
+
+    push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04, @$exitOffsets
+        if @$exitOffsets;
+
+    print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n"
+        if $ctaidzUsed != $paramSec->{CTAIDZUsed};
+
+    push @paramData, 0x0401 if $ctaidzUsed;
+
+    # These three are carried over unchanged from the original param section.
+    if (@{$paramSec->{REQNTID}})
+    {
+        push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004, @{$paramSec->{REQNTID}};
+    }
+    if (@{$paramSec->{MAXNTID}})
+    {
+        push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504, @{$paramSec->{MAXNTID}};
+    }
+    if (@$stackSize)
+    {
+        push @paramData, (scalar(@$stackSize) << 18) | 0x1e04, @$stackSize;
+    }
+
+    # store the rebuilt param section and propagate any size changes
+    my $newParamSize  = 4 * @paramData;
+    $paramSec->{Data} = unpack "H*", pack "L*", @paramData;
+    unless ($newParamSize == $paramSec->{size})
+    {
+        print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n";
+        $cubin->updateSize($paramSec, $newParamSize);
+    }
+
+    unless ($newSize == $kernelSec->{size})
+    {
+        print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n";
+        $cubin->updateSize($kernelSec, $newSize, 1);
+    }
+}
+
+# Record a new size for one section and recompute every file offset that follows from it.
+#
+# Arguments:
+#   $cubin         - the cubin object
+#   $sec           - section header hash whose size is changing
+#   $newSize       - the section's new size in bytes
+#   $updatePrgSize - when true, a type 1 program header that covers $sec also has
+#                    its fileSize and memSize adjusted by the size difference
+#
+# Updates in place: the section's size, the symbol table data (if the section has an
+# attached symbol), all section offsets, the ELF header's shOffset/phOffset and the
+# program header offsets.  Returns nothing useful.
+sub updateSize
+{
+    my ($cubin, $sec, $newSize, $updatePrgSize) = @_;
+
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+
+    # update section header
+    my $delta = $newSize - $sec->{size};
+    $sec->{size} = $newSize;
+
+    # update symtab section
+    if ($sec->{SymbolEnt})
+    {
+        $sec->{SymbolEnt}{size} = $newSize;
+        # re-serialise the whole symbol table so the cached hex data carries the new size
+        my $symSection = $cubin->{'.symtab'};
+        $symSection->{Data} = '';
+        foreach my $symEnt (@{$symSection->{SymTab}})
+        {
+            $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}};
+        }
+    }
+
+    # section data is laid out directly after the ELF header
+    my $pos = $elfHdr->{ehSize};
+    # old file offset => new file offset, used below to relocate the program headers
+    my %sizeMap;
+
+    # update section header offsets
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        # skip first header
+        next if $secHdr->{align} == 0;
+
+        # NOBITS data sections are size 0
+        my $size = $secHdr->{type} == 8 ? 0 : $secHdr->{size};
+
+        # Add any needed padding between sections
+        my $pad = $pos % $secHdr->{align};
+        if ($pad > 0)
+        {
+            $pos += $secHdr->{align} - $pad;
+        }
+        # map old offset to new
+        $sizeMap{$secHdr->{offset}} = $pos;
+
+        # update offset
+        $secHdr->{offset} = $pos;
+
+        # advance position by size
+        $pos += $size;
+    }
+
+    # compute total section header size
+    # NOTE(review): this assumes the program headers directly follow the section
+    # headers in the file - confirm for every cubin this is used on.
+    my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset};
+
+    # map old offset to new
+    $sizeMap{$elfHdr->{shOffset}} = $pos;
+    $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize;
+
+    $elfHdr->{shOffset} = $pos;
+    $elfHdr->{phOffset} = $pos + $shSize;
+
+    # update program header offsets and sizes
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        # Not sure how best to adjust these so just assume they'll track other offsets.
+        # NOTE(review): an offset that is not a key of %sizeMap becomes undef here.
+        $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}};
+
+        # If the kernel sizes changes, also update the associated ProgramHeader.
+        # Note that this size is the kernel size plus any constant section sizes.
+        if ($updatePrgSize && $prgHdr->{type} == 1 &&
+            $sec->{offset} >= $prgHdr->{offset} &&
+            $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta)
+        {
+            $prgHdr->{fileSize} += $delta;
+            $prgHdr->{memSize}  += $delta;
+        }
+    }
+}
+
+# Write out the cubin after modifying it.
+# Write the (possibly modified) cubin back out as an ELF file.
+#
+# Arguments:
+#   $cubin - the cubin object
+#   $file  - path of the file to create or overwrite
+#
+# Output order: ELF header, section data (padded to each section's alignment),
+# section headers, program headers.  Dies if the file cannot be opened or if the
+# final close reports a write error.
+sub write
+{
+    my ($cubin, $file) = @_;
+
+    # Three-argument open so that characters in the caller-supplied path can never
+    # be interpreted as an open mode.
+    open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
+    binmode($fh);
+
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+
+    # write elf header
+    print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};
+    my $pos = $elfHdr->{ehSize};
+
+    # write section data
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        # Skip NULL and NOBITS data sections
+        next if $secHdr->{size} == 0 || $secHdr->{type} == 8;
+
+        # Add any needed padding between sections
+        my $pad = $pos % $secHdr->{align};
+        if ($pad > 0)
+        {
+            $pad = $secHdr->{align} - $pad;
+            print $fh "\0" x $pad;
+            $pos += $pad;
+        }
+
+        # section data is cached as a hex string; convert back to raw bytes
+        print $fh pack 'H*', $secHdr->{Data};
+        $pos += $secHdr->{size};
+    }
+
+    # write section headers
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
+    }
+
+    #write program headers
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
+    }
+
+    # Buffered write errors (e.g. a full disk) only surface at close time.
+    close $fh or die "Error: could not finish writing $file: $!";
+}
+
+__END__
+
diff --git a/Assembler/MaxAs/blib/lib/MaxAs/MaxAs.pm b/Assembler/MaxAs/blib/lib/MaxAs/MaxAs.pm
new file mode 100644
index 0000000..f421cf3
--- /dev/null
+++ b/Assembler/MaxAs/blib/lib/MaxAs/MaxAs.pm
@@ -0,0 +1,1407 @@
+package MaxAs::MaxAs;
+
+require 5.10.0;
+
+use strict;
+use Data::Dumper;
+use MaxAs::MaxAsGrammar;
+use File::Spec;
+use Carp;
+
+our $VERSION = '1.06';
+
+# these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump
+my %relOffset  = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT);
+
+# these ops use absolute addresses
+my %absOffset  = map { $_ => 1 } qw(JCAL);
+
+my %jumpOp     = (%relOffset, %absOffset);
+
+# These instructions use r0 but do not write to r0
+my %noDest     = map { $_ => 1 } qw(ST STG STS STL RED);
+
+# Map register slots to reuse control codes
+my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4);
+
+# Preprocess and Assemble a source file
+sub Assemble
+{
+    my ($file, $include, $doReuse, $nowarn) = @_;
+
+    my $regMap = {};
+    $file = Preprocess($file, $include, 0, $regMap);
+    my $vectors = delete $regMap->{__vectors};
+    my $regBank = delete $regMap->{__regbank};
+
+    # initialize cubin counts
+    my $regCnt = 0;
+    my $barCnt = 0;
+
+    my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse);
+
+    # initialize the first control instruction
+    push @instructs, $ctrl = {};
+
+    foreach my $line (split "\n", $file)
+    {
+        # keep track of line nums in the physical file
+        $lineNum++;
+
+        next unless preProcessLine($line);
+
+        # match an instruction
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # Save us from crashing the display driver
+            die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n"
+                if exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0;
+
+            # track branches/jumps/calls/etc for label remapping
+            push @branches, @instructs+0 if exists $jumpOp{$inst->{op}};
+
+            # push the control code onto the control instruction
+            push @{$ctrl->{ctrl}}, $inst->{ctrl};
+
+            # now point the instruction to its associated control instruction
+            $inst->{ctrl} = $ctrl;
+
+            # add the op name and full instruction text
+            push @instructs, $inst;
+
+            # add a 4th control instruction for every 3 instructions
+            push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
+        }
+        # match a label
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            # map the label name to the index of the instruction about to be inserted
+            $labels{$1} = @instructs+0;
+        }
+        else
+        {
+            die "badly formed line at $lineNum: $line\n";
+        }
+    }
+    # add the final BRA op and align the number of instructions to a multiple of 8
+    push @{$ctrl->{ctrl}}, 0x007ff;
+    push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' };
+    while (@instructs & 7)
+    {
+        push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
+        push @{$ctrl->{ctrl}}, 0x007e0;
+        push @instructs, { op => 'NOP', inst => 'NOP;' };
+    }
+
+    # remap labels
+    foreach my $i (@branches)
+    {
+        if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1})
+            { die "instruction has invalid label: $instructs[$i]{inst}"; }
+
+        $instructs[$i]{jump} = $labels{$1};
+
+        if (exists $relOffset{$instructs[$i]{op}})
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; }
+        else
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; }
+    }
+
+    # calculate optimal register reuse
+    # This effects register bank decisions so do it before analyzing register use
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            if ($doReuse)
+            {
+                # get any vector registers for r0
+                my @r0 = getVecRegisters($vectors, $capData);
+
+                # There are 2 reuse slots per register slot
+                # The reuse hash points to most recent instruction index where register was last used in this slot
+
+                # For writes to a register, clear any reuse opportunity
+                if (@r0 && !exists $noDest{$op})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        if (my $reuse = $reuse{$slot})
+                        {
+                            # if writing with a vector op, clear all linked registers
+                            delete $reuse->{$_} foreach @r0;
+                        }
+                    }
+                }
+                # clear cache if jumping elsewhere
+                %reuse = () if exists $jumpOp{$op};
+
+                # only track register reuse for instruction types this works with
+                if ($gram->{type}{reuse})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        next unless exists $capData->{$slot};
+
+                        my $r = $capData->{$slot};
+                        next if $r eq 'RZ';
+                        next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction
+
+                        my $reuse = $reuse{$slot} ||= {};
+
+                        # if this register was previously marked for potential reuse
+                        if (my $p = $reuse->{$r})
+                        {
+                            # flag the previous instruction's ctrl reuse array slot
+                            $instructs[$p]{ctrl}{reuse}[($p & 3) - 1] |= $reuseSlots{$slot};
+
+                            #print "reuse $slot $r $instructs[$p]{inst}\n";
+                        }
+                        # list full, delete the oldest
+                        elsif (keys %$reuse > 2)
+                        {
+                            my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0];
+                            delete $reuse->{$oldest};
+                        }
+                        # mark the new instruction for potential reuse
+                        $reuse->{$r} = $i;
+                    }
+                }
+            }
+            # if reuse is disabled then pull value from code.
+            elsif ($gram->{type}{reuse})
+            {
+                $ctrl->{reuse}[($i & 3) - 1] = genReuseCode($capData);
+            }
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    # Assign registers to requested banks if possible
+    foreach my $r (sort keys %$regBank)
+    {
+        my $bank  = $regBank->{$r};
+        my $avail = $regMap->{$r};
+        foreach my $pos (0 .. $#$avail)
+        {
+            if ($bank == ($avail->[$pos] & 3))
+            {
+                # assign it, while removing the assigned register from the pool
+                $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+                last;
+            }
+        }
+    }
+
+    # calculate register live times and preferred banks for non-fixed registers.
+    # LiveTime only half implemented...
+    my (%liveTime, %pairedBanks, %reuseHistory);
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData   = parseInstruct($inst, $gram) or next;
+            my $reuseType = $gram->{type}{reuse};
+
+            # liveTimes and bank conflicts with source operands
+            my (%addReuse, %delReuse);
+            foreach my $slot (qw(r8 r20 r39))
+            {
+                my $r = $capData->{$slot} or next;
+                next if $r eq 'RZ';
+
+                my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r};
+
+                # All registers should be written prior to being read..
+                if (my $liveTime = $liveTime{$liveR})
+                {
+                    # for each read set the current instruction index as the high value
+                    $liveTime->[$#$liveTime][1] = $i;
+                    push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                }
+                else
+                {
+                    warn "register used without initialization ($r): $inst\n" unless $nowarn;
+                    push @{$liveTime{$liveR}}, [$i,$i];
+                }
+
+                # Is this register active in the reuse cache?
+                my $slotHist  = $reuseHistory{$slot} ||= {};
+                my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0;
+
+                #print "IADD3-1: $slot:$r (!$selfReuse && $regMap->{$r})\n" if $op eq 'IADD3';
+
+                # If this is an auto reg, look at the open banks.
+                # No need to look at banks if this register is in the reuse cache.
+                if (!$selfReuse && ref $regMap->{$r})
+                {
+                    # Look at other source operands in this instruction and flag what banks are being used
+                    foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39))
+                    {
+                        my $r2 = $capData->{$slot2};
+                        next if $r2 eq 'RZ' || $r2 eq $r;
+
+                        my $slotHist2 = $reuseHistory{$slot2} ||= {};
+
+                        #print "IADD3-2: $slot:$r $slot2:$r2 (!$reuseType && !$slotHist2->{$r2})\n" if $op eq 'IADD3';
+
+                        # Dont be concerned with non-reuse type instructions or
+                        # If this operand is in the reuse cache, we don't care what bank it's on.
+                        if (!$reuseType || !exists $slotHist2->{$r2})
+                        {
+                            # if the operand is also an auto-allocated register then link them
+                            # Once we choose the bank for one we want to update that choice for the other register.
+                            if (ref $regMap->{$r2})
+                            {
+                                push @{$pairedBanks{$r}{pairs}}, $r2;
+                                $pairedBanks{$r}{banks} ||= [];
+                            }
+                            # For a fixed register, calculate the bank, flag it, and update the count of banks to avoid.
+                            else
+                            {
+                                my $bank = substr($regMap->{$r2},1) & 3;
+                                #print "IADD3-3: $r2:$bank\n" if $op eq 'IADD3';
+
+                                $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++;
+                                $pairedBanks{$r}{pairs} ||= [];
+                            }
+                            # Update the total use count for this register.
+                            # This will be the number of times the register is pulled out of the bank.
+                            $pairedBanks{$r}{useCnt}++;
+                        }
+                    }
+                }
+                # update the reuse history so we know which bank conflicts we can ignore.
+                if ($reuseType)
+                {
+                    # flag these slots for addition or removal from reuseHistory
+                    if ($ctrl->{reuse}[($i & 3) - 1] & $reuseSlots{$slot})
+                        { $addReuse{$slot} = $r; }
+                    else
+                        { $delReuse{$slot} = $r; }
+                }
+            }
+            # update reuse history after we're done with the instruction (when the flag is actually in effect).
+            # we don't want to updated it in the middle since that can interfere with the checks,
+            $reuseHistory{$_}{$addReuse{$_}} = 1    foreach keys %addReuse;
+            delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse;
+
+            # liveTimes for destination operands and vector registers
+            foreach my $r0 (getVecRegisters($vectors, $capData))
+            {
+                # fixed register mappings can have aliases so use the actual register value for those.
+                my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0};
+
+                # If not writing treat just like a read
+                if (exists $noDest{$op})
+                {
+                    if (my $liveTime = $liveTime{$liveR})
+                    {
+                        $liveTime->[$#$liveTime][1] = $i;
+                        push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                    }
+                    else
+                    {
+                        warn "register used without initialization ($r0): $inst\n" unless $nowarn;
+                        push @{$liveTime{$liveR}}, [$i,$i];
+                    }
+                }
+                # If writing, push a new bracket on this register's stack.
+                elsif (my $liveTime = $liveTime{$liveR})
+                {
+                    if ($i > $liveTime->[$#$liveTime][1])
+                    {
+                        push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                    }
+                }
+                else
+                {
+                    # Initialize the liveTime stack for this register.
+                    push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                }
+            }
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+    #print Dumper(\%liveTime); exit(1);
+
+    # assign unassigned registers
+    # sort by most restricted, then most used, then name
+    foreach my $r (sort {
+                    $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} ||
+                    $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} ||
+                    $a cmp $b
+                  } keys %pairedBanks)
+    {
+        my $banks = $pairedBanks{$r}{banks};
+        my $avail = $regMap->{$r};
+
+        #printf "%10s: (%d,%d) %d,%d,%d,%d, %s\n", $r, $pairedBanks{$r}{bnkCnt}, $pairedBanks{$r}{useCnt}, @{$banks}[0,1,2,3], join ',', @$avail;
+
+        # Pick a bank with zero or the smallest number of conflicts
+        BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..3))
+        {
+            # pick an available register that matches the requested bank
+            foreach my $pos (0 .. $#$avail)
+            {
+                if ($bank == ($avail->[$pos] & 3))
+                {
+                    # assign it, while removing the assigned register from the pool
+                    $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+
+                    # update bank info for any unassigned pair
+                    $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}};
+                    last BANK;
+                }
+            }
+        }
+    }
+    # Now assign any remaining to first available
+    foreach my $r (sort keys %$regMap)
+    {
+        if (ref($regMap->{$r}) eq 'ARRAY')
+        {
+            $regMap->{$r} = 'R' . shift @{$regMap->{$r}};
+        }
+    }
+    #print map "$regMap->{$_}: $_\n", sort { substr($regMap->{$a},1) <=> substr($regMap->{$b},1) } keys %$regMap;
+
+    # apply the register mapping and assemble the instructions to op codes
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        # save the original and replace the register names with numbers
+        $instructs[$i]{orig} = $instructs[$i]{inst};
+        $instructs[$i]{inst} =~ s/(?<!\.)\b(\w+)\b(?!\[)/ exists($regMap->{$1}) ? $regMap->{$1} : $1 /ge;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            # update the register count
+            foreach my $r (qw(r0 r8 r20 r39))
+            {
+                next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ';
+
+                # get numeric portion of regname
+                my $val = substr $capData->{$r}, 1;
+
+                my @r0 = getVecRegisters($vectors, $capData);
+                my @r8 = getAddrVecRegisters($vectors, $capData);
+
+                # smart enough to count vector registers for memory instructions.
+                my $regInc = $r eq 'r0' ? scalar(@r0) || 1 : 1;
+                my $regInc = $r eq 'r8' ? scalar(@r8) || 1 : 1;
+
+                if ($val + $regInc > $regCnt)
+                {
+                    $regCnt = $val + $regInc;
+                    #print "$val $regCnt $regInc\n";
+                }
+            }
+            # update the barrier resource count
+            if ($op eq 'BAR')
+            {
+                if (exists $capData->{i8w4})
+                {
+                    $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt;
+                }
+                # if a barrier value is a register, assume the maximum
+                elsif (exists $capData->{r8})
+                {
+                    $barCnt = 16;
+                }
+            }
+            # Generate the op code.
+            my ($code, $reuse) = genCode($op, $gram, $capData);
+            $instructs[$i]{code} = $code;
+
+            # cache this for final pass when we want to calculate reuse stats.
+            if ($gram->{type}{reuse})
+                { $instructs[$i]{caps} = $capData; }
+            # use the parsed value of reuse for non-reuse type instructions
+            else
+                { $ctrl->{reuse}[($i & 3) - 1] = $reuse; }
+
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    # final pass to piece together control codes
+    my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed);
+    foreach my $i (0 .. $#instructs)
+    {
+        # op code
+        if ($i & 3)
+        {
+            push @codes, $instructs[$i]{code};
+
+            if ($instructs[$i]{caps})
+            {
+                # calculate stats on registers
+                registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 3) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn);
+            }
+            if ($instructs[$i]{inst} =~ m'EXIT')
+            {
+                push @exitOffsets, (scalar(@codes)-1)*8;
+            }
+            elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)')
+            {
+                push @ctaidOffsets, (scalar(@codes)-1)*8;
+                $ctaidzUsed = 1 if $1 eq 'Z';
+            }
+        }
+        # control code
+        else
+        {
+            my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)};
+            push @codes,
+                ($ctrl->[0] <<  0) | ($ctrl->[1] << 21) | ($ctrl->[2] << 42) | # ctrl codes
+                ($ruse->[0] << 17) | ($ruse->[1] << 38) | ($ruse->[2] << 59);  # reuse codes
+        }
+    }
+
+    # return the kernel data
+    return {
+        RegCnt       => $regCnt,
+        BarCnt       => $barCnt,
+        ExitOffsets  => \@exitOffsets,
+        CTAIDOffsets => \@ctaidOffsets,
+        CTAIDZUsed   => $ctaidzUsed,
+        ConflictCnt  => $reuseHistory{conflicts},
+        ReuseCnt     => $reuseHistory{reuse},
+        ReuseTot     => $reuseHistory{total},
+        ReusePct     => ($reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0),
+        KernelData   => \@codes,
+    };
+}
+
+# Useful for testing op code coverage of existing code, extracting new codes and flags.
+#
+# Reads a sass listing from $fh: each control line accepted by processSassCtrlLine is
+# followed by one instruction line per reuse value it yields. Every instruction is
+# re-encoded through the grammar and the generated op code and reuse flags are compared
+# with the values parsed from the file. A report grouped by op is then printed to the
+# default output handle, followed by bank conflict / reuse / coverage totals.
+#
+#   $fh             - input file handle
+#   $printConflicts - when true the instruction text is handed to registerHealth
+#                     (presumably so it can print each conflict; confirm there)
+#   $all            - when true passing instructions are listed as well as failing ones
+#
+# Returns the number of instructions that failed (mismatch, or no grammar rule matched).
+sub Test
+{
+    my ($fh, $printConflicts, $all) = @_;
+
+    my @instructs;
+    my %reuseHistory;
+    my ($pass, $fail) = (0,0);
+
+    while (my $line = <$fh>)
+    {
+        my (@ctrl, @reuse);
+
+        next unless processSassCtrlLine($line, \@ctrl, \@reuse);
+
+        foreach my $fileReuse (@reuse)
+        {
+            $line = <$fh>;
+
+            # the listing ended in the middle of a control group: nothing left to read
+            last unless defined $line;
+
+            my $inst = processSassLine($line) or next;
+
+            $inst->{reuse} = $fileReuse;
+            my $fileCode = $inst->{code};
+
+            if (exists $relOffset{$inst->{op}})
+            {
+                # these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump
+                $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e;
+            }
+
+            my $match = 0;
+            foreach my $gram (@{$grammar{$inst->{op}}})
+            {
+                my $capData = parseInstruct($inst->{inst}, $gram) or next;
+                my @caps;
+
+                # Run in test mode to list what capture groups were captured
+                my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps);
+
+                # Detect register bank conflicts but only for reuse type instructions.
+                # If a bank conflict is avoided by a reuse flag then ignore it.
+                registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? $inst->{inst} : '') if $gram->{type}{reuse};
+
+                $inst->{caps}      = join ', ', sort @caps;
+                $inst->{codeDiff}  = $fileCode  ^ $code;
+                $inst->{reuseDiff} = $fileReuse ^ $reuse;
+
+                # compare calculated and file values
+                if ($code == $fileCode && $reuse == $fileReuse)
+                {
+                    $inst->{grade} = 'PASS';
+                    push @instructs, $inst if $all;
+                    $pass++;
+                }
+                else
+                {
+                    $inst->{grade} = 'FAIL';
+                    push @instructs, $inst;
+                    $fail++;
+                }
+                $match = 1;
+                last;
+            }
+            # no grammar rule could parse the instruction: report the raw file values as the diff
+            unless ($match)
+            {
+                $inst->{grade}     = 'FAIL';
+                $inst->{codeDiff}  = $fileCode;
+                $inst->{reuseDiff} = $fileReuse;
+                push @instructs, $inst;
+                $fail++;
+            }
+        }
+    }
+
+    # widest instruction text per op, used to size the Instruction column of the report
+    my %maxLen;
+    foreach my $inst (@instructs)
+    {
+        my $len = length $inst->{ins};
+        $maxLen{$inst->{op}} = $len if $len > ($maxLen{$inst->{op}} || 0);
+    }
+
+    my ($lastOp, $template);
+    foreach my $inst (sort {
+        $a->{op}        cmp $b->{op}        ||
+        $a->{codeDiff}  <=> $b->{codeDiff}  ||
+        $a->{reuseDiff} <=> $b->{reuseDiff} ||
+        $a->{ins}       cmp $b->{ins}
+        } @instructs)
+    {
+        # print a fresh header whenever the op changes
+        if (!defined $lastOp || $lastOp ne $inst->{op})
+        {
+            $lastOp = $inst->{op};
+
+            # an empty width leaves the column unpadded, matching the behaviour when no length was recorded
+            my $width = $maxLen{$lastOp} || '';
+            $template = "%s 0x%016x %x 0x%016x %x %5s%-${width}s   %s\n";
+            printf "\n%s %-18s %s %-18s %s %-5s%-${width}s   %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures);
+        }
+        printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)};
+    }
+
+    # the counters only exist once registerHealth has recorded something, so default them to zero
+    my $conflicts = $reuseHistory{conflicts} || 0;
+    my $reused    = $reuseHistory{reuse}     || 0;
+    my $total     = $reuseHistory{total}     || 0;
+    my $reusePct  = $total ? 100 * $reused / $total : 0;
+
+    # a literal percent sign must be written as %% inside a printf format
+    printf "\nRegister Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\nOp Code Coverage Totals: Pass: %d Fail: %d\n",
+        $conflicts, $reusePct, $reused, $total, $pass, $fail;
+
+    return $fail;
+}
+
+# Convert cuobjdump sass to the working format
+#
+#   $in     - file handle of the sass listing to read
+#   $out    - file handle the converted text is printed to
+#   $params - array ref of "ord:offset:size:align" strings, one per parameter
+#
+# First a <CONSTANT_MAPPING> section is written that names the fixed block/grid
+# dimension constants and every parameter word. Then each instruction is written as
+# "ctrl pred ins", with jump targets replaced by generated TARGETn labels and known
+# c[0x0][...] addresses replaced by their mapped names. Reading stops at a BRA that
+# targets itself or the instruction 8 bytes before it.
+sub Extract
+{
+    my ($in, $out, $params) = @_;
+
+    my %constants =
+    (
+        blockDimX => 'c[0x0][0x8]',
+        blockDimY => 'c[0x0][0xc]',
+        blockDimZ => 'c[0x0][0x10]',
+        gridDimX  => 'c[0x0][0x14]',
+        gridDimY  => 'c[0x0][0x18]',
+        gridDimZ  => 'c[0x0][0x1c]',
+    );
+
+    # reverse lookup used later: constant address => symbolic name
+    my %paramMap;
+
+    print $out "<CONSTANT_MAPPING>\n";
+    for my $name (sort keys %constants)
+    {
+        my $addr = $constants{$name};
+        print $out "    $name : $addr\n";
+        $paramMap{$addr} = $name;
+    }
+    print $out "\n";
+
+    for my $spec (@$params)
+    {
+        # the fourth field (alignment) is not needed here
+        my ($ord, $offset, $size, undef) = split ':', $spec;
+
+        # a parameter of at most one word maps straight onto its offset as given
+        unless ($size > 4)
+        {
+            my $const = sprintf 'c[0x0][%s]', $offset;
+            my $param = sprintf 'param_%d', $ord;
+            $paramMap{$const} = $param;
+            print $out "    $param : $const\n";
+            next;
+        }
+
+        # larger parameters are split into consecutive 4 byte words: param_N[0], param_N[1], ...
+        my $addr = hex $offset;
+        for (my $word = 0; $size > 0; $word++, $size -= 4, $addr += 4)
+        {
+            my $param = sprintf 'param_%d[%d]', $ord, $word;
+            my $const = sprintf 'c[0x0][0x%x]', $addr;
+            $paramMap{$const} = $param;
+            print $out "    $param : $const\n";
+        }
+    }
+    print $out "</CONSTANT_MAPPING>\n\n";
+
+    my (%labels, @data);
+    my $labelnum = 1;
+
+    FILE: while (my $line = <$in>)
+    {
+        my (@ctrl, @ruse);
+        processSassCtrlLine($line, \@ctrl, \@ruse) or next FILE;
+
+        # one instruction line follows for every control value on the control line
+        CTRL: for my $ctrl (@ctrl)
+        {
+            $line = <$in>;
+
+            my $inst = processSassLine($line);
+            next CTRL unless $inst;
+
+            # Convert branch/jump/call addresses to labels
+            if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)')
+            {
+                my $target = hex $1;
+
+                # the final BRA loops on itself (or the slot before it): stop processing the file
+                if ($inst->{op} eq 'BRA')
+                {
+                    last FILE if $target == $inst->{num} || $target == $inst->{num} - 8;
+                }
+
+                # allocate a label name the first time this target address is seen
+                $labels{$target} ||= "TARGET" . $labelnum++;
+                my $label = $labels{$target};
+
+                # replace address with name
+                $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/;
+            }
+
+            # swap known constant addresses for their symbolic names
+            $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ my $addr = $1 . $2; $paramMap{$addr} || $addr /eg;
+
+            $inst->{ctrl} = printCtrl($ctrl);
+
+            push @data, $inst;
+        }
+    }
+
+    # second pass: the address to label mapping is complete, so labels can be emitted in place
+    for my $inst (@data)
+    {
+        my $num = $inst->{num};
+        print $out "$labels{$num}:\n" if exists $labels{$num};
+        printf $out "%s %5s%s\n", $inst->{ctrl}, $inst->{pred}, $inst->{ins};
+    }
+}
+
+# Patterns for the markup tags recognised in assembly source text.
+# All are compiled with /m (^ anchors at every line start) and /s (. also matches newlines).
+
+# <COMMENT> ... </COMMENT> block, including leading indentation and the trailing newline
+my $CommentRe  = qr'^[\t ]*<COMMENT>.*?^\s*</COMMENT>\n?'ms;
+# <INCLUDE file="..."/> tag; $1 is the file name
+my $IncludeRe  = qr'^[\t ]*<INCLUDE\s+file="([^"]+)"\s*/?>\n?'ms;
+# <CODE> ... </CODE> block; $1 is an optional numeric suffix that the closing tag must repeat, $2 is the body
+my $CodeRe     = qr'^[\t ]*<CODE(\d*)>(.*?)^\s*<\/CODE\1>\n?'ms;
+# <CONSTANT_MAPPING> ... </CONSTANT_MAPPING> block; $1 is the body
+my $ConstMapRe = qr'^[\t ]*<CONSTANT_MAPPING>(.*?)^\s*</CONSTANT_MAPPING>\n?'ms;
+# <REGISTER_MAPPING> ... </REGISTER_MAPPING> block; $1 is the body
+my $RegMapRe   = qr'^[\t ]*<REGISTER_MAPPING>(.*?)^\s*</REGISTER_MAPPING>\n?'ms;
+# <SCHEDULE_BLOCK> ... </SCHEDULE_BLOCK> block; $1 is the body
+my $ScheduleRe = qr'^[\t ]*<SCHEDULE_BLOCK>(.*?)^\s*</SCHEDULE_BLOCK>\n?'ms;
+# inline code written as [+ ... +] or [- ... -]; $1 is the sign, $2 is the code
+my $InlineRe   = qr'\[(\+|\-)(.+?)\1\]'ms;
+
+# Read and return the entire content of a file named by an INCLUDE tag.
+#
+#   $file    - path of the file to include, as written in the source
+#   $include - array ref holding the leading arguments for File::Spec->catpath
+#              (presumably volume and directory of the fallback location; confirm with caller)
+#
+# The file is first opened exactly as given. If that fails, its base name is looked up
+# at the $include location instead. Dies when neither can be opened.
+sub IncludeFile
+{
+    my ($file, $include) = @_;
+
+    # only the file name part is needed for the fallback lookup
+    my $name = (File::Spec->splitpath($file))[2];
+
+    # Always open read-only with the three argument form: the path comes from source text,
+    # and a two argument open would treat a leading '>' or a trailing '|' as a mode or a pipe.
+    my $fh;
+    unless (open $fh, '<', $file)
+    {
+        my $fallback = File::Spec->catpath(@$include, $name);
+        open $fh, '<', $fallback or die "Could not open file for INCLUDE: $file ($!)\n";
+    }
+
+    # slurp mode: read the whole file in one go
+    local $/;
+    my $content = <$fh>;
+    close $fh;
+    return $content;
+}
+
+# Run the preprocessing pipeline over assembly source text and return the result.
+#
+#   $file    - the source text (not a file name)
+#   $include - passed through to IncludeFile for resolving INCLUDE tags
+#   $debug   - passed through to Scheduler
+#   $regMap  - optional hash ref to fill with the register mapping; when supplied the
+#              REGISTER_MAPPING section is removed from the returned text
+#
+# Steps, in order: expand includes, strip comments, run CODE sections and inline code,
+# apply the constant map, read the register map, schedule every SCHEDULE_BLOCK.
+sub Preprocess
+{
+    my ($file, $include, $debug, $regMap) = @_;
+
+    # Remove the regmap only when the caller handed one in (we're going on to assemble)
+    my $stripRegMap = $regMap ? 1 : 0;
+    $regMap ||= {};
+
+    my $constMap = {};
+
+    # keep expanding INCLUDE tags until none are left, so included files may include others
+    while ($file =~ s|$IncludeRe| IncludeFile($1, $include) |eg) {}
+
+    # drop the COMMENT blocks
+    $file =~ s|$CommentRe||g;
+
+    # Evaluate the CODE sections and splice in their results (old way to run code, to be deprecated)
+    while ($file =~ s|$CodeRe|
+        my $result = eval "package MaxAs::MaxAs::CODE; $2";
+        die("CODE:\n$2\n\nError: $@\n") if $@;
+        $result |eg) {}
+
+    # Evaluate the inline code (new way): [+ +] splices in the result, [- -] leaves nothing behind
+    $file =~ s|$InlineRe|
+        my ($type, $code) = ($1, $2);
+        my $result = eval "package MaxAs::MaxAs::CODE; $code";
+        die("CODE:\n$code\n\nError: $@\n") if $@;
+        $type eq "+" ? $result : "" |eg;
+
+    # Collect the constMap; the section is replaced by whatever setConstMap returns
+    $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg;
+
+    # substitute mapped constant names on every line that is not a # or // comment
+    $file = join "\n", map {
+        my $text = $_;
+        $text =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg
+            unless $text =~ m'^\s*(?:#|//).*';
+        $text;
+    } split "\n", $file;
+
+    # The reg map has to be read before scheduling as the Scheduler needs it for vector instructions
+    $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $stripRegMap ? '' : $& /eg;
+
+    # Collect the body of every SCHEDULE_BLOCK section
+    my @schedBlocks = $file =~ /$ScheduleRe/g;
+
+    # Schedule each one in place; blocks are numbered from 1
+    my $blockNum = 0;
+    foreach my $block (@schedBlocks)
+    {
+        $blockNum++;
+
+        # XMAD macros should only appear in SCHEDULE_BLOCKs
+        $block = replaceXMADs($block);
+
+        $block = Scheduler($block, $blockNum, $regMap, $debug);
+    }
+
+    # Put the scheduled text back, one block per SCHEDULE_BLOCK section in file order
+    $file =~ s|$ScheduleRe| shift @schedBlocks |eg;
+
+    return $file;
+}
+
+# Operand capture names split into source and destination categories for the scheduler.
+# Each name maps to 1 so membership can be tested with exists.
+my (%srcReg, %destReg);
+@srcReg{qw(r8 r20 r39 p12 p29 p39 X)} = (1) x 7;
+@destReg{qw(r0 p0 p3 p45 p48 CC)}     = (1) x 6;
+
+# every operand name the scheduler tracks, source or destination
+my %regops = map { $_ => 1 } (keys(%srcReg), keys(%destReg));
+
+# instruction type attributes copied from the matching grammar rule onto each instruction
+my @itypes = ('class', 'lat', 'rlat', 'tput', 'dual');
+
+sub Scheduler
+{
+    my ($block, $blockNum, $regMap, $debug) = @_;
+
+    my $vectors = $regMap->{__vectors};
+    my $lineNum = 0;
+
+    my (@instructs, @comments, $ordered, $first);
+    foreach my $line (split "\n", $block)
+    {
+        # keep track of line nums in the physical file
+        $lineNum++;
+
+        unless (preProcessLine($line))
+        {
+            push @comments, $line if $line =~ m'\S';
+            next;
+        }
+
+        # match an instruction
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # if the first instruction in the block is waiting on a dep, it should go first.
+            $inst->{first}   = !$first++ && ($inst->{ctrl} & 0x1f800) ? 0 : 1;
+
+            # if the instruction has a stall of zero set, it's meant to be last (to mesh with next block)
+            #$inst->{first}   = $inst->{ctrl} & 0x0000f ? 1 : 2;
+            $inst->{exeTime} = 0;
+            $inst->{order}   = $ordered++ if $ordered;
+            push @instructs, $inst;
+        }
+        # match a label
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n";
+        }
+        # open an ORDERED block
+        elsif ($line =~ m'^<ORDERED>')
+        {
+            die "you cannot use nested <ORDERED> tags" if $ordered;
+            $ordered = 1;
+        }
+        # close an ORDERED block
+        elsif ($line =~ m'^</ORDERED>')
+        {
+            die "missing opening <ORDERED> for closing </ORDERED> tag" if !$ordered;
+            $ordered = 0;
+        }
+        else
+        {
+            die "badly formed line at block: $blockNum line: $lineNum: $line\n";
+        }
+    }
+
+    my (%writes, %reads, @ready, @schedule, $orderedParent);
+    # assemble the instructions to op codes
+    foreach my $instruct (@instructs)
+    {
+        my $match = 0;
+        foreach my $gram (@{$grammar{$instruct->{op}}})
+        {
+            my $capData = parseInstruct($instruct->{inst}, $gram) or next;
+            my (@dest, @src);
+
+            # copy over instruction types for easier access
+            @{$instruct}{@itypes} = @{$gram->{type}}{@itypes};
+
+            # A predicate prefix is treated as a source reg
+            push @src, $instruct->{predReg} if $instruct->{pred};
+
+            # Handle P2R and R2P specially
+            if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7})
+            {
+                my $list = $instruct->{op} eq 'R2P' ? \@dest : \@src;
+                my $mask = hex($capData->{i20w7});
+                foreach my $p (0..6)
+                {
+                    if ($mask & (1 << $p))
+                    {
+                        push @$list, "P$p";
+                    }
+                    # make this instruction dependent on any predicates it's not setting
+                    # this is to prevent a race condition for any predicate sets that are pending
+                    elsif ($instruct->{op} eq 'R2P')
+                    {
+                        push @src, "P$p";
+                    }
+                }
+                # These instructions can't be dual issued
+                $instruct->{nodual} = 1;
+            }
+
+            # Populate our register source and destination lists, skipping any zero or true values
+            foreach my $operand (grep { exists $regops{$_} } sort keys %$capData)
+            {
+                # figure out which list to populate
+                my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src;
+
+                # Filter out RZ and PT
+                my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT';
+
+                if ($capData->{$operand} ne $badVal)
+                {
+                    # add the value to list with the correct prefix
+                    push @$list,
+                        $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) :
+                        $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) :
+                        $operand eq 'CC' ? 'CC' :
+                        $operand eq 'X'  ? 'CC' :
+                        getRegNum($regMap, $capData->{$operand});
+                }
+            }
+            $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39});
+
+            # Find Read-After-Write dependencies
+            foreach my $src (grep { exists $writes{$_} } @src)
+            {
+                # Memory operations get delayed access to registers but not to the predicate
+                my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat};
+
+                # the parent should be the most recently added dest op to the stack
+                foreach my $parent (@{$writes{$src}})
+                {
+                    # add this instruction as a child of the parent
+                    # set the edge to the total latency of reg source availability
+                    #print "R $parent->{inst}\n\t\t$instruct->{inst}\n";
+                    my $latency = $src =~ m'^P\d' ? 13 : $parent->{lat};
+                    push @{$parent->{children}}, [$instruct, $latency - $regLatency];
+                    $instruct->{parents}++;
+
+                    # if the destination was conditionally executed, we also need to keep going back till it wasn't
+                    last unless $parent->{pred};
+                }
+            }
+
+            # Find Write-After-Read dependencies
+            foreach my $dest (grep { exists $reads{$_} } @dest)
+            {
+                # Flag this instruction as dependent to any previous read
+                foreach my $reader (@{$reads{$dest}})
+                {
+                    # no need to stall for these types of dependencies
+                    #print "W $reader->{inst} \t\t\t $instruct->{inst}\n";
+                    push @{$reader->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                # Once dependence is marked we can clear out the read list (unless this write was conditional).
+                # The assumption here is that you would never want to write out a register without
+                # subsequently reading it in some way prior to writing it again.
+                delete $reads{$dest} unless $instruct->{pred};
+            }
+
+            # Enforce instruction ordering where requested
+            if ($instruct->{order})
+            {
+                if ($orderedParent)
+                {
+                    push @{$orderedParent->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                $orderedParent = $instruct;
+            }
+            elsif ($orderedParent)
+                {  $orderedParent = 0; }
+
+            # For a dest reg, push it onto the write stack
+            unshift @{$writes{$_}}, $instruct foreach @dest;
+
+            # For a src reg, push it into the read list
+            push @{$reads{$_}}, $instruct foreach @src;
+
+            # if this instruction has no dependencies it's ready to go
+            push @ready, $instruct if !exists $instruct->{parents};
+
+            $match = 1;
+            last;
+        }
+        die "Unable to recognize instruction at block: $blockNum line: $lineNum: $instruct->{inst}\n" unless $match;
+    }
+    %writes = ();
+    %reads  = ();
+
+    if (@ready)
+    {
+        # update dependent counts for sorting hueristic
+        my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" };
+
+        countUniqueDescendants($readyParent, {});
+        updateDepCounts($readyParent, {});
+
+        # sort the initial ready list
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    # Process the ready list, adding new instructions to the list as we go.
+    my $clock = 0;
+    while (my $instruct = shift @ready)
+    {
+        my $stall = $instruct->{stall};
+
+        # apply the stall to the previous instruction
+        if (@schedule && $stall < 16)
+        {
+            my $prev = $schedule[$#schedule];
+
+            # if stall is greater than 4 then also yield
+            # the yield flag is required to get stall counts 12-15 working correctly.
+            $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0;
+            $prev->{ctrl} |= $stall;
+            $clock += $stall;
+        }
+        # For stalls bigger than 15 we assume the user is managing it with a barrier
+        else
+        {
+            $instruct->{ctrl} &= 0x1fff0;
+            $instruct->{ctrl} |= 1;
+            $clock += 1;
+        }
+        print "$clock: $instruct->{inst}\n" if $debug;
+
+        # add a new instruction to the schedule
+        push @schedule, $instruct;
+
+        # update each child with a new earliest execution time
+        if (my $children = $instruct->{children})
+        {
+            foreach (@$children)
+            {
+                my ($child, $latency) = @$_;
+
+                # update the earliest clock value this child can safely execute
+                my $earliest = $clock + $latency;
+                $child->{exeTime} = $earliest if $child->{exeTime} < $earliest;
+
+                print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug;
+
+                # decrement parent count and add to ready queue if none remaining.
+                push @ready, $child if --$child->{parents} < 1;
+            }
+            delete $instruct->{children};
+        }
+
+        # update stall and mix values in the ready queue on each iteration
+        foreach my $ready (@ready)
+        {
+            # calculate how many instructions this would cause the just added instruction to stall.
+            $stall = $ready->{exeTime} - $clock;
+            $stall = 1 if $stall < 1;
+
+            # if using the same compute resource as the prior instruction then limit the throughput
+            if ($ready->{class} eq $instruct->{class})
+            {
+                $stall = $ready->{tput} if $stall < $ready->{tput};
+            }
+            # dual issue with a simple instruction (tput <= 2)
+            # can't dual issue two instructions that both load a constant
+            elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 && !$instruct->{nodual} &&
+                   $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const}))
+            {
+                $stall = 0;
+            }
+            $ready->{stall} = $stall;
+
+            # add an instruction class mixing huristic that catches anything not handled by the stall
+            $ready->{mix} = $ready->{class} ne $instruct->{class} || 0;
+        }
+
+        # sort the ready list by stall time, mixing huristic, dependencies and line number
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $a->{stall}   <=> $b->{stall}  ||
+            $b->{mix}     <=> $a->{mix}    ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "\tf,ext,stl,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(f exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    my $out;
+    #$out .= "$_\n" foreach @comments;
+    $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule;
+    return $out;
+}
+
+# Fill the $constMap hash ref from "name : value" lines; '#' and '//' start comments.
+sub setConstMap
+{
+    my ($constMap, $constMapText) = @_;
+
+    for my $entry (split "\n", $constMapText)
+    {
+        # drop leading blanks, then any comment, then trailing blanks
+        $entry =~ s|^\s+||;
+        $entry =~ s{(?:#|//).*}{};
+        $entry =~ s|\s+$||;
+
+        # nothing but blanks left on this line: move on
+        next if $entry !~ m'\S';
+
+        # the text before the ':' is the key, the text after it is the stored value
+        my ($key, $mapped) = split '\s*:\s*', $entry;
+        $constMap->{$key} = $mapped;
+    }
+    return;
+}
+
+# Parse REGISTER_MAPPING text into $regMap.  Each line is "<nums> <sep> <names>":
+#   ':' fixed (name N gets 'R' . num N), '~' auto (each name gets the candidate
+#   number list), '=' shared (every name gets the single listed register).
+sub setRegisterMap
+{
+    my ($regMap, $regmapText) = @_;
+    my $vectors = $regMap->{__vectors} ||= {};
+    my $regBank = $regMap->{__regbank} ||= {};
+    my %aliases;
+
+    foreach my $line (split "\n", $regmapText)
+    {
+        # strip leading space, comments and trailing space
+        $line =~ s|^\s+||;
+        $line =~ s{(?:#|//).*}{};
+        $line =~ s|\s+$||;
+        # skip blank lines
+        next unless $line =~ m'\S';
+
+        my $auto  = $line =~ /~/;
+        my $share = $line =~ /=/;
+
+        my ($regNums, $regNames) = split '\s*[:~=]\s*', $line;
+
+        my (@numList, @nameList);
+        foreach my $num (split '\s*,\s*', $regNums)
+        {
+            my ($start, $stop) = split '\s*\-\s*', $num;
+            die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop;
+            push @numList, ($start .. $stop||$start);
+        }
+        foreach my $fullName (split '\s*,\s*', $regNames)
+        {
+            if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$')
+            {
+                my ($name1, $name2, $bank) = ($1, $3, $4);
+                foreach (split '\s*\|\s*', $2)
+                {
+                    my ($start, $stop) = split '\s*\-\s*';
+                    foreach my $r (map "$name1$_$name2", $start .. $stop||$start)
+                    {
+                        # define an alias for use in vector instructions that omits the number portion
+                        $aliases{$r} = "$name1$name2" unless exists $aliases{$r};
+                        push @nameList, $r;
+                        $regBank->{$r} = $bank if $auto && defined $bank;
+                        warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank;
+                    }
+                }
+            }
+            elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$')
+            {
+                push @nameList, $1;
+                $regBank->{$1} = $2 if $auto && defined $2;
+                warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2;
+            }
+            else
+            {
+                die "Bad register name: '$fullName' at: $line\n";
+            }
+        }
+        die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList;
+        die "Missmatched register mapping at: $line\n" if $share && @numList > 1;
+
+        # detect if this list is monotonically ascending with no gaps.
+        # Every adjacent pair is compared, including the final one.
+        # A single-entry list cannot hold a vector, so it is not flagged as ascending.
+        my $ascending = @numList > 1;
+        foreach my $i (0 .. $#numList-1)
+        {
+            $ascending &&= $numList[$i] + 1 == $numList[$i+1];
+        }
+
+        foreach my $n (0..$#nameList)
+        {
+            die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]};
+
+            if ($auto)
+            {
+                # assign possible values to be assigned on assembly
+                $regMap->{$nameList[$n]} = \@numList;
+            }
+            elsif ($share)
+            {
+                # each name shares the same single register
+                $regMap->{$nameList[$n]} = 'R' . $numList[0];
+            }
+            else
+            {
+                $regMap->{$nameList[$n]} = 'R' . $numList[$n];
+                # flag any even register as a potential vector
+                if ($ascending && ($numList[$n] & 1) == 0)
+                {
+                    # constrain potential range to vector alignment
+                    my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3);
+                    if ($end <= $#nameList)
+                    {
+                        $vectors->{$nameList[$n]} = [ @nameList[$n .. $end] ];
+                        #setup an alias for the base name without the number
+                        if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}})
+                        {
+                            $regMap->{$aliases{$nameList[$n]}}  = $regMap->{$nameList[$n]};
+                            $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]};
+                            delete $aliases{$nameList[$n]};
+                        }
+                    }
+                }
+            }
+        }
+    }
+    #print Dumper($regMap); exit(1);
+}
+
+# Trim leading whitespace from the caller's line (in place) and report whether
+# the line contains anything other than blanks and a comment.
+sub preProcessLine
+{
+    # @_ aliases the caller's variable, so this edit is visible to the caller
+    $_[0] =~ s|^\s+||;
+
+    # the comment stays in the caller's string; only this private copy loses it
+    my ($probe) = @_;
+    $probe =~ s{(?:#|//).*}{};
+
+    # true only when something non-blank remains
+    return $probe =~ m'\S';
+}
+
+# Walk the graph below $node.  For every child edge whose flag is true, the line
+# numbers returned by the child are merged into the $node->{deps} hash.
+# Returns $node's own lineNum followed by the keys gathered in $node->{deps}.
+sub countUniqueDescendants
+{
+    my ($node, $edges) = @_;
+
+    # a node without children only reports itself
+    my $kids = $node->{children}
+        or return $node->{lineNum};
+
+    foreach my $edge (@$kids)
+    {
+        my ($kid, $flag) = @$edge;
+        # skip edges flagged false (WaR deps) and edges that were already walked
+        next unless $flag;
+        next if $edges->{"$node->{lineNum}^$kid->{lineNum}"}++;
+
+        # unique by lineNum: repeated numbers land on the same hash key
+        $node->{deps}{$_}++ for countUniqueDescendants($kid, $edges);
+    }
+    return ($node->{lineNum}, keys %{$node->{deps}});
+}
+# Recursively replace each node's {deps} hash with the number of keys it holds
+# (0 when there is none), for easier sorting.
+sub updateDepCounts
+{
+    my ($node, $edges) = @_;
+
+    # follow every child edge once; $edges remembers the ones already taken
+    foreach my $edge (@{ $node->{children} || [] })
+    {
+        my $kid = $edge->[0];
+        next if $edges->{"$node->{lineNum}^$kid->{lineNum}"}++;
+        updateDepCounts($kid, $edges);
+    }
+    # a hash ref collapses to its size; anything else is coerced to a number
+    my $deps = $node->{deps};
+    $node->{deps} = ref $deps ? scalar(keys %$deps) : $deps + 0;
+}
+
+# Detect register bank conflicts and calculate reuse stats for one instruction.
+# Updates the total/reuse/conflicts counters and the per-slot history kept in
+# $reuseHistory and returns the number of entries in the conflict list.
+sub registerHealth
+{
+    my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_;
+    my (@banks, @conflicts);
+
+    foreach my $slot (qw(r8 r20 r39))
+    {
+        my $r = $capData->{$slot} or next;
+        next if $r eq 'RZ';
+
+        my $slotHist = $reuseHistory->{$slot} ||= {};
+        $reuseHistory->{total}++;
+
+        # if this register is in active reuse then ignore for bank conflict checking.
+        if (exists $slotHist->{$r})
+        {
+            $reuseHistory->{reuse}++;
+        }
+        else
+        {
+            # extract number from reg and take the modulo-4 value.  This is the bank id.
+            my $bank = substr($r,1) & 3;
+
+            # check for conflict
+            if ($banks[$bank] && $banks[$bank] ne $r)
+            {
+                push @conflicts, $banks[$bank] if !@conflicts;
+                push @conflicts, $r;
+                $reuseHistory->{conflicts}++;
+            }
+            $banks[$bank] = $r;
+        }
+
+        # remember the register only while its reuse flag is set for this slot
+        if ($reuseFlags & $reuseSlots{$slot})
+            { $slotHist->{$r} = 1; }
+        else
+            { delete $slotHist->{$r};  }
+    }
+    if ($inst && @conflicts && !$nowarn)
+    {
+        # pass $inst as data: a '%' inside it must not be read as a format directive
+        printf "CONFLICT at 0x%04x (%s): %s\n", $instAddr, join(',', @conflicts), $inst;
+    }
+    return scalar @conflicts;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+MaxAs::MaxAs - Assembler for NVIDIA Maxwell architecture
+
+=head1 SYNOPSIS
+
+    maxas.pl [opts]
+
+=head1 DESCRIPTION
+
+See the documentation at: https://github.com/NervanaSystems/maxas
+
+=head1 SEE ALSO
+
+See the documentation at: https://github.com/NervanaSystems/maxas
+
+
+=head1 AUTHOR
+
+Scott Gray, E<lt>sgray@nervanasys.comE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Scott Gray
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+=cut
diff --git a/Assembler/MaxAs/blib/lib/MaxAs/MaxAsGrammar.pm b/Assembler/MaxAs/blib/lib/MaxAs/MaxAsGrammar.pm
new file mode 100644
index 0000000..fc61543
--- /dev/null
+++ b/Assembler/MaxAs/blib/lib/MaxAs/MaxAsGrammar.pm
@@ -0,0 +1,1437 @@
+package MaxAs::MaxAsGrammar;
+
+use strict;
+use Carp;
+use Exporter;
+use Data::Dumper;
+our @ISA = qw(Exporter);
+
+our @EXPORT = qw(
+    %grammar %flags
+    parseInstruct genCode genReuseCode
+    processAsmLine processSassLine processSassCtrlLine
+    replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters
+);
+
+require 5.10.0;
+
+# Helper functions for operands
+# Encode an integer immediate: decimal, hex (0x/0X) or "<N>x<perl expr>".  A leading '-'
+# negates and masks the value; otherwise it must fit $mask or we die.  Result is shifted by $pos.
+sub getI
+{
+    my ($orig, $pos, $mask) = @_;
+    my $val = $orig;
+    my $neg = $val =~ s|^\-||;
+    # parse out our custom index immediates for addresses
+    if ($val  =~ m'^(\d+)[xX]<([^>]+)>')
+    {
+        # allow any perl expression (global scalars included); result is multiplied by the leading decimal.
+        # NOTE(review): string eval executes text taken from the operand; only feed trusted sources.
+        my ($mul, $exp) = ($1, $2);
+        # strip leading zeros (don't interpret numbers as octal)
+        $exp =~ s/(?<!\d)0+(?=[1-9])//g;
+        my @globals = $exp =~ m'\$\w+'g;
+        my $our = @globals ? ' our (' . join(',',@globals) . ');' : '';
+        my $res = eval "package MaxAs::MaxAs::CODE;$our $exp";
+        die "Bad immediate expression ($orig): $@" if $@; # used to become a silent 0
+        $val = $mul * $res;
+    }
+    # hexadecimal value; both prefix cases are accepted
+    elsif ($val  =~ m'^0[xX][0-9a-zA-Z]+')
+    {
+        $val = hex($val);
+    }
+    # otherwise val is a simple decimal value that doesn't need to be modified
+    if ( $neg )
+    {
+        # if the mask removes the sign bit the "neg" flag adds it back on the code somewhere else
+        $val = -$val;
+        $val &= $mask;
+    }
+    if (($val & $mask) != $val)
+    {
+        die sprintf "Immediate value out of range(0x%x): 0x%x (%s)\n", $mask, $val, $orig;
+    }
+    return $val << $pos;
+}
+# Encode a float operand ($type is the pack code 'f' or 'd'; $trunc drops low bits) at bit $pos.
+sub getF
+{
+    my ($val, $pos, $type, $trunc) = @_;
+    # hexadecimal value is used as the raw bit pattern; 0x and 0X are both accepted
+    if ($val  =~ m'^0[xX][0-9a-zA-Z]+')
+    {
+        $val = hex($val);
+    }
+    # support infinity
+    elsif ($val =~ m'INF'i)
+    {
+        $val = $trunc ? ($type eq 'f' ? 0x7f800 : 0x7ff00) : 0x7f800000;
+    }
+    else
+    {
+        $val = unpack(($type eq 'f' ? 'L' : 'Q'), pack $type, $val);
+        # strip off sign bit if truncating.  It will be added elsewhere in the code by the flag capture.
+        $val = ($val >> $trunc) & 0x7ffff if $trunc;
+    }
+    return $val << $pos;
+}
+# Encode register operand "R<n>" (n below 255) or "RZ" (0xff), shifted left by $pos.
+sub getR
+{
+    my ($val, $pos) = @_;
+
+    # one capture: either the decimal register number or the letter Z
+    my ($id) = $val =~ m'^R(\d+|Z)$';
+    my $ok   = defined $id && ($id eq 'Z' || $id < 255);
+    die "Bad register name found: $val\n" unless $ok;
+
+    my $num = $id eq 'Z' ? 0xff : $id;
+    return $num << $pos;
+}
+# Encode predicate operand "P0".."P6" or "PT" (7), shifted left by $pos.
+sub getP
+{
+    my ($val, $pos) = @_;
+
+    # one capture: either a single digit or the letter T
+    my ($id) = $val =~ m'^P(\d|T)$';
+    my $ok   = defined $id && ($id eq 'T' || $id < 7);
+    die "Bad predicate name found: $val\n" unless $ok;
+
+    my $num = $id eq 'T' ? 7 : $id;
+    return $num << $pos;
+}
+sub getC { my ($addr) = @_; return (((hex($addr) >> 2) & 0x7fff) << 20); } # hex text / 4, kept to 15 bits, placed at bit 20
+
+# Map each operand name to a sub that encodes the operand text at its position in the op code.
+my %operands =
+(
+    p0      => sub { getP($_[0], 0)  },
+    p3      => sub { getP($_[0], 3)  },
+    p12     => sub { getP($_[0], 12) },
+    p29     => sub { getP($_[0], 29) },
+    p39     => sub { getP($_[0], 39) },
+    p45     => sub { getP($_[0], 45) },
+    p48     => sub { getP($_[0], 48) },
+    p58     => sub { getP($_[0], 58) },
+    r0      => sub { getR($_[0], 0)  },
+    r8      => sub { getR($_[0], 8)  },
+    r20     => sub { getR($_[0], 20) },
+    r28     => sub { getR($_[0], 28) },
+    r39s20  => sub { getR($_[0], 39) },
+    r39     => sub { getR($_[0], 39) },
+    r39a    => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to wipe it out, register must be in sequence with r20
+    c20     => sub { getC($_[0])     },
+    c39     => sub { getC($_[0])     },
+    c34     => sub { hex($_[0]) << 34 },
+    c36     => sub { hex($_[0]) << 36 },
+    f20w32  => sub { getF($_[0], 20, 'f')        },
+    f20     => sub { getF($_[0], 20, 'f', 12)    },
+    d20     => sub { getF($_[0], 20, 'd', 44)    },
+    i8w4    => sub { getI($_[0], 8,  0xf)        },
+    i20     => sub { getI($_[0], 20, 0x7ffff)    },
+    i20w6   => sub { getI($_[0], 20, 0x3f)       },
+    i20w7   => sub { getI($_[0], 20, 0x7f)       },
+    i20w8   => sub { getI($_[0], 20, 0xff)       },
+    i20w12  => sub { getI($_[0], 20, 0xfff)      },
+    i20w24  => sub { getI($_[0], 20, 0xffffff)   },
+    i20w32  => sub { getI($_[0], 20, 0xffffffff) },
+    i31w4   => sub { getI($_[0], 31, 0xf)        },
+    i34w13  => sub { getI($_[0], 34, 0x1fff)     },
+    i36w20  => sub { getI($_[0], 36, 0xfffff)    },
+    i39w8   => sub { getI($_[0], 39, 0xff)       },
+    i28w8   => sub { getI($_[0], 28, 0xff)       },
+    i28w20  => sub { getI($_[0], 28, 0xfffff)    },
+    i48w8   => sub { getI($_[0], 48, 0xff)       },
+    i51w5   => sub { getI($_[0], 51, 0x1f)       },
+    i53w5   => sub { getI($_[0], 53, 0x1f)       },
+);
+
+# Rules for operands and their closely tied flags.  Capture names such as r0 or i20w6 match keys of %operands.
+my $hex     = qr"0[xX][0-9a-fA-F]+";
+my $iAddr   = qr"\d+[xX]<[^>]+>";
+my $immed   = qr"$hex|$iAddr|\d+"o;
+my $reg     = qr"[a-zA-Z_]\w*"; # must start with letter or underscore
+my $p       = qr"P[0-6T]";
+my $noPred  = qr"(?<noPred>)";
+my $pred    = qr"\@(?<predNot>\!)?P(?<predNum>[0-6]) ";
+my $p0      = qr"(?<p0>$p)"o;
+my $p3      = qr"(?<p3>$p)"o;
+my $p12     = qr"(?<p12not>\!)?(?<p12>$p)"o;
+my $p29     = qr"(?<p29not>\!)?(?<p29>$p)"o;
+my $p39     = qr"(?<p39not>\!)?(?<p39>$p)"o;
+my $p45     = qr"(?<p45>$p)"o;
+my $p48     = qr"(?<p48>$p)"o;
+my $p58     = qr"(?<p58>$p)"o;
+my $r0      = qr"(?<r0>$reg)";
+my $r0cc    = qr"(?<r0>$reg)(?<CC>\.CC)?";
+my $r8      = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse1>\.reuse)?";
+my $r20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse2>\.reuse)?";
+my $r28     = qr"(?<r28>$reg)";
+my $r39s20  = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r39s20>(?<r20>$reg))\|?(?:\.(?<r39part>H0|H1))?(?<reuse2>\.reuse)?";
+my $r39     = qr"(?<r39neg>\-)?(?<r39>$reg)(?:\.(?<r39part>H0|H1))?(?<reuse3>\.reuse)?";
+my $r39a    = qr"(?<r39a>(?<r39>$reg))(?<reuse3>\.reuse)?";
+my $c20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3))?"o;
+my $c20x    = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20partx>H0|H1|B0|B1|B2|B3))?"o; # as $c20, but the part suffix is captured as r20partx
+my $c20s39  = qr"(?<r39neg>\-)?c\[(?<c34>$hex)\]\s*\[(?<c39>$hex)\]"o;
+my $f20w32  = qr"(?<f20w32>(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))";
+my $f20     = qr"(?<f20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;
+my $d20     = qr"(?<d20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;
+my $i8w4    = qr"(?<i8w4>$immed)"o;
+my $i20     = qr"(?<i20>(?<neg>\-)?$immed)(?<r20neg>\.NEG)?"o;
+my $i20w6   = qr"(?<i20w6>$immed)"o;
+my $i20w7   = qr"(?<i20w7>$immed)"o;
+my $i20w8   = qr"(?<i20w8>$immed)"o;
+my $i20w12  = qr"(?<i20w12>$immed)"o;
+my $i20w24  = qr"(?<i20w24>\-?$immed)"o;
+my $i20w32  = qr"(?<i20w32>\-?$immed)"o;
+my $i39w8   = qr"(?<i39w8>\-?$immed)"o;
+my $i28w8   = qr"(?<i28w8>$immed)"o;
+my $i28w20  = qr"(?<i28w20>\-?$immed)"o;
+my $i31w4   = qr"(?<i31w4>$immed)"o;
+my $i34w13  = qr"(?<i34w13>$immed)"o;
+my $i36w20  = qr"(?<i36w20>$immed)"o;
+my $i48w8   = qr"(?<i48w8>$immed)"o;
+my $i51w5   = qr"(?<i51w5>$immed)"o;
+my $i53w5   = qr"(?<i53w5>$immed)"o;
+my $ir20    = qr"$i20|$r20"o;
+my $cr20    = qr"$c20|$r20"o;
+my $icr20   = qr"$i20|$c20|$r20"o;
+my $fcr20   = qr"$f20|$c20|$r20"o;
+my $cr39    = qr"$c20s39|$r39"o;
+my $dr20    = qr"$d20|$r20"o;
+
+# Instruction specific rules for capturing various flags (each flag lands in a named capture)
+my $u32   = qr"(?<U32>\.U32)?";
+my $ftz   = qr"(?<FTZ>\.FTZ)?";
+my $sat   = qr"(?<SAT>\.SAT)?";
+my $rnd   = qr"(?:\.(?<rnd>RN|RM|RP|RZ))?";
+my $round = qr"(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?";
+my $fcmp  = qr"(?<cmp>\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)"; # trailing empty alternative: the suffix may be absent
+my $icmp  = qr"\.(?<cmp>LT|EQ|LE|GT|NE|GE)";
+my $bool  = qr"\.(?<bool>AND|OR|XOR|PASS_B)";
+my $bool2 = qr"\.(?<bool2>AND|OR|XOR)";
+my $func  = qr"\.(?<func>COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)";
+my $rro   = qr"\.(?<func>SINCOS|EX2)";
+my $add3  = qr"(?:\.(?<type>X|RS|LS))?";
+my $lopz  = qr"(?:\.(?<z>NZ|Z) $p48,|(?<noz>))"o;
+my $X     = qr"(?<X>\.X)?";
+my $tld   = qr"(?<NODEP>NODEP\.)?(?:(?<reuse1>T)|(?<reuse2>P))";
+my $chnls = qr"(?<chnls>R|RGBA)";
+my $sr    = qr"SR_(?<sr>\S+)";
+my $shf   = qr"(?<W>\.W)?(?:\.(?<type>U64|S64))?(?<HI>\.HI)?";
+my $xmad  = qr"(?:\.(?<type1>U16|S16))?(?:\.(?<type2>U16|S16))?(?:\.(?<mode>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?";
+my $xmadc = qr"(?:\.(?<type1>U16|S16))?(?:\.(?<type2>U16|S16))?(?:\.(?<modec>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?"; # as $xmad, but the mode is captured as modec
+my $vmad8 = qr"\.(?<sign1>[SU])(?<size1>8|16)\.(?<sign2>[SU])(?<size2>8|16)(?<PO>\.PO)?(?<SHR_7>\.SHR_7)?(?<SHR_15>\.SHR_15)?(?<SAT>\.SAT)?";
+my $vmad16= qr"\.(?<sign1>[SU])(?<size1>16)\.(?<sign2>[SU])(?<size2>16)";
+my $hilo  = qr"(?:\.(?<mode>XHI|XLO))?";
+my $vaddType = qr"(?:\.(?<UD>UD))?(?:\.(?<SD>SD))?(?:\.(?<sign1>[SU])(?<size1>8|16|32))?(?:\.(?<sign2>[SU])(?<size2>8|16|32))?";
+my $vaddMode = qr"(?:\.(?<mode>MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?";
+my $vmnmx = qr"(?:\.(?<MX>MX))?";
+my $x2x   = qr"\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)";
+my $prmt  = qr"(?:\.(?<mode>F4E|B4E|RC8|ECL|ECR|RC16))?";
+my $shfl  = qr"\.(?<mode>IDX|UP|DOWN|BFLY)";
+my $bar   = qr"\.(?<mode>SYNC|ARV|RED)(?:\.(?<red>POPC|AND|OR))? (?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?(<r20>)|(?<nor20>))(?(<red>), $p39|(?<nop39>))"o;
+my $b2r   = qr"\.RESULT $r0(?:, $p45|(?<nop45>))"o;
+my $dbar  = qr"(?<SB>SB0|SB1|SB2|SB3|SB4|SB5)";
+my $dbar2 = qr" {(?<db5>5)?,?(?<db4>4)?,?(?<db3>3)?,?(?<db2>2)?,?(?<db1>1)?,?(?<db0>0)?}";
+my $mbar  = qr"\.(?<mode>CTA|GL|SYS)";
+my $addr  = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i20w24)?\]"o;
+my $addr2 = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i28w20)?\]"o;
+my $ldc   = qr"c\[(?<c36>$hex)\]\s*$addr"o;
+my $atom  = qr"(?<E>\.E)?(?:\.(?<mode>ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?<type>|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)";
+my $vote  = qr"\.(?<mode>ALL|ANY|EQ)"o;
+my $memType  = qr"(?<type>\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)"; # the empty alternative lets the type suffix be omitted
+my $memCache = qr"(?<E>\.E)?(?<U>\.U)?(?:\.(?<cache>CG|CI|CS|CV|IL|WT))?";
+
+
+
+# class: hardware resource that shares characteristics with types
+# lat  : pipeline depth where relevant, placeholder for memory ops
+# blat : barrier latency, typical fetch time for memory operations. Highly variable.
+# rlat : operand read latency for memory ops
+# rhold: clock cycles that a memory op typically holds onto a register before it's free to be written by another op.
+# tput : throughput, clock cycles an op takes when two ops of the same class are issued in succession.
+# dual : whether this instruction type can be dual issued
+# reuse: whether this instruction type accepts register reuse flags.
+# Each hash below is one instruction type; entries of %grammar point at them through their type field.
+# Some of these values are guesses and need to be updated from micro benchmarks.
+# We may need to split these classes up further.
+my $s2rT  = {class => 's2r',   lat => 2,   blat => 25,  rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+my $smemT = {class => 'mem',   lat => 2,   blat => 30,  rlat => 2, rhold => 20, tput => 1,   dual => 1, reuse => 0};
+my $gmemT = {class => 'mem',   lat => 2,   blat => 200, rlat => 4, rhold => 20, tput => 1,   dual => 1, reuse => 0};
+my $x32T  = {class => 'x32',   lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 1};
+my $x64T  = {class => 'x64',   lat => 2,   blat => 128, rlat => 0, rhold => 0,  tput => 128, dual => 0, reuse => 1};
+my $shftT = {class => 'shift', lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1};
+my $cmpT  = {class => 'cmp',   lat => 13,  blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1};
+my $qtrT  = {class => 'qtr',   lat => 8,   blat => 0,   rlat => 4, rhold => 0,  tput => 1,   dual => 1, reuse => 0};
+my $rroT  = {class => 'rro',   lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+my $voteT = {class => 'vote',  lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+
+
+# Create map of op names to rules
+our %grammar =
+(
+    #Floating Point Instructions
+    FADD     => [ { type => $x32T,  code => 0x5c58000000000000, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $fcr20;"o,               } ],
+    FADD32I  => [ { type => $x32T,  code => 0x0800000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FCHK     => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o,                     } ], #Partial?
+    FCMP     => [ { type => $cmpT,  code => 0x5ba0000000000000, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $fcr20, $r39;"o,            } ],
+    FFMA     => [
+                  { type => $x32T,  code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $fcr20, $r39;"o,         },
+                  { type => $x32T,  code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o,     },
+                ],
+    FMNMX    => [ { type => $shftT, code => 0x5c60000000000000, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $fcr20, $p39;"o,                } ],
+    FMUL     => [ { type => $x32T,  code => 0x5c68000000000000, rule => qr"^$pred?FMUL$ftz$rnd$sat $r0, $r8, $fcr20;"o,               } ],
+    FMUL32I  => [ { type => $x32T,  code => 0x1e00000000000000, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FSET     => [ { type => $shftT, code => 0x5800000000000000, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $fcr20, $p39;"o,       } ],
+    FSETP    => [ { type => $cmpT,  code => 0x5bb0000000000000, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ],
+    MUFU     => [ { type => $qtrT,  code => 0x5080000000000000, rule => qr"^$pred?MUFU$func $r0, $r8;"o,                              } ],
+    RRO      => [ { type => $rroT,  code => 0x5c90000000000000, rule => qr"^$pred?RRO$rro $r0, $r20;"o,                               } ],
+    DADD     => [ { type => $x64T,  code => 0x5c70000000000000, rule => qr"^$pred?DADD$rnd $r0, $r8, $dr20;"o,                        } ],
+    DFMA     => [ { type => $x64T,  code => 0x5b70000000000000, rule => qr"^$pred?DFMA$rnd $r0, $r8, $dr20, $r39;"o,                  } ],
+    DMNMX    => [ { type => $cmpT,  code => 0x5c50000000000000, rule => qr"^$pred?DMNMX $r0, $r8, $dr20, $p39;"o,                     } ],
+    DMUL     => [ { type => $x64T,  code => 0x5c80000000000000, rule => qr"^$pred?DMUL$rnd $r0, $r8, $dr20;"o,                        } ],
+    DSET     => [ { type => $cmpT,  code => 0x5900000000000000, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o,            } ],
+    DSETP    => [ { type => $cmpT,  code => 0x5b80000000000000, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o,      } ],
+    FSWZADD  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o,                                    } ], #TODO
+
+    HADD2     => [ { type => $x32T,  code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o,               } ],
+    HMUL2     => [ { type => $x32T,  code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o,               } ],
+    HFMA2     => [ { type => $x32T,  code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o,         } ],
+    HSETP2    => [ { type => $cmpT,  code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial
+
+    #Integer Instructions
+    BFE       => [ { type => $shftT,  code => 0x5c01000000000000, rule => qr"^$pred?BFE$u32 $r0, $r8, $icr20;"o,                          } ],
+    BFI       => [ { type => $shftT,  code => 0x5bf0000000000000, rule => qr"^$pred?BFI $r0, $r8, $ir20, $cr39;"o,                        } ],
+    FLO       => [ { type => $s2rT,   code => 0x5c30000000000000, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o,                              } ],
+    IADD      => [ { type => $x32T,   code => 0x5c10000000000000, rule => qr"^$pred?IADD$sat$X $r0cc, $r8, $icr20;"o,                         } ],
+    IADD32I   => [ { type => $x32T,   code => 0x1c00000000000000, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o,                         } ],
+    IADD3     => [ { type => $x32T,   code => 0x5cc0000000000000, rule => qr"^$pred?IADD3$add3 $r0cc, $r8, $icr20, $r39;"o,                 } ],
+    ICMP      => [ { type => $cmpT,   code => 0x5b41000000000000, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $icr20, $r39;"o,              } ],
+    IMNMX     => [ { type => $shftT,  code => 0x5c21000000000000, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $icr20, $p39;"o,                  } ],
+    ISET      => [ { type => $shftT,  code => 0x5b51000000000000, rule => qr"^$pred?ISET$icmp$u32$X$bool $r0, $r8, $icr20, $p39;"o,       } ],
+    ISETP     => [ { type => $cmpT,   code => 0x5b61000000000000, rule => qr"^$pred?ISETP$icmp$u32$X$bool $p3, $p0, $r8, $icr20, $p39;"o, } ],
+    ISCADD    => [ { type => $shftT,  code => 0x5c18000000000000, rule => qr"^$pred?ISCADD $r0, $r8, $icr20, $i39w8;"o,                   } ],
+    ISCADD32I => [ { type => $shftT,  code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o,               } ],
+    LEA       => [
+                   { type => $cmpT,   code => 0x5bd0000000000000, rule => qr"^$pred?LEA $p48, $r0cc, $r8, $icr20;"o,                      },
+                   { type => $shftT,  code => 0x5bd7000000000000, rule => qr"^$pred?LEA $r0cc, $r8, $icr20, $i39w8;"o,                    },
+                   { type => $shftT,  code => 0x5bdf004000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $r20, $r39, $i28w8;"o,          },
+                   { type => $shftT,  code => 0x0a07000000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $c20, $r39, $i51w5;"o,          },
+                 ],
+    LOP       => [ { type => $x32T,   code => 0x5c40000000000000, rule => qr"^$pred?LOP$bool$lopz $r0, $r8, (?<INV>~)?$icr20(?<INV>\.INV)?;"o, } ],
+    LOP32I    => [ { type => $x32T,   code => 0x0400000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o,                     } ],
+    LOP3      => [
+                   { type => $x32T,   code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o,            },
+                   { type => $x32T,   code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o,            },
+                 ],
+    POPC      => [ { type => $s2rT,   code => 0x5c08000000000000, rule => qr"^$pred?POPC $r0, $r20;"o,                                    } ],
+    SHF       => [
+                   { type => $shftT,  code => 0x5bf8000000000000, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $ir20, $r39;"o,                  },
+                   { type => $shftT,  code => 0x5cf8000000000000, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $ir20, $r39;"o,                  },
+                 ],
+    SHL       => [ { type => $shftT,  code => 0x5c48000000000000, rule => qr"^$pred?SHL(?<W>\.W)? $r0, $r8, $icr20;"o,                    } ],
+    SHR       => [ { type => $shftT,  code => 0x5c29000000000000, rule => qr"^$pred?SHR$u32 $r0, $r8, $icr20;"o,                          } ],
+    XMAD      => [
+                   { type => $x32T,   code => 0x5b00000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $ir20, $r39;"o,                 },
+                   { type => $x32T,   code => 0x5900000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $r39s20, $c20s39;"o,            },
+                   { type => $x32T,   code => 0x5e00000000000000, rule => qr"^$pred?XMAD$xmadc $r0cc, $r8, $c20x, $r39;"o,                  },
+                 ],
+    # XMAD replaces these
+    IMAD      => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMAD[^;]*;"o,   } ], #TODO
+    IMADSP    => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO
+    IMUL      => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMUL[^;]*;"o,   } ], #TODO
+
+    #Conversion Instructions
+    F2F => [ { type => $qtrT,  code => 0x5ca8000000000000, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ],
+    F2I => [ { type => $qtrT,  code => 0x5cb0000000000000, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o,         } ],
+    I2F => [ { type => $qtrT,  code => 0x5cb8000000000000, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o,               } ],
+    I2I => [ { type => $qtrT,  code => 0x5ce0000000000000, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o,               } ],
+
+    #Movement Instructions
+    MOV    => [ { type => $x32T,  code => 0x5c98078000000000, rule => qr"^$pred?MOV $r0, $icr20;"o,                   } ],
+    MOV32I => [ { type => $x32T,  code => 0x010000000000f000, rule => qr"^$pred?MOV32I $r0, (?:$i20w32|$f20w32);"o,   } ],
+    PRMT   => [ { type => $x32T,  code => 0x5bc0000000000000, rule => qr"^$pred?PRMT$prmt $r0, $r8, $icr20, $cr39;"o, } ],
+    SEL    => [ { type => $x32T,  code => 0x5ca0000000000000, rule => qr"^$pred?SEL $r0, $r8, $icr20, $p39;"o,        } ],
+    SHFL   => [ { type => $smemT, code => 0xef10000000000000, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ],
+
+    #Predicate/CC Instructions
+    PSET   => [ { type => $cmpT,  code => 0x5088000000000000, rule => qr"^$pred?PSET$bool2$bool $r0, $p12, $p29, $p39;"o,       } ],
+    PSETP  => [ { type => $cmpT,  code => 0x5090000000000000, rule => qr"^$pred?PSETP$bool2$bool $p3, $p0, $p12, $p29, $p39;"o, } ],
+    CSET   => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o,  } ], #TODO
+    CSETP  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO
+    P2R    => [ { type => $x32T,  code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o,   } ],
+    R2P    => [ { type => $cmpT,  code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o,   } ],
+
+    #Texture Instructions
+    # Handle the commonly used 1D texture functions.. but save the others for later
+    TLD    => [ { type => $gmemT, code => 0xdd38000000000000, rule => qr"^$pred?TLD\.B\.LZ\.$tld $r0, $r8, $r20, $hex, \dD, $i31w4;"o, } ], #Partial
+    TLDS   => [ { type => $gmemT, code => 0xda0000000ff00000, rule => qr"^$pred?TLDS\.LZ\.$tld $r28, $r0, $r8, $i36w20, \dD, $chnls;"o,} ], #Partial
+    TEX    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o,   } ], #TODO
+    TLD4   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o,  } ], #TODO
+    TXQ    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o,   } ], #TODO
+    TEXS   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEXS[^;]*;"o,  } ], #TODO
+    TLD4S  => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4S[^;]*;"o, } ], #TODO
+
+    #Compute Load/Store Instructions
+    LD     => [ { type => $gmemT, code => 0x8000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr, $p58;"o,      } ],
+    ST     => [ { type => $gmemT, code => 0xa000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0, $p58;"o,      } ],
+    LDG    => [ { type => $gmemT, code => 0xeed0000000000000, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o,           } ],
+    STG    => [ { type => $gmemT, code => 0xeed8000000000000, rule => qr"^$pred?STG$memCache$memType $addr, $r0;"o,           } ],
+    LDS    => [ { type => $smemT, code => 0xef48000000000000, rule => qr"^$pred?LDS$memCache$memType $r0, $addr;"o,           } ],
+    STS    => [ { type => $smemT, code => 0xef58000000000000, rule => qr"^$pred?STS$memCache$memType $addr, $r0;"o,           } ],
+    LDL    => [ { type => $gmemT, code => 0xef40000000000000, rule => qr"^$pred?LDL$memCache$memType $r0, $addr;"o,           } ],
+    STL    => [ { type => $gmemT, code => 0xef50000000000000, rule => qr"^$pred?STL$memCache$memType $addr, $r0;"o,           } ],
+    LDC    => [ { type => $gmemT, code => 0xef90000000000000, rule => qr"^$pred?LDC$memCache$memType $r0, $ldc;"o,            } ],
+    # Note for ATOM(S).CAS operations the last register needs to be in sequence with the second to last (as it's not encoded).
+    ATOM   => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o,   } ],
+    ATOMS  => [ { type => $smemT, code => 0xec00000000000000, rule => qr"^$pred?ATOMS$atom $r0, $addr2, $r20(?:, $r39a)?;"o,  } ],
+    RED    => [ { type => $gmemT, code => 0xebf8000000000000, rule => qr"^$pred?RED$atom $addr2, $r0;"o,                      } ],
+    CCTL   => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o,  } ], #TODO
+    CCTLL  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO
+    CCTLT  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLT[^;]*;"o, } ], #TODO
+
+    #Surface Memory Instructions (haven't gotten to these yet..)
+    SUATOM => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUATOM[^;]*;"o, } ], #TODO
+    SULD   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o,   } ], #TODO
+    SURED  => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SURED[^;]*;"o,  } ], #TODO
+    SUST   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUST[^;]*;"o,   } ], #TODO
+
+    #Control Instructions
+    BRA    => [
+                { type => $x32T, code => 0xe24000000000000f, rule => qr"^$pred?BRA(?<U>\.U)? $i20w24;"o,         },
+                { type => $x32T, code => 0xe240000000000002, rule => qr"^$pred?BRA(?<U>\.U)? CC\.EQ, $i20w24;"o, },
+              ],
+    BRX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o,                      } ], #TODO
+    JMP    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMP[^;]*;"o,                      } ], #TODO
+    JMX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o,                      } ], #TODO
+    SSY    => [ { type => $x32T, code => 0xe290000000000000, rule => qr"^$noPred?SSY $i20w24;"o,                 } ],
+    SYNC   => [ { type => $x32T, code => 0xf0f800000000000f, rule => qr"^$pred?SYNC;"o,                          } ],
+    CAL    => [ { type => $x32T, code => 0xe260000000000040, rule => qr"^$noPred?CAL $i20w24;"o,                 } ],
+    JCAL   => [ { type => $x32T, code => 0xe220000000000040, rule => qr"^$noPred?JCAL $i20w24;"o,                } ],
+    PRET   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o,                     } ], #TODO
+    RET    => [ { type => $x32T, code => 0xe32000000000000f, rule => qr"^$pred?RET;"o,                           } ],
+    BRK    => [ { type => $x32T, code => 0xe34000000000000f, rule => qr"^$pred?BRK;"o,                           } ],
+    PBK    => [ { type => $x32T, code => 0xe2a0000000000000, rule => qr"^$noPred?PBK $i20w24;"o,                 } ],
+    CONT   => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o,                          } ],
+    PCNT   => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o,                } ],
+    EXIT   => [ { type => $x32T, code => 0xe30000000000000f, rule => qr"^$pred?EXIT;"o,                          } ],
+    PEXIT  => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PEXIT[^;]*;"o,                    } ], #TODO
+    BPT    => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o,           } ],
+
+    #Miscellaneous Instructions
+    NOP    => [ { type => $x32T,  code => 0x50b0000000000f00, rule => qr"^$pred?NOP;"o,                                     } ],
+    CS2R   => [ { type => $x32T,  code => 0x50c8000000000000, rule => qr"^$pred?CS2R $r0, $sr;"o,                           } ],
+    S2R    => [ { type => $s2rT,  code => 0xf0c8000000000000, rule => qr"^$pred?S2R $r0, $sr;"o,                            } ],
+    B2R    => [ { type => $x32T,  code => 0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o,                                 } ],
+    BAR    => [ { type => $gmemT, code => 0xf0a8000000000000, rule => qr"^$pred?BAR$bar;"o,                                 } ],
+    DEPBAR => [
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, },
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o,              },
+              ],
+    MEMBAR => [ { type => $x32T,  code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o,                             } ],
+    VOTE   => [ { type => $voteT, code => 0x50d8000000000000, rule => qr"^$pred?VOTE$vote (?:$r0, |(?<nor0>))$p45, $p39;"o, } ],
+    R2B    => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?R2B[^;]*;"o,                                } ], #TODO
+
+    #Video Instructions... Need to finish
+    VADD   => [   { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMAD   => [
+                  { type => $x32T,  code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, },
+                  { type => $shftT, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, },
+              ],
+    VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMNMX    => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+
+    VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+);
+
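+# Create map of capture groups to op code flags that need to be added (or removed).
+# Table format (consumed line by line by the parsing loop that follows the table):
+#   OP1, OP2, ...          header: the values below are keyed by capture group name
+#                          (the mere existence of the capture selects the value)
+#   OP1, OP2, ...: name    header: the values below are keyed by the text captured
+#                          in the named group "name"
+#   0x<hex> <key>          value line: bit pattern for <key> (the key may be empty)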
+my @flags = grep /\S/, split "\n", q{;
+
+BFE, BFI, FLO, IADD, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD
+0x0100000000000000 neg
+
+FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP
+0x0100000000000000 neg
+
+PSET, PSETP
+0x0000000000008000 p12not
+0x0000000100000000 p29not
+
+FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE
+0x0000040000000000 p39not
+
+IADD, IADD3, XMAD, LEA, IMNMX
+0x0000800000000000 CC
+
+IADD32I
+0x0010000000000000 CC
+
+LEA
+0x0000000000000000 X
+
+SHF
+0x0004000000000000 W
+0x0001000000000000 HI
+
+SHF: type
+0x0000004000000000 U64
+0x0000006000000000 S64
+
+SHR, IMNMX, ISETP, ISET, ICMP, BFE
+0x0001000000000000 U32
+
+SHL
+0x0000008000000000 W
+
+SHFL
+0x0000000010000000 i20w8
+0x0000000020000000 i34w13
+
+SHFL: mode
+0x0000000000000000 IDX
+0x0000000040000000 UP
+0x0000000080000000 DOWN
+0x00000000c0000000 BFLY
+
+IMNMX: mode
+0x0000080000000000 XLO
+0x0000180000000000 XHI
+
+ISETP, ISET, ICMP: cmp
+0x0002000000000000 LT
+0x0004000000000000 EQ
+0x0006000000000000 LE
+0x0008000000000000 GT
+0x000a000000000000 NE
+0x000c000000000000 GE
+
+ISETP, ISET, PSETP, PSET: bool
+0x0000000000000000 AND
+0x0000200000000000 OR
+0x0000400000000000 XOR
+
+PSETP, PSET: bool2
+0x0000000000000000 AND
+0x0000000001000000 OR
+0x0000000002000000 XOR
+
+ISETP, ISET
+0x0000080000000000 X
+
+LOP: bool
+0x0000000000000000 AND
+0x0000020000000000 OR
+0x0000040000000000 XOR
+0x0000060000000000 PASS_B
+
+LOP:
+0x0000010000000000 INV
+
+LOP: z
+0x0000200000000000 Z
+0x0000300000000000 NZ
+
+LOP
+0x0007000000000000 noz
+
+LOP32I: bool
+0x0000000000000000 AND
+0x0020000000000000 OR
+0x0040000000000000 XOR
+
+PRMT: mode
+0x0001000000000000 F4E
+0x0002000000000000 B4E
+0x0003000000000000 RC8
+0x0004000000000000 ECL
+0x0005000000000000 ECR
+0x0006000000000000 RC16
+
+XMAD: type1
+0x0000000000000000 U16
+0x0001000000000000 S16
+
+XMAD: type2
+0x0000000000000000 U16
+0x0002000000000000 S16
+
+XMAD: mode
+0x0000002000000000 MRG
+0x0000001000000000 PSL
+0x0008000000000000 CHI
+0x0004000000000000 CLO
+0x000c000000000000 CSFU
+
+XMAD: modec
+0x0004000000000000 CLO
+0x0008000000000000 CHI
+0x000c000000000000 CSFU
+0x0040000000000000 X
+0x0080000000000000 PSL
+0x0100000000000000 MRG
+
+XMAD
+0x0010000000000000 CBCC
+
+XMAD: r8part
+0x0000000000000000 H0
+0x0020000000000000 H1
+
+XMAD: r20part
+0x0000000000000000 H0
+0x0000000800000000 H1
+
+XMAD: r20partx
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+XMAD: r39part
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part
+0x0000000000000000 B0
+0x0000001000000000 B1
+0x0000002000000000 B2
+0x0000003000000000 B3
+0x0000001000000000 H1
+0x0000000000000000 H0
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part
+0x0000000000000000 B0
+0x0000000010000000 B1
+0x0000000020000000 B2
+0x0000000030000000 B3
+0x0000000010000000 H1
+0x0000000000000000 H0
+
+VMAD
+0x0040000000000000 r8neg
+0x0020000000000000 r39neg
+0x0008000000000000 SHR_7
+0x0010000000000000 SHR_15
+0x0060000000000000 PO
+0x0080000000000000 SAT
+
+VMNMX
+0x0100000000000000 MX
+
+VADD, VABSDIFF, VMNMX
+0x0080000000000000 SAT
+0x0040000000000000 UD
+0x0040000000000000 SD
+
+VSET: cmp
+0x0040000000000000 LT
+0x0080000000000000 EQ
+0x00c0000000000000 LE
+0x0100000000000000 GT
+0x0140000000000000 NE
+0x0180000000000000 GE
+
+VADD, VSET: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VABSDIFF: mode
+0x0003000000000000 ACC
+0x000b000000000000 MIN
+0x0013000000000000 MAX
+0x0023000000000000 MRG_16H
+0x002b000000000000 MRG_16L
+0x0033000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x003b000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMNMX: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1
+0x0000000000000000 U
+0x0001000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign2
+0x0000000000000000 U
+0x0002000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size1
+0x0000000000000000 8
+0x0000004000000000 16
+0x0000006000000000 32
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size2
+0x0000000000000000 8
+0x0000000040000000 16
+0x0000000060000000 32
+
+IADD3: type
+0x0001000000000000 X
+0x0000002000000000 RS
+0x0000004000000000 LS
+
+IADD3: r8part
+0x0000000000000000 H0
+0x0000001000000000 H1
+
+IADD3: r20part
+0x0000000080000000 H0
+
+IADD3: r39part
+0x0000000200000000 H0
+
+IADD3
+0x0008000000000000 r8neg
+0x0004000000000000 r20neg
+0x0002000000000000 r39neg
+
+IADD
+0x0000080000000000 X
+0x0004000000000000 SAT
+
+IADD, ISCADD
+0x0002000000000000 r8neg
+0x0001000000000000 r20neg
+
+IADD32I
+0x0100000000000000 r8neg
+0x0020000000000000 X
+
+DEPBAR: SB
+0x0000000000000000 SB0
+0x0000000004000000 SB1
+0x0000000008000000 SB2
+0x000000000c000000 SB3
+0x0000000010000000 SB4
+0x0000000014000000 SB5
+
+DEPBAR: cmp
+0x0000000020000000 LE
+
+DEPBAR
+0x0000000000000001 db0
+0x0000000000000002 db1
+0x0000000000000004 db2
+0x0000000000000008 db3
+0x0000000000000010 db4
+0x0000000000000020 db5
+
+F2F, F2I, I2F, I2I: destWidth
+0x0000000000000000 8
+0x0000000000000100 16
+0x0000000000000200 32
+0x0000000000000300 64
+
+F2F, F2I, I2F, I2I: srcWidth
+0x0000000000000000 8
+0x0000000000000400 16
+0x0000000000000800 32
+0x0000000000000c00 64
+
+F2F, F2I, I2F, I2I: destSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000001000 S
+
+F2F, F2I, I2F, I2I: srcSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000002000 S
+
+F2I, I2F, I2I: r20part
+0x0000000000000000 H0
+0x0000040000000000 H1
+0x0000000000000000 B0
+0x0000020000000000 B1
+0x0000040000000000 B2
+0x0000060000000000 B3
+
+F2F: r20part
+0x0000000000000000 H0
+0x0000020000000000 H1
+
+F2F: round
+0x0000040000000000 ROUND
+0x0000048000000000 FLOOR
+0x0000050000000000 CEIL
+0x0000058000000000 TRUNC
+
+F2I: round
+0x0000000000000000 ROUND
+0x0000008000000000 FLOOR
+0x0000010000000000 CEIL
+0x0000018000000000 TRUNC
+
+HADD2, HMUL2: r8part
+0x0001000000000000 H0_H0
+0x0000000000000000 H1_H1
+
+HFMA2: r20part
+0x0000000020000000 H0_H0
+0x0000000030000000 H1_H1
+
+FADD, DADD, FMUL, DMUL, F2F, I2F: rnd
+0x0000000000000000 RN
+0x0000008000000000 RM
+0x0000010000000000 RP
+0x0000018000000000 RZ
+
+DFMA: rnd
+0x0000000000000000 RN
+0x0004000000000000 RM
+0x0008000000000000 RP
+0x000c000000000000 RZ
+
+FFMA: rnd
+0x0000000000000000 RN
+0x0008000000000000 RM
+0x0010000000000000 RP
+0x0018000000000000 RZ
+
+FFMA
+0x0020000000000000 FTZ
+
+F2F, F2I, FADD, FMUL, FMNMX
+0x0000100000000000 FTZ
+
+FADD32I
+0x0080000000000000 FTZ
+
+FMUL32I
+0x0020000000000000 FTZ
+
+FSET
+0x0080000000000000 FTZ
+
+FSETP, FCMP
+0x0000800000000000 FTZ
+
+HADD2, HMUL2
+0x0000008000000000 FTZ
+
+HFMA2
+0x0000002000000000 FTZ
+
+FADD, FFMA, FMUL, F2F, I2I
+0x0004000000000000 SAT
+
+FADD, DADD, FMNMX, DMNMX, MUFU
+0x0001000000000000 r8neg
+
+FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I
+0x0000200000000000 r20neg
+
+FMUL, DMUL, FFMA, DFMA
+0x0001000000000000 r20neg
+
+FFMA, DFMA
+0x0002000000000000 r39neg
+
+FADD, DADD, FMNMX, DMNMX
+0x0000400000000000 r8abs
+
+FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I
+0x0002000000000000 r20abs
+
+FSETP, DSETP, FSET, DSET
+0x0000080000000000 r8neg
+0x0000000000000040 r20neg
+0x0000000000000080 r8abs
+0x0000100000000000 r20abs
+
+RRO: func
+0x0000000000000000 SINCOS
+0x0000008000000000 EX2
+
+MUFU: func
+0x0000000000000000 COS
+0x0000000000100000 SIN
+0x0000000000200000 EX2
+0x0000000000300000 LG2
+0x0000000000400000 RCP
+0x0000000000500000 RSQ
+0x0000000000600000 RCP64H
+0x0000000000700000 RSQ64H
+
+FSETP, DSETP, FSET, DSET, FCMP: cmp
+0x0001000000000000 .LT
+0x0002000000000000 .EQ
+0x0003000000000000 .LE
+0x0004000000000000 .GT
+0x0004000000000000
+0x0005000000000000 .NE
+0x0006000000000000 .GE
+0x0007000000000000 .NUM
+0x0008000000000000 .NAN
+0x0009000000000000 .LTU
+0x000a000000000000 .EQU
+0x000b000000000000 .LEU
+0x000c000000000000 .GTU
+0x000d000000000000 .NEU
+0x000e000000000000 .GEU
+
+FSETP, DSETP, FSET, DSET: bool
+0x0000000000000000 AND
+0x0000200000000000 OR
+0x0000400000000000 XOR
+
+HSETP2: cmp
+0x0000002800000000 .NE
+
+HSETP2: bool
+0x0000000000000000 AND
+
+S2R: sr
+0x0000000000000000 LANEID
+0x0000000000200000 VIRTCFG
+0x0000000000300000 VIRTID
+0x0000000002100000 TID.X
+0x0000000002200000 TID.Y
+0x0000000002300000 TID.Z
+0x0000000002500000 CTAID.X
+0x0000000002600000 CTAID.Y
+0x0000000002700000 CTAID.Z
+0x0000000003800000 EQMASK
+0x0000000003900000 LTMASK
+0x0000000003a00000 LEMASK
+0x0000000003b00000 GTMASK
+0x0000000003c00000 GEMASK
+
+CS2R: sr
+0x0000000005000000 CLOCKLO
+0x0000000005100000 CLOCKHI
+0x0000000005200000 GLOBALTIMERLO
+0x0000000005300000 GLOBALTIMERHI
+
+B2R
+0x0000e00000000000 nop45
+
+BAR
+0x0000100000000000 i8w4
+0x0000080000000000 nor20
+0x0000038000000000 nop39
+
+BAR: mode
+0x0000000000000000 SYNC
+0x0000000100000000 ARV
+0x0000000200000000 RED
+
+BAR: red
+0x0000000000000000 POPC
+0x0000000800000000 AND
+0x0000001000000000 OR
+
+MEMBAR: mode
+0x0000000000000000 CTA
+0x0000000000000100 GL
+0x0000000000000200 SYS
+
+VOTE: mode
+0x0000000000000000 ALL
+0x0001000000000000 ANY
+0x0002000000000000 EQ
+
+VOTE
+0x00000000000000ff nor0
+
+BRA
+0x0000000000000080 U
+
+TLDS: chnls
+0x0010000000000000 RGBA
+
+TLDS
+0x0002000000000000 NODEP
+
+LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS
+0x000000000000ff00 nor8
+
+LD, ST: type
+0x0000000000000000 .U8
+0x0020000000000000 .S8
+0x0040000000000000 .U16
+0x0060000000000000 .S16
+0x0080000000000000
+0x0080000000000000 .32
+0x00a0000000000000 .64
+0x00c0000000000000 .128
+
+LD, ST: cache
+0x0100000000000000 CG
+0x0200000000000000 CS
+0x0300000000000000 CV
+0x0300000000000000 WT
+
+LDG, STG, LDS, STS, LDL, STL, LDC: type
+0x0000000000000000 .U8
+0x0001000000000000 .S8
+0x0002000000000000 .U16
+0x0003000000000000 .S16
+0x0004000000000000
+0x0004000000000000 .32
+0x0005000000000000 .64
+0x0006000000000000 .128
+
+LDG, STG: cache
+0x0000400000000000 CG
+0x0000800000000000 CI
+0x0000800000000000 CS
+0x0000c00000000000 CV
+0x0000c00000000000 WT
+
+LDL: cache
+0x0000200000000000 CI
+
+LDC: cache
+0x0000100000000000 IL
+
+LDG, STG, LDS, STS, LDL, STL, LDC
+0x0000200000000000 E
+
+LDS
+0x0000100000000000 U
+
+RED: type
+0x0000000000000000
+0x0000000000100000 .S32
+0x0000000000200000 .U64
+0x0000000000300000 .F32.FTZ.RN
+0x0000000000400000 .F16x2.FTZ.RN
+0x0000000000500000 .S64
+
+RED: mode
+0x0000000000000000 ADD
+0x0000000000800000 MIN
+0x0000000001000000 MAX
+0x0000000001800000 INC
+0x0000000002000000 DEC
+0x0000000002800000 AND
+0x0000000003000000 OR
+0x0000000003800000 XOR
+
+ATOM: type
+0x0000000000000000
+0x0002000000000000 .S32
+0x0004000000000000 .U64
+0x0006000000000000 .F32.FTZ.RN
+0x0008000000000000 .F16x2.FTZ.RN
+0x000a000000000000 .S64
+0x0002000000000000 .64
+
+ATOM, RED
+0x0001000000000000 E
+
+ATOM: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x03f0000000000000 CAS
+
+ATOMS: type
+0x0000000000000000
+0x0000000010000000 .S32
+0x0000000020000000 .U64
+0x0000000030000000 .S64
+0x0010000000000000 .64
+
+ATOMS: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x0240000000000000 CAS
+};
+
+# Build %flags from the @flags table.
+# The existence of a capture group can map directly to an op code adjustment:
+#     $flags{OP}{capture}     = bits
+# or the value of a named capture group can select the adjustment from several options:
+#     $flags{OP}{name}{value} = bits
+our %flags;
+my (@ops, $flag);
+for my $entry (@flags)
+{
+    # value line: a hex bit pattern followed by the (possibly empty) key it belongs to
+    if (my ($hexBits, $key) = $entry =~ m'^(0x[0-9a-z]+)\s*(.*)')
+    {
+        my $val = hex($hexBits);
+        foreach my $op (@ops)
+        {
+            # named rules (op: name) nest one level deeper than simple existence rules
+            if ($flag) { $flags{$op}{$flag}{$key} = $val; }
+            else       { $flags{$op}{$key}        = $val; }
+        }
+        next;
+    }
+
+    # header line: remember which ops (and which named group, if any) the next values apply to
+    my ($opList, $groupName) = split ':\s*', $entry;
+    @ops  = split ',\s*', $opList;
+    $flag = $groupName;
+}
+
+# Match one instruction string against a single grammar entry.
+#   $inst    - instruction text to match
+#   $grammar - hash ref whose {rule} holds the compiled regex for one instruction form
+# Returns a hash ref holding a copy of the named captures (%+) when the rule
+# matches, or nothing (empty list / undef) when it does not.
+sub parseInstruct
+{
+    my ($inst, $grammar) = @_;
+
+    if ($inst =~ $grammar->{rule})
+    {
+        # copy the captures right away: %+ always reflects the last successful match
+        my %captured = %+;
+        return \%captured;
+    }
+    return;
+}
+
+# For immediate or constant operands and a given opcode, bits 56-63 get transformed.
+
+# capture group names that mark an immediate operand
+my %immedOps = (i20 => 1, f20 => 1, d20 => 1);
+
+# top byte of the base op code => value XORed into bits 56-63 for the immediate form
+my %immedCodes = (0x5c => 0x64, 0x5b => 0x6d, 0x59 => 0x6b, 0x58 => 0x68);
+
+# constant operand capture name => value XORed into bits 56-63
+my %constCodes = (c20 => 0x10, c39 => 0x08);
+
+# reuse capture name => its bit in the reuse mask
+my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4);
+
+# Just pick out the reuse code and nothing else.
+#   $capData - hash ref of named captures from a parsed instruction
+# Returns the OR of the %reuseCodes bits for every reuse key holding a true value.
+sub genReuseCode
+{
+    my $capData = shift;
+
+    my $mask = 0;
+    for my $slot (keys %reuseCodes)
+    {
+        next unless $capData->{$slot};
+        $mask |= $reuseCodes{$slot};
+    }
+    return $mask;
+}
+
+# Generate an op code from regex capture data.
+#   $op      - op name, used to look up its flag table in %flags
+#   $grammar - grammar entry; its {code} is the base op code
+#   $capData - hash ref of named captures; the predicate and reuse keys are
+#              deleted from it while they are processed
+#   $test    - optional array ref; when passed it is populated with the names
+#              of the capture groups that were acted upon
+# Returns the list ($code, $reuse).
+sub genCode
+{
+    my ($op, $grammar, $capData, $test) = @_;
+
+    my $opFlags   = $flags{$op};
+    my $code      = $grammar->{code};
+    my $immedCode = $immedCodes{$code >> 56};
+    my $reuse     = 0;
+
+    # process the instruction predicate (if valid for this instruction)
+    if (exists $capData->{noPred})
+    {
+        delete $capData->{noPred};
+        push @$test, 'noPred' if $test;
+    }
+    else
+    {
+        # a missing predicate number encodes as 7; a negated predicate sets bit 3
+        my $predBits = $capData->{predNum} // 7;
+        push @$test, 'predNum' if $test;
+        if (exists $capData->{predNot})
+        {
+            $predBits |= 8;
+            push @$test, 'predNot' if $test;
+        }
+        $code ^= $predBits << 16;
+        delete @{$capData}{qw(predNum predNot)};
+    }
+
+    # process the register reuse flags
+    for my $rcode (qw(reuse1 reuse2 reuse3))
+    {
+        next unless delete $capData->{$rcode};
+        $reuse |= $reuseCodes{$rcode};
+        push @$test, $rcode if $test;
+    }
+
+    for my $capture (keys %$capData)
+    {
+        # change the base code for immediate versions of the op
+        if (exists $immedOps{$capture})
+        {
+            $code ^= $immedCode << 56;
+        }
+        # change the base code for constant versions of the op
+        elsif (exists $constCodes{$capture})
+        {
+            $code ^= $constCodes{$capture} << 56;
+        }
+
+        # if capture group is an operand then process and add that data to code
+        # (but don't process the r20 that comes with the r39s20 capture)
+        if (exists $operands{$capture}
+            && ($capture ne 'r20' || !exists $capData->{r39s20}))
+        {
+            $code ^= $operands{$capture}->($capData->{$capture});
+            push @$test, $capture if $test;
+        }
+
+        # Add matching flags (an operand might also add/remove a flag)
+        if (exists $opFlags->{$capture})
+        {
+            my $flagEntry = $opFlags->{$capture};
+            if (ref $flagEntry)
+            {
+                # a named multivalue flag: the captured text selects the bits
+                $code ^= $flagEntry->{$capData->{$capture}};
+                push @$test, "$capture:$capData->{$capture}" if $test;
+            }
+            else
+            {
+                # a simple exists flag
+                $code ^= $flagEntry;
+                push @$test, $capture if $test;
+            }
+            next;
+        }
+
+        # Every capture group should be acted upon.  Missing one is a bug.
+        # (not reported in test mode or when the capture was handled as an operand)
+        next if $test || exists $operands{$capture};
+        warn "UNUSED: $op: $capture: $capData->{$capture}\n";
+        warn Dumper($opFlags);
+    }
+
+    return $code, $reuse;
+}
+
+
+# Control prefix of an assembly line: five ':'-separated fields.  readCtrl()
+# unpacks them, in order, as wait mask, read barrier, write barrier, yield
+# flag and stall count.
+my $CtrlRe = qr'(?<ctrl>[0-9a-fA-F\-]{2}:[1-6\-]:[1-6\-]:[\-yY]:[0-9a-fA-F])';
+# Instruction predicate such as "@P0 " or "@!P0 " (the trailing whitespace is part of <pred>).
+my $PredRe = qr'(?<pred>@!?(?<predReg>P\d)\s+)';
+# Optional predicate, the op name, then everything up to and including the first ';'.
+my $InstRe = qr"$PredRe?(?<op>\w+)(?<rest>[^;]*;)"o;
+# Whatever is left on the line after the instruction.
+my $CommRe = qr'(?<comment>.*)';
+
+# Parse one line of assembly source: control prefix, whitespace, optional
+# predicate, op, operands up to ';' and a trailing comment.
+#   $line    - the raw source line
+#   $lineNum - its line number, stored unchanged in the result
+# Returns a hash ref describing the line, or undef when the line does not match.
+sub processAsmLine
+{
+    my ($line, $lineNum) = @_;
+
+    return undef unless $line =~ m"^$CtrlRe(?<space>\s+)$InstRe$CommRe"o;
+
+    # snapshot the named captures before building the result
+    my %part = %+;
+
+    my %asm = (
+        lineNum => $lineNum,
+        pred    => $part{pred},
+        predReg => $part{predReg},
+        space   => $part{space},
+        op      => $part{op},
+        comment => $part{comment},
+        inst    => normalizeSpacing($part{pred} . $part{op} . $part{rest}),
+        ctrl    => readCtrl($part{ctrl}, $line),
+    );
+    return \%asm;
+}
+
+# Parse one instruction line of a SASS listing of the form
+#   /*<hex address>*/  [@[!]Pn] OP operands;  /* 0x<hex op code> ...
+# Returns a hash ref with the numeric address (num), predicate text, op name,
+# the instruction without (ins) and with (inst) its predicate, and the numeric
+# op code; returns undef when the line does not match.
+sub processSassLine
+{
+    my $line = shift;
+
+    return undef unless $line =~ m"^\s+/\*(?<num>[0-9a-f]+)\*/\s+$InstRe\s+/\* (?<code>0x[0-9a-f]+)"o;
+
+    # snapshot the named captures before building the result
+    my %part = %+;
+
+    my %sass = (
+        num     => hex($part{num}),
+        pred    => $part{pred},
+        op      => $part{op},
+        ins     => normalizeSpacing($part{op} . $part{rest}),
+        inst    => normalizeSpacing($part{pred} . $part{op} . $part{rest}),
+        code    => hex($part{code}),
+    );
+    return \%sass;
+}
+
+# Decode a control line of the form "   /* 0x<hex> ..." from a SASS listing.
+# The 64-bit word is read as three 21-bit slots starting at bits 0, 21 and 42;
+# each slot holds a 17-bit control code followed by a 4-bit reuse value.
+#   $line - the listing line
+#   $ctrl - optional array ref; the three control codes are appended to it
+#   $ruse - optional array ref; the three reuse values are appended to it
+# Returns 1 when the line was a control line, 0 otherwise.
+sub processSassCtrlLine
+{
+    my ($line, $ctrl, $ruse) = @_;
+
+    return 0 unless $line =~ m'^\s+\/\* (0x[0-9a-f]+)';
+    my $word = hex($1);
+
+    # control codes: low 17 bits of each slot
+    push @$ctrl, map { ($word >> $_) & 0x1ffff } (0, 21, 42)
+        if ref $ctrl;
+
+    # reuse values: the 4 bits directly above each control code
+    push @$ruse, map { ($word >> $_) & 0xf } (17, 38, 59)
+        if ref $ruse;
+
+    return 1;
+}
+
+# Expand the XMAD.LO, XMAD.LO2 and XMAD.LO2C macro lines found in the assembly
+# text into sequences of plain XMAD instructions.
+#   $file - the whole assembly source as one string
+# Each expanded line reuses the control prefix, spacing and predicate of the
+# macro line; the macro's trailing comment is attached to the first emitted line.
+# Dies when a macro's destination and first operand are the same register.
+# Returns the rewritten text.
+sub replaceXMADs
+{
+    my $file = shift;
+
+# Expansion of XMAD.LO (x is a scratch register):
+# XMAD.LO d, a, b, c, x;
+# ----------------------
+# XMAD.MRG x, a, b.H1, RZ;
+# XMAD d, a, b, c;
+# XMAD.PSL.CBCC d, a.H1, x.H1, d;
+# ----------------------
+# Two instruction form, as emitted for XMAD.LO2 below (shown with b = 0xffff):
+# XMAD d, a, 0xffff, c;
+# XMAD.PSL d, a.H1, 0xffff, d;
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD\.LO\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<x>\w+)\s*;$CommRe/
+
+        die "XMAD.LO: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s
+%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s;
+%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;',
+                @+{qw(ctrl space pred d a b c x comment)}
+    /egmos;
+
+# XMAD[.type.type].LO2 d, a, b, c;   (b may be an immediate, c may be a constant)
+# ----------------------
+# XMAD[.type.type] d, a, b, c;
+# XMAD[.type.type].PSL d, a.H1, b, d;
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>-?$immed|\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/
+
+        die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+# XMAD[.type.type].LO2C d, a, b, c;   (b may be a constant)
+# ----------------------
+# XMAD[.type.type] d, a, b, c;
+# XMAD[.type.type].PSL d, a, b.H1, d;
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<c>\w+)\s*;$CommRe/
+
+        die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    #TODO: add more XMAD macros
+    return $file;
+}
+# Convert extra spaces to single spacing to make our regexes simpler:
+# every tab becomes a space, then every run of two or more whitespace
+# characters is collapsed into one space.  Returns the normalized copy.
+sub normalizeSpacing
+{
+    (my $text = shift) =~ tr/\t/ /;
+    $text =~ s/\s{2,}/ /g;
+    return $text;
+}
+
+
+# Map the binary control notation onto an easier to work with text format
+# "wait:read:write:yield:stall".
+#   bits 0-3   stall count, printed as a hex digit
+#   bit  4     yield flag: '-' when set, 'Y' when clear
+#   bits 5-7   write dependency barrier: '-' for 7, otherwise value + 1
+#   bits 8-10  read dependency barrier:  '-' for 7, otherwise value + 1
+#   bits 11-16 wait-on-barrier mask, two hex digits, '--' when zero
+sub printCtrl
+{
+    my $code = shift;
+
+    my $stallCount = $code & 0xf;
+    my $yieldBit   = ($code >> 4)  & 0x1;
+    my $writeBar   = ($code >> 5)  & 0x7;
+    my $readBar    = ($code >> 8)  & 0x7;
+    my $waitMask   = ($code >> 11) & 0x3f;
+
+    return sprintf '%s:%s:%s:%s:%x',
+        ($waitMask ? sprintf('%02x', $waitMask) : '--'),
+        ($readBar  == 7 ? '-' : $readBar  + 1),
+        ($writeBar == 7 ? '-' : $writeBar + 1),
+        ($yieldBit ? '-' : 'Y'),
+        $stallCount;
+}
+# Convert the text control notation "wait:read:write:yield:stall" into its
+# binary control code.
+#   $ctrl    - the five ':'-separated fields
+#   $context - text appended to the error message (e.g. the offending line)
+# Dies when the wait mask does not fit in 6 bits.
+sub readCtrl
+{
+    my ($ctrl, $context) = @_;
+    my ($waitTxt, $readTxt, $writeTxt, $yieldTxt, $stallTxt) = split ':', $ctrl;
+
+    my $waitMask   = $waitTxt  eq '--' ? 0 : hex $waitTxt;
+    my $readBar    = $readTxt  eq '-'  ? 7 : $readTxt  - 1;   # barriers are written 1-based
+    my $writeBar   = $writeTxt eq '-'  ? 7 : $writeTxt - 1;
+    my $yieldBit   = lc($yieldTxt) eq 'y' ? 0 : 1;            # 'Y' clears the bit, '-' sets it
+    my $stallCount = hex $stallTxt;
+
+    unless (($waitMask & 0x3f) == $waitMask)
+    {
+        die sprintf('wait dep out of range(0x00-0x3f): %x at %s',   $waitMask, $context);
+    }
+
+    my $code = $waitMask << 11;
+    $code |= $readBar    << 8;
+    $code |= $writeBar   << 5;
+    $code |= $yieldBit   << 4;
+    $code |= $stallCount << 0;
+    return $code;
+}
+
+# Resolve a register name through the register map.
+#   $regMap  - hash ref: register name => mapped value (plain scalar or reference)
+#   $regName - the name to resolve
+# Returns the mapped value when the name is in the map and its value is not a
+# reference; otherwise returns $regName unchanged.
+sub getRegNum
+{
+    my ($regMap, $regName) = @_;
+
+    if (exists $regMap->{$regName})
+    {
+        my $mapped = $regMap->{$regName};
+        return $mapped unless ref $mapped;
+    }
+    return $regName;
+}
+
+# List the registers covered by the r0 operand of an instruction.
+#   $vectors - hash ref: vector register name => array ref of its member registers
+#   $capData - hash ref of named captures (reads r0, type and i31w4)
+# Returns nothing when r0 is missing/false or is RZ.  A type of '.64' (or an
+# i31w4 of '0x3') yields two registers, '.128' (or '0xf') yields four, anything
+# else yields just r0.  Plain "R<n>" names expand to consecutive registers,
+# other names are looked up in $vectors (confess when they are not suitable).
+sub getVecRegisters
+{
+    my ($vectors, $capData) = @_;
+
+    my $regName = $capData->{r0};
+    return unless $regName;
+    return if $regName eq 'RZ';
+
+    # number of registers the operand spans
+    my $count = ($capData->{type} eq '.64'  || $capData->{i31w4} eq '0x3') ? 2
+              : ($capData->{type} eq '.128' || $capData->{i31w4} eq '0xf') ? 4
+              :                                                              1;
+    return $regName if $count == 1;
+
+    # numbered register: expand to the consecutive run starting at it
+    if ($regName =~ m'^R(\d+)$')
+    {
+        return map "R$_", ($1 .. $1 + $count - 1);
+    }
+
+    # named vector register
+    if ($count == 2)
+    {
+        confess "$regName not a 64bit vector register" unless exists $vectors->{$regName};
+        return @{$vectors->{$regName}}[0,1];
+    }
+
+    confess "$regName not a 128bit vector register" unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4;
+    return @{$vectors->{$regName}};
+}
+
+# List the registers covered by the r8 (address) operand of an instruction.
+#   $vectors - hash ref: vector register name => array ref of its member registers
+#   $capData - hash ref of named captures (reads r8 and the existence of E)
+# Returns nothing when r8 is missing/false or is RZ.  Without the E capture the
+# result is just r8.  With it, a plain "R<n>" expands to R<n>, R<n+1> and any
+# other name is looked up in $vectors (dumping $vectors and confessing when
+# the name is unknown).
+sub getAddrVecRegisters
+{
+    my ($vectors, $capData) = @_;
+
+    my $regName = $capData->{r8};
+    return unless $regName;
+    return if $regName eq 'RZ';
+
+    # no E flag: the address is a single register
+    return $regName unless exists $capData->{E};
+
+    if ($regName =~ m'^R(\d+)$')
+    {
+        return map "R$_", ($1 .. $1+1);
+    }
+
+    unless (exists $vectors->{$regName})
+    {
+        print Dumper($vectors);
+        confess "$regName not a 64bit vector register";
+    }
+    return @{$vectors->{$regName}}[0,1];
+}
+
+__END__
+
+
+
diff --git a/Assembler/MaxAs/blib/lib/auto/MaxAs/MaxAs/.exists b/Assembler/MaxAs/blib/lib/auto/MaxAs/MaxAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/MaxAs/blib/man1/.exists b/Assembler/MaxAs/blib/man1/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/MaxAs/blib/man3/.exists b/Assembler/MaxAs/blib/man3/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/MaxAs/blib/man3/MaxAs::MaxAs.3pm b/Assembler/MaxAs/blib/man3/MaxAs::MaxAs.3pm
new file mode 100644
index 0000000..9f95fff
--- /dev/null
+++ b/Assembler/MaxAs/blib/man3/MaxAs::MaxAs.3pm
@@ -0,0 +1,170 @@
+.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.13)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is turned on, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.ie \nF \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    nr % 0
+.    rr F
+.\}
+.el \{\
+.    de IX
+..
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "MaxAs::MaxAs 3"
+.TH MaxAs::MaxAs 3 "2016-02-04" "perl v5.10.1" "User Contributed Perl Documentation"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+MaxAs::MaxAs \- Assembler for NVIDIA Maxwell architecture
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\&    maxas.pl [opts]
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+See the documentation at: https://github.com/NervanaSystems/maxas
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+See the documentation at: https://github.com/NervanaSystems/maxas
+.SH "AUTHOR"
+.IX Header "AUTHOR"
+Scott Gray, <sgray@nervanasys.com>
+.SH "COPYRIGHT AND LICENSE"
+.IX Header "COPYRIGHT AND LICENSE"
+The \s-1MIT\s0 License (\s-1MIT\s0)
+.PP
+Copyright (c) 2014 Scott Gray
+.PP
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the \*(L"Software\*(R"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+.PP
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+.PP
+\&\s-1THE\s0 \s-1SOFTWARE\s0 \s-1IS\s0 \s-1PROVIDED\s0 \*(L"\s-1AS\s0 \s-1IS\s0\*(R", \s-1WITHOUT\s0 \s-1WARRANTY\s0 \s-1OF\s0 \s-1ANY\s0 \s-1KIND\s0, \s-1EXPRESS\s0 \s-1OR\s0
+\&\s-1IMPLIED\s0, \s-1INCLUDING\s0 \s-1BUT\s0 \s-1NOT\s0 \s-1LIMITED\s0 \s-1TO\s0 \s-1THE\s0 \s-1WARRANTIES\s0 \s-1OF\s0 \s-1MERCHANTABILITY\s0,
+\&\s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0 \s-1AND\s0 \s-1NONINFRINGEMENT\s0. \s-1IN\s0 \s-1NO\s0 \s-1EVENT\s0 \s-1SHALL\s0 \s-1THE\s0
+\&\s-1AUTHORS\s0 \s-1OR\s0 \s-1COPYRIGHT\s0 \s-1HOLDERS\s0 \s-1BE\s0 \s-1LIABLE\s0 \s-1FOR\s0 \s-1ANY\s0 \s-1CLAIM\s0, \s-1DAMAGES\s0 \s-1OR\s0 \s-1OTHER\s0
+\&\s-1LIABILITY\s0, \s-1WHETHER\s0 \s-1IN\s0 \s-1AN\s0 \s-1ACTION\s0 \s-1OF\s0 \s-1CONTRACT\s0, \s-1TORT\s0 \s-1OR\s0 \s-1OTHERWISE\s0, \s-1ARISING\s0 \s-1FROM\s0,
+\&\s-1OUT\s0 \s-1OF\s0 \s-1OR\s0 \s-1IN\s0 \s-1CONNECTION\s0 \s-1WITH\s0 \s-1THE\s0 \s-1SOFTWARE\s0 \s-1OR\s0 \s-1THE\s0 \s-1USE\s0 \s-1OR\s0 \s-1OTHER\s0 \s-1DEALINGS\s0 \s-1IN\s0
+\&\s-1THE\s0 \s-1SOFTWARE\s0.
diff --git a/Assembler/MaxAs/blib/script/.exists b/Assembler/MaxAs/blib/script/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/MaxAs/blib/script/maxas.pl b/Assembler/MaxAs/blib/script/maxas.pl
new file mode 100755
index 0000000..91cfa30
--- /dev/null
+++ b/Assembler/MaxAs/blib/script/maxas.pl
@@ -0,0 +1,289 @@
+#!/usr/bin/perl
+
+eval 'exec /usr/bin/perl  -S $0 ${1+"$@"}'
+    if 0; # not running under some shell
+use strict;
+use MaxAs::Cubin;
+use MaxAs::MaxAs;
+use Data::Dumper;
+use File::Spec;
+
+require 5.10.0;
+
+$Data::Dumper::Sortkeys = 1;
+
+my $mode = shift;
+
+# List cubin contents
+if ($mode =~ /^\-?\-l/i)
+{
+    my $cubinFile = shift or usage();
+    my $cubin     = MaxAs::Cubin->new($cubinFile);
+
+    # Header line: architecture, ELF class and address size of the container.
+    printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n",
+        $cubinFile, $cubin->arch, $cubin->class, $cubin->address_size;
+
+    # One line per kernel, in name order, with its recorded resource usage.
+    my $kernels = $cubin->listKernels;
+    for my $name (sort keys %$kernels)
+    {
+        my @stats = @{$kernels->{$name}}{qw(Linkage ParamCnt size RegCnt SharedSize BarCnt)};
+        printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $name, @stats;
+    }
+
+    # Then every name returned by listSymbols, also in name order.
+    my $symbols = $cubin->listSymbols;
+    printf "Symbol: %s\n", $_ for sort keys %$symbols;
+}
+# Test that the assembler can reproduce the op codes this cubin or sass contains
+elsif ($mode =~ /^\-?\-t/i)
+{
+    # Optional flags, recognised only in this order: --reg, then --all.
+    # They are declared separately from the conditional shift because perlsyn
+    # documents "my $x = ... if COND" as having undefined behaviour.
+    my ($reg, $all);
+    $reg = shift if $ARGV[0] =~ /^\-?\-r/i;
+    $all = shift if $ARGV[0] =~ /^\-?\-a/i;
+    my $file = shift or usage();
+    my $fh;
+    # sass file
+    if (-T $file)
+    {
+        # Three-argument open so the name is never parsed for mode characters.
+        open $fh, '<', $file or die "$file: $!";
+    }
+    # cubin file
+    else
+    {
+        my $cubin = MaxAs::Cubin->new($file);
+        my $arch  = $cubin->arch;
+
+        open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!";
+        # The first line of output is consumed here; it is only inspected for
+        # a cuobjdump failure report.
+        my $first = <$fh>;
+        if ($first =~ /cuobjdump fatal/)
+        {
+            print $first;
+            exit(1);
+        }
+    }
+    # Exit status is 1 when Test() returns a true value, 0 otherwise.
+    exit(MaxAs::MaxAs::Test($fh, $reg, $all) ? 1 : 0);
+}
+# Extract an asm file containing the desired kernel
+elsif ($mode =~ /^\-?\-e/i)
+{
+    # Optional "--kernel|-k name" selects the kernel; it must come first.
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    my $cubinFile = shift or usage();
+    my $asmFile   = shift;
+    my $cubin     = MaxAs::Cubin->new($cubinFile);
+    my $arch      = $cubin->arch;
+    my $kernels   = $cubin->listKernels;
+
+    #default the kernel name if not specified.
+    $kernelName ||= (sort keys %$kernels)[0];
+
+    my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName";
+
+    # Report the command exactly as it was run (the architecture comes from
+    # the cubin, it is not fixed at sm_50).
+    my $dumpCmd = "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile";
+    open my $in, "$dumpCmd |" or die "$dumpCmd: $!";
+    # The first line of output is consumed here; it is only inspected for a
+    # cuobjdump failure report.
+    my $first = <$in>;
+    if ($first =~ /cuobjdump fatal/)
+    {
+        print $first;
+        exit(1);
+    }
+
+    # Write to the named asm file, or to STDOUT when none was given.
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+
+    # Header comments: the insert mode reads the "# Kernel:" line back when no
+    # kernel name is passed on its command line.
+    print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n";
+
+    print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt));
+
+    print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n";
+
+    print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params};
+
+    print $out "#\n# Instructions:\n\n";
+
+    MaxAs::MaxAs::Extract($in, $out, $kernel->{Params});
+
+    close $out if $asmFile;
+    close $in;
+}
+# Extract a kernel from a sass dump
+elsif ($mode =~ /^\-?\-s/i)
+{
+    my $sassFile  = shift or usage();
+    my $asmFile   = shift;
+
+    # Three-argument opens: both names come straight from the command line and
+    # must not be parsed for mode or pipe characters.
+    open my $in, '<', $sassFile or die "$sassFile: $!";
+
+    # Write to the named asm file, or to STDOUT when none was given.
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+
+    # No cubin is available in this mode, so an empty parameter list is passed.
+    MaxAs::MaxAs::Extract($in, $out, []);
+
+    close $out if $asmFile;
+    close $in;
+}
+# Insert the kernel asm back into the cubin:
+elsif ($mode =~ /^\-?\-i/i)
+{
+    # Options are positional and recognised only in this order:
+    #   -w, -k <name>, -n, then any number of "-D<name> <value>" pairs.
+    my $nowarn;
+    if ($ARGV[0] =~ /^\-?\-w/i)
+    {
+        $nowarn = shift;
+    }
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    # Declared separately from the conditional shift: perlsyn documents
+    # "my $x = ... if COND" as having undefined behaviour.
+    my $noReuse;
+    $noReuse = shift if $ARGV[0] =~ /^\-?\-n/i;
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift;
+        # NOTE(review): string eval of a command-line value.  A value that
+        # contains a single quote breaks (or alters) the generated statement,
+        # and the eval result is not checked.
+        eval "package MaxAs::MaxAs::CODE; our \$$name = '$value';";
+    }
+
+    my $asmFile   = shift or usage();
+    my $cubinFile = shift or usage();
+    my $newCubin  = shift || $cubinFile;
+
+    # Slurp the whole asm file.
+    my $file;
+    if (open my $fh, '<', $asmFile)
+    {
+        local $/;
+        $file = <$fh>;
+        close $fh;
+    }
+    else { die "$asmFile: $!" }
+
+    # Volume and directory of the asm file are handed to the assembler as the
+    # include location.
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    # extract the kernel name from the file
+    ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName;
+    die "asm file missing kernel name or is badly formatted" unless $kernelName;
+
+    my $kernel = MaxAs::MaxAs::Assemble($file, $include, !$noReuse, $nowarn);
+
+    my $cubin  = MaxAs::Cubin->new($cubinFile);
+    $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName";
+
+    $cubin->modifyKernel(%$kernel);
+
+    $cubin->write($newCubin);
+
+    # The kernel name goes through %s rather than into the format string, and
+    # the literal percent sign is written as %%.
+    printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n",
+        $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)};
+
+}
+# Preprocessing:
+elsif ($mode =~ /^\-?\-p/i)
+{
+    # Options are positional: any number of "-D<name> <value>" pairs first,
+    # then the optional debug flag.
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift;
+        # NOTE(review): string eval of a command-line value.  A value that
+        # contains a single quote breaks (or alters) the generated statement,
+        # and the eval result is not checked.
+        eval "package MaxAs::MaxAs::CODE; our \$$name = '$value';";
+    }
+    # Declared separately from the conditional shift: perlsyn documents
+    # "my $x = ... if COND" as having undefined behaviour.
+    my $debug;
+    $debug = shift if $ARGV[0] =~ /^\-?\-d/i;
+    my $asmFile   = shift or usage();
+    my $asmFile2  = shift;
+
+    # The destination is optional, so only compare when one was given.
+    die "source and destination probably shouldn't be the same file\n"
+        if defined $asmFile2 && $asmFile eq $asmFile2;
+
+    # Slurp the whole source file.
+    open my $fh, '<', $asmFile or die "$asmFile: $!";
+    local $/;
+    my $file = <$fh>;
+    close $fh;
+
+    # Volume and directory of the asm file are handed to the preprocessor as
+    # the include location.
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    # Write to the named file, or to STDOUT when none was given.
+    if ($asmFile2)
+    {
+        open $fh, '>', $asmFile2 or die "$asmFile2: $!";
+    }
+    else
+    {
+        $fh = \*STDOUT;
+    }
+    print $fh MaxAs::MaxAs::Preprocess($file, $include, $debug);
+    close $fh;
+}
+# get version information
+elsif ($mode =~ /^\-?\-v/i)
+{
+    print $MaxAs::MaxAs::VERSION, "\n";
+}
+# Unrecognised (or missing) mode: echo what was given, then show the help
+# text.  usage() does not return.
+else
+{
+    print $mode, "\n";
+    usage();
+}
+
+# Every mode that gets this far has succeeded.
+exit 0;
+
+
+
+# Print the command-line help to STDOUT and terminate with exit status 1.
+# Never returns.  The synopsis lines list the options in the order the script
+# parses them, because the option parsing is positional.
+sub usage
+{
+    print <<EOF;
+Usage:
+
+  List kernels and symbols:
+
+    maxas.pl --list|-l <cubin_file>
+
+  Test a cubin or sass file to see if the assembler can reproduce all of the contained opcodes.
+  Also useful for extending the missing grammar rules.  Defaults to only showing failures without --all.
+  With the --reg flag it will show register bank conflicts not hidden by reuse flags.
+
+    maxas.pl --test|-t [--reg|-r] [--all|-a] <cubin_file | cuobjdump_sass_file>
+
+  Extract a single kernel into an asm file from a cubin.
+  Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin.
+
+    maxas.pl --extract|-e [--kernel|-k kernel_name] <cubin_file> [asm_file]
+
+  Extract an asm file from a cuobjdump sass dump instead of a cubin.
+
+    maxas.pl --sass|-s <cuobjdump_sass_file> [asm_file]
+
+  Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes.
+  Include the debug flag to print out detailed scheduler info.
+  Each -D<name> <value> pair defines a variable called <name> in package MaxAs::MaxAs::CODE.
+
+    maxas.pl --pre|-p [-D<name> <value> ...] [--debug|-d] <asm_file> [new_asm_file]
+
+  Insert the kernel asm back into the cubin.  Overwrite existing or create new cubin.
+  Optionally you can skip register reuse flag auto insertion.  This allows you to observe
+  performance without any reuse or you can use it to set the flags manually in your sass.
+  The kernel name defaults to the "# Kernel:" line of the asm file.  -w is passed to the
+  assembler as its nowarn flag.  -D works as for --pre.
+
+    maxas.pl --insert|-i [-w] [--kernel|-k kernel_name] [--noreuse|-n] [-D<name> <value> ...] <asm_file> <cubin_file> [new_cubin_file]
+
+  Display version information and exit:
+
+    maxas.pl --version|-v
+
+EOF
+    exit(1);
+}
+
+__END__
diff --git a/Assembler/MaxAs/cpanfile b/Assembler/MaxAs/cpanfile
new file mode 100644
index 0000000..e8281c5
--- /dev/null
+++ b/Assembler/MaxAs/cpanfile
@@ -0,0 +1,4 @@
+requires 'perl', '5.10.0';
+
+requires 'Carp', '1.29';
+requires 'Data::Dumper', '2.145';
diff --git a/Assembler/MaxAs/lib/MaxAs/Cubin.pm b/Assembler/MaxAs/lib/MaxAs/Cubin.pm
new file mode 100644
index 0000000..5900958
--- /dev/null
+++ b/Assembler/MaxAs/lib/MaxAs/Cubin.pm
@@ -0,0 +1,684 @@
+package MaxAs::Cubin;
+
+use strict;
+use Data::Dumper;
+
+# ELF structure layouts.  Each table is a flat list of alternating entries:
+# a pack/unpack template code followed by the name of the field it decodes.
+# The code further down separates the two kinds of entry purely by string
+# length (template codes are at most 3 characters, field names are longer),
+# so every field name here must be at least 4 characters long.
+# There is one table per structure for each ELF class (32bit and 64bit).
+
+# ELF file header.
+my @Elf32_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    L   entry
+    L   phOffset
+    L   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+my @Elf64_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    Q   entry
+    Q   phOffset
+    Q   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+# Program header entry.  Note that the field order differs between classes.
+my @Elf32_PrgHdr = qw(
+    L   type
+    L   offset
+    L   vaddr
+    L   paddr
+    L   fileSize
+    L   memSize
+    L   flags
+    L   align
+);
+my @Elf64_PrgHdr = qw(
+    L   type
+    L   flags
+    Q   offset
+    Q   vaddr
+    Q   paddr
+    Q   fileSize
+    Q   memSize
+    Q   align
+);
+# Section header entry.
+my @Elf32_SecHdr = qw(
+    L   name
+    L   type
+    L   flags
+    L   addr
+    L   offset
+    L   size
+    L   link
+    L   info
+    L   align
+    L   entSize
+);
+my @Elf64_SecHdr = qw(
+    L   name
+    L   type
+    Q   flags
+    Q   addr
+    Q   offset
+    Q   size
+    L   link
+    L   info
+    Q   align
+    Q   entSize
+);
+# Symbol table entry.  Note that the field order differs between classes.
+my @Elf32_SymEnt = qw(
+    L   name
+    L   value
+    L   size
+    C   info
+    C   other
+    S   shIndx
+);
+my @Elf64_SymEnt = qw(
+    L   name
+    C   info
+    C   other
+    S   shIndx
+    Q   value
+    Q   size
+);
+# Symbol binding names, indexed by the high nibble of a symbol's info byte.
+my @symBind = qw(LOCAL GLOBAL WEAK);
+
+# Split the Elf Header defs into template strings (T) and corresponding hash keys columns (C)
+my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC);
+
+# Slot 1 of every array holds the 32bit layout and slot 2 the 64bit layout,
+# matching the fileClass byte of the ELF header; slot 0 stays unused.
+# Entries of at most 3 characters are pack template codes and are joined into
+# one template string; the longer entries are the field names, kept in order.
+foreach my $layout (
+    [ 1, \@Elf32_Hdr, \@Elf32_PrgHdr, \@Elf32_SecHdr, \@Elf32_SymEnt ],
+    [ 2, \@Elf64_Hdr, \@Elf64_PrgHdr, \@Elf64_SecHdr, \@Elf64_SymEnt ],
+)
+{
+    my ($class, $elf, $prg, $sec, $sym) = @$layout;
+
+    $elfHdrT[$class] = join '', grep { length($_) <= 3 } @$elf;
+    $prgHdrT[$class] = join '', grep { length($_) <= 3 } @$prg;
+    $secHdrT[$class] = join '', grep { length($_) <= 3 } @$sec;
+    $symHdrT[$class] = join '', grep { length($_) <= 3 } @$sym;
+
+    $elfHdrC[$class] = [ grep { length($_) > 3 } @$elf ];
+    $prgHdrC[$class] = [ grep { length($_) > 3 } @$prg ];
+    $secHdrC[$class] = [ grep { length($_) > 3 } @$sec ];
+    $symHdrC[$class] = [ grep { length($_) > 3 } @$sym ];
+}
+
+# Load a cubin ELF file
+#
+#   MaxAs::Cubin->new($file)
+#
+# Reads the ELF header, program headers, section headers and section data of
+# $file into a blessed hash and returns it.  Dies when the file cannot be
+# opened or when the architecture taken from the header flags is below 50.
+#
+# Keys set on the returned object:
+#   fileName, elfHdr, Class (32 or 64), Arch, AddressSize (32 or 64),
+#   prgHdrs, secHdrs  - arrayrefs of header hashes, in file order
+#   <section name>    - every section header is also stored under its own name
+#   Kernels           - kernel name => section header, for symbols tagged FUNC
+#   Symbols           - name => symbol entry, for other symbols whose info
+#                       byte has the 0x10 bit set
+# Section data is cached as a hex string in each section header's Data key.
+sub new
+{
+    my ($package, $file) = @_;
+
+    my $cubin = bless { fileName => $file }, $package;
+
+    # Three-argument open: $file is always treated as a plain path and is
+    # never parsed for mode or pipe characters.
+    open my $fh, '<', $file or die "$file: $!";
+    binmode($fh);
+
+    # Read in assuming 32 bit header
+    my $data;
+    read $fh, $data, 0x34;
+    my $elfHdr = $cubin->{elfHdr} = {};
+    @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data;
+
+    # 1: 32bit, 2: 64bit
+    my $class = $elfHdr->{fileClass};
+
+    # re-read in with 64 bit header if needed
+    if ($class == 2)
+    {
+        seek $fh, 0, 0;
+        read $fh, $data, 0x46;
+        @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data;
+
+        $cubin->{Class} = 64;
+    }
+    else
+    {
+        $cubin->{Class} = 32;
+    }
+
+    # verify sm_50 cubin
+    # The architecture number is the low byte of the ELF header flags.
+    $cubin->{Arch} = $elfHdr->{flags} & 0xFF;
+    die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50;
+
+    $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32;
+
+    # Read in Program Headers
+    seek $fh, $elfHdr->{phOffset}, 0;
+    foreach (1 .. $elfHdr->{phNum})
+    {
+        read $fh, $data, $elfHdr->{phEntSize};
+
+        my %prgHdr = (Indx => $_ - 1);
+        @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data;
+        push @{$cubin->{prgHdrs}}, \%prgHdr;
+    }
+
+    # Read in Section Headers
+    seek $fh, $elfHdr->{shOffset}, 0;
+    foreach (1 .. $elfHdr->{shNum})
+    {
+        read $fh, $data, $elfHdr->{shEntSize};
+
+        my %secHdr = (Indx => $_ - 1);
+        @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data;
+        push @{$cubin->{secHdrs}}, \%secHdr;
+    }
+
+    # Read in Section data
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $data = '';
+        # Skip sections with no data (type NULL or NOBITS)
+        if ($secHdr->{size} && $secHdr->{type} != 8)
+        {
+            seek $fh, $secHdr->{offset}, 0;
+            read $fh, $data, $secHdr->{size};
+        }
+        # Convert string tables to maps
+        if ($secHdr->{type} == 3) # STRTAB
+        {
+            # Keys are the byte offsets at which each string starts.
+            my $strTab = $secHdr->{StrTab} = {};
+            my $indx   = 0;
+            foreach my $str (split "\0", $data)
+            {
+                $strTab->{$indx} = $str;
+                $indx += 1 + length($str);
+            }
+        }
+        # Read in Symbol data
+        if ($secHdr->{type} == 2) # SYMTAB
+        {
+            my $offset = 0;
+            while ($offset < $secHdr->{size})
+            {
+                my $symEnt = {};
+                @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize});
+                $offset += $secHdr->{entSize};
+
+                push @{$secHdr->{SymTab}}, $symEnt;
+            }
+        }
+        # Cache raw data for further processing and writing
+        $secHdr->{Data} = unpack 'H*', $data;
+    }
+    close $fh;
+
+    # Update section headers with their names.  Map names directly to headers.
+    my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab};
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $secHdr->{Name} = $shStrTab->{$secHdr->{name}};
+        $cubin->{$secHdr->{Name}} = $secHdr;
+    }
+
+    # Update symbols with their names
+    # For the Global functions, extract kernel meta data
+    # Populate the kernel hash
+    my $strTab = $cubin->{'.strtab'}{StrTab};
+    foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}})
+    {
+        $symEnt->{Name} = $strTab->{$symEnt->{name}};
+
+        # Attach symbol to section
+        my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}];
+        $secHdr->{SymbolEnt} = $symEnt;
+
+        # Look for symbols tagged FUNC
+        if (($symEnt->{info} & 0x0f) == 0x02)
+        {
+            # Create a hash of kernels for output
+            my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr;
+
+            # Extract local/global/weak binding info
+            $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4];
+
+            # Extract the kernel instructions
+            $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ];
+
+            # Extract the max barrier resource identifier used and add 1. Should be 0-16.
+            # If a register is used as a barrier resource id, then this value is the max of 16.
+            $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20;
+
+            # Extract the number of allocated registers for this kernel.
+            $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24;
+
+            # Extract the size of shared memory this kernel uses.
+            my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"};
+            $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0;
+
+            # Attach constant0 section
+            $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"};
+
+            # Extract the kernel parameter data.
+            my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"};
+            if ($paramSec)
+            {
+                # Extract raw param data
+                my @data = unpack "L*", pack "H*", $paramSec->{Data};
+
+                $paramSec->{ParamData} = \@data;
+                $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ];
+
+                # Find the first param delimiter
+                # NOTE(review): when the delimiter word is absent, $idx ends up
+                # past the end of @data and the slice taken for @staticParams
+                # below picks up undefined trailing entries - confirm whether
+                # such sections can occur.
+                my $idx = 0;
+                $idx++ while $idx < @data && $data[$idx] != 0x00080a04;
+
+                my $first = $data[$idx+2] & 0xFFFF;
+                #my $size  = $data[$idx+2] >> 16;
+                $idx += 4;
+
+                my @params;
+                while ($idx < @data && $data[$idx] == 0x000c1704)
+                {
+                    # Get the ordinal, offset, size and pointer alignment for each param
+                    my $ord    = $data[$idx+2] & 0xFFFF;
+                    my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16);
+                    my $psize  = $data[$idx+3] >> 18;
+                    my $align  = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0;
+                    # unshift, so @params ends up in the reverse of file order
+                    unshift @params, "$ord:$offset:$psize:$align";
+                    $idx += 4;
+                }
+                # Everything up to here is kept verbatim and written back
+                # unchanged by modifyKernel.
+                my @staticParams = @data[0 .. ($idx-1)];
+
+                # The remaining words are attribute records: the low 16 bits
+                # of the first word are the code, the high 16 bits a size (a
+                # byte count for the list-valued codes handled below).
+                my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize);
+                while ($idx < @data)
+                {
+                    my $code = $data[$idx] & 0xffff;
+                    my $size = $data[$idx] >> 16;
+                    $idx++;
+
+                    # EIATTR_MAXREG_COUNT
+                    if ($code == 0x1b03)
+                    {
+                        $maxregCount = $size;
+                    }
+                    # EIATTR_S2RCTAID_INSTR_OFFSETS
+                    elsif ($code == 0x1d04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @ctaidOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_EXIT_INSTR_OFFSETS
+                    elsif ($code == 0x1c04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @exitOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_CTAIDZ_USED
+                    elsif ($code == 0x0401)
+                    {
+                        $ctaidzUsed = 1;
+                    }
+                    # EIATTR_REQNTID
+                    elsif ($code == 0x1004)
+                    {
+                        while ($size > 0)
+                        {
+                            push @reqntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_MAX_THREADS
+                    elsif ($code == 0x0504)
+                    {
+                        while ($size > 0)
+                        {
+                            push @maxntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_CRS_STACK_SIZE
+                    elsif ($code == 0x1e04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @stackSize, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    else
+                    {
+                        # Unknown records are reported and not stored; any
+                        # payload words they carry are not skipped here.
+                        printf STDERR "Unknown Code 0x%02x (size:%d)\n", $code, $size;
+                    }
+                }
+                $kernelSec->{Params}   = \@params;
+                $kernelSec->{ParamCnt} = scalar @params;
+
+                $paramSec->{StaticParams} = \@staticParams;
+                $paramSec->{MAXREG_COUNT} = $maxregCount;
+                $paramSec->{ExitOffsets}  = \@exitOffsets;
+                $paramSec->{CTAIDOffsets} = \@ctaidOffsets;
+                $paramSec->{CTAIDZUsed}   = $ctaidzUsed;
+                $paramSec->{REQNTID}      = \@reqntid;
+                $paramSec->{MAXNTID}      = \@maxntid;
+                $paramSec->{STACKSIZE}    = \@stackSize;
+            }
+            # print Dumper($paramSec);
+            # exit();
+        }
+        # Note GLOBALs found in this cubin
+        elsif (($symEnt->{info} & 0x10) == 0x10)
+        {
+            $cubin->{Symbols}{$symEnt->{Name}} = $symEnt;
+        }
+    }
+
+    # print "phOffset: $elfHdr->{phOffset}\n";
+    # print "shOffset: $elfHdr->{shOffset}\n";
+    # foreach my $secHdr (@{$cubin->{secHdrs}})
+    # {
+    #     print "secHdr($secHdr->{Indx}): $secHdr->{offset}, $secHdr->{size}, $secHdr->{align} ($secHdr->{Name})\n";
+    # }
+    # my $p = 0;
+    # foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    # {
+    #     print "prgHdr($p): type: $prgHdr->{type}, offset: $prgHdr->{offset}, fileSize: $prgHdr->{fileSize}, memSize: $prgHdr->{memSize}, align: $prgHdr->{align}\n";
+    #     $p++;
+    # }
+    # exit();
+
+    # print Dumper($cubin->{prgHdrs});
+    # exit();
+    return $cubin;
+}
+# ELF class of the loaded cubin in bits: 32 or 64.
+sub class
+{
+    my ($cubin) = @_;
+    return $cubin->{Class};
+}
+# Architecture number read from the ELF header flags (e.g. 50 for sm_50).
+sub arch
+{
+    my ($cubin) = @_;
+    return $cubin->{Arch};
+}
+# Address size of the loaded cubin in bits: 32 or 64.
+sub address_size
+{
+    my ($cubin) = @_;
+    return $cubin->{AddressSize};
+}
+# Hashref of kernel name => kernel section header (undef when the cubin
+# recorded no kernels).
+sub listKernels
+{
+    my ($cubin) = @_;
+    return $cubin->{Kernels};
+}
+# Hashref of symbol name => symbol entry (undef when the cubin recorded no
+# such symbols).
+sub listSymbols
+{
+    my ($cubin) = @_;
+    return $cubin->{Symbols};
+}
+# Look up one kernel's section header by name; undef when there is no kernel
+# of that name.
+sub getKernel
+{
+    my $cubin = shift;
+    my $name  = shift;
+    return $cubin->{Kernels}{$name};
+}
+
+# Apply a newly assembled kernel to the in-memory cubin.
+#
+#   $cubin->modifyKernel(Kernel => $kernelSec, RegCnt => ..., BarCnt => ...,
+#                        ExitOffsets => [...], CTAIDOffsets => [...],
+#                        CTAIDZUsed => ..., KernelData => [...]);
+#
+# Kernel is the section header returned by getKernel; KernelData is the list
+# of 64bit words making up the code.  Replaces the kernel code, updates the
+# register and barrier counts stored in the section header, rebuilds the
+# kernel's param section data, and calls updateSize for whichever of the two
+# sections changed size.  Every detected change is reported on STDOUT.
+# Dies when more than 255 registers or 16 barriers are requested, or when the
+# code size is not a multiple of 64 bytes.
+sub modifyKernel
+{
+    my ($cubin, %params) = @_;
+
+    my ($kernelSec, $newReg, $newBar, $exitOffsets, $ctaidOffsets, $ctaidzUsed, $newData) =
+        @params{qw(Kernel RegCnt BarCnt ExitOffsets CTAIDOffsets CTAIDZUsed KernelData)};
+
+    # Each code word is 8 bytes.
+    my $newSize = 8 * @$newData;
+
+    die "255 register max" if $newReg > 255;
+    die "new kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63;
+    die "16 is max barrier count" if $newBar > 16;
+
+    my $paramSec   = $kernelSec->{ParamSec};
+    my $kernelName = $kernelSec->{SymbolEnt}{Name};
+
+    # Swap in the new code, both as a word list and as the cached hex string.
+    $kernelSec->{KernelData} = $newData;
+    $kernelSec->{Data}       = unpack "H*", pack "Q*", @$newData;
+
+    # Register count lives in the top byte of the section's info field.
+    unless ($newReg == $kernelSec->{RegCnt})
+    {
+        print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n";
+        $kernelSec->{RegCnt} = $newReg;
+        $kernelSec->{info}   = ($kernelSec->{info} & ~0xff000000) | ($newReg << 24);
+    }
+    # Barrier count lives in bits 20-24 of the section's flags field.
+    unless ($newBar == $kernelSec->{BarCnt})
+    {
+        print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n";
+        $kernelSec->{BarCnt} = $newBar;
+        $kernelSec->{flags}  = ($kernelSec->{flags} & ~0x01f00000) | ($newBar << 20);
+    }
+
+    # Rebuild the param section: the static prefix first, then the attribute
+    # records.  A list-valued record is a header word (byte count in the high
+    # 16 bits, hence count << 18, plus the code) followed by its values.
+    my @paramData = @{$paramSec->{StaticParams}};
+
+    my $maxregCount = $paramSec->{MAXREG_COUNT};
+    push @paramData, ($maxregCount << 16) | 0x1b03 if defined $maxregCount;
+
+    # CTAID offsets: report a difference, then emit the new list.
+    my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets;
+    my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}};
+    print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n"
+        if $newCTAIDs ne $oldCTAIDs;
+    push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04, @$ctaidOffsets
+        if @$ctaidOffsets;
+
+    # Exit offsets: same treatment.
+    my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets;
+    my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}};
+    print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n"
+        if $newExits ne $oldExits;
+    push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04, @$exitOffsets
+        if @$exitOffsets;
+
+    # CTAID.Z usage is a bare flag word with no payload.
+    print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n"
+        if $ctaidzUsed != $paramSec->{CTAIDZUsed};
+    push @paramData, 0x0401 if $ctaidzUsed;
+
+    # These three lists are carried over from the loaded cubin unchanged.
+    push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004, @{$paramSec->{REQNTID}}
+        if @{$paramSec->{REQNTID}};
+    push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504, @{$paramSec->{MAXNTID}}
+        if @{$paramSec->{MAXNTID}};
+
+    my $stackSize = $paramSec->{STACKSIZE};
+    push @paramData, (scalar(@$stackSize) << 18) | 0x1e04, @$stackSize
+        if @$stackSize;
+
+    # Store the rebuilt param data; each entry is a 4 byte word.
+    my $newParamSize  = 4 * @paramData;
+    $paramSec->{Data} = unpack "H*", pack "L*", @paramData;
+    unless ($newParamSize == $paramSec->{size})
+    {
+        print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n";
+        $cubin->updateSize($paramSec, $newParamSize);
+    }
+
+    # A change of code size also has to be reflected in the program headers.
+    unless ($newSize == $kernelSec->{size})
+    {
+        print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n";
+        $cubin->updateSize($kernelSec, $newSize, 1);
+    }
+}
+
+# Record a new size for one section and re-lay-out the file around it.
+#
+#   $cubin->updateSize($sec, $newSize, $updatePrgSize)
+#
+#   $sec           - section header hash whose size changed
+#   $newSize       - its new size in bytes
+#   $updatePrgSize - when true, type 1 program headers that contain $sec also
+#                    have their fileSize and memSize adjusted by the change
+#
+# Recomputes the offset of every section, of the section header table and of
+# the program header table, and remaps program header offsets to match.
+# Only the in-memory structures are modified; nothing is written to disk.
+sub updateSize
+{
+    my ($cubin, $sec, $newSize, $updatePrgSize) = @_;
+
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+
+    # update section header
+    my $delta = $newSize - $sec->{size};
+    $sec->{size} = $newSize;
+
+    # update symtab section
+    if ($sec->{SymbolEnt})
+    {
+        # The symbol carries its own copy of the size, so the raw .symtab data
+        # is re-packed from all symbol entries.
+        $sec->{SymbolEnt}{size} = $newSize;
+        my $symSection = $cubin->{'.symtab'};
+        $symSection->{Data} = '';
+        foreach my $symEnt (@{$symSection->{SymTab}})
+        {
+            $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}};
+        }
+    }
+
+    # Section data starts right after the ELF header.
+    my $pos = $elfHdr->{ehSize};
+    # Old file offset => new file offset, used below for the program headers.
+    my %sizeMap;
+
+    # update section header offsets
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        # skip first header
+        # NOTE(review): this skips every section whose align is 0, not only
+        # the first one - confirm no other section has align 0.
+        next if $secHdr->{align} == 0;
+
+        # NOBITS data sections are size 0
+        my $size = $secHdr->{type} == 8 ? 0 : $secHdr->{size};
+
+        # Add any needed padding between sections
+        my $pad = $pos % $secHdr->{align};
+        if ($pad > 0)
+        {
+            $pos += $secHdr->{align} - $pad;
+        }
+        # map old offset to new
+        $sizeMap{$secHdr->{offset}} = $pos;
+
+        # update offset
+        $secHdr->{offset} = $pos;
+
+        # advance position by size
+        $pos += $size;
+    }
+
+    # compute total section header size
+    # (taken as the distance between the two tables, i.e. this relies on the
+    # program header table directly following the section header table)
+    my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset};
+
+    # map old offset to new
+    $sizeMap{$elfHdr->{shOffset}} = $pos;
+    $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize;
+
+    $elfHdr->{shOffset} = $pos;
+    $elfHdr->{phOffset} = $pos + $shSize;
+
+    # update program header offsets and sizes
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        # Not sure how best to adjust these so just assume they'll track other offsets.
+        # NOTE(review): an offset that is not a key of %sizeMap becomes undef
+        # here - confirm every program header offset coincides with a section
+        # or table offset.
+        $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}};
+
+        # If the kernel sizes changes, also update the associated ProgramHeader.
+        # Note that this size is the kernel size plus any constant section sizes.
+        if ($updatePrgSize && $prgHdr->{type} == 1 &&
+            $sec->{offset} >= $prgHdr->{offset} &&
+            $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta)
+        {
+            $prgHdr->{fileSize} += $delta;
+            $prgHdr->{memSize}  += $delta;
+        }
+    }
+}
+
+# Write out the cubin after modifying it.
+#
+#   $cubin->write($file)
+#
+# Emits, in this order: the ELF header, the data of every non-empty section
+# (padded to each section's alignment), the section header table and the
+# program header table.  Dies when $file cannot be opened for writing or when
+# closing it reports an error.
+sub write
+{
+    my ($cubin, $file) = @_;
+
+    # Three-argument open: $file is always treated as a plain path and is
+    # never parsed for mode or pipe characters.
+    open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
+    binmode($fh);
+
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+
+    # write elf header
+    print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};
+    my $pos = $elfHdr->{ehSize};
+
+    # write section data
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        # Skip NULL and NOBITS data sections
+        next if $secHdr->{size} == 0 || $secHdr->{type} == 8;
+
+        # Add any needed padding between sections
+        my $pad = $pos % $secHdr->{align};
+        if ($pad > 0)
+        {
+            $pad = $secHdr->{align} - $pad;
+            print $fh "\0" x $pad;
+            $pos += $pad;
+        }
+
+        # Section data is cached as a hex string; turn it back into bytes.
+        print $fh pack 'H*', $secHdr->{Data};
+        $pos += $secHdr->{size};
+    }
+
+    # write section headers
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
+    }
+
+    #write program headers
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
+    }
+
+    # Buffered write errors only surface at close, so check it.
+    close $fh or die "Error: could not finish writing $file: $!";
+}
+
+__END__
+
diff --git a/Assembler/MaxAs/lib/MaxAs/MaxAs.pm b/Assembler/MaxAs/lib/MaxAs/MaxAs.pm
new file mode 100644
index 0000000..ac79952
--- /dev/null
+++ b/Assembler/MaxAs/lib/MaxAs/MaxAs.pm
@@ -0,0 +1,2105 @@
+package MaxAs::MaxAs;
+
+require 5.10.0;
+
+use strict;
+use Data::Dumper;
+use MaxAs::MaxAsGrammar;
+use File::Spec;
+use Carp;
+use POSIX;
+use List::Util qw[min max];
+
+our $VERSION = '1.06';
+
+# Opcodes whose label operand is encoded relative to the instruction that
+# follows the branch when labels are remapped.
+# these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump
+my %relOffset  = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT);
+
+# these ops use absolute addresses
+my %absOffset  = map { $_ => 1 } qw(JCAL);
+
+# Every opcode that takes a label operand (union of the two sets above).
+my %jumpOp     = (%relOffset, %absOffset);
+
+# These instructions use r0 but do not write to r0
+my %noDest     = map { $_ => 1 } qw(ST STG STS STL RED);
+
+# Map register slots to reuse control codes
+# (the values are distinct bits that get OR'ed together per instruction)
+my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4);
+
+# break the registers down into source and destination categories for the scheduler
+my %srcReg   = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X);
+my %destReg  = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC);
+# Any operand name that appears in either category.
+my %regops   = (%srcReg, %destReg);
+# Keys copied from a grammar rule's {type} hash onto each parsed instruction.
+my @itypes   = qw(class lat rlat tput dual);
+
+# init resource usage
+# Defaults used by the analysis code; Occupancy() overwrites $activeWarp.
+my $activeWarp = 1;
+# Divisor applied to the active-warp count in CalculateBpipe
+# (presumably the number of warp schedulers -- TODO confirm).
+my $scheduler = 2;
+my $warpSize = 32;
+# Compared against per-instruction access widths in CalculateBmem
+# (presumably bytes per shared-memory bank -- TODO confirm).
+my $bankWidth = 4;
+# Limits that Occupancy() divides by the kernel's reported usage.
+my $maxThreads = 1024;
+my $maxSharedMem = 49152;
+my $maxReg = 65536;
+
+# Matches one <ANALYZE_BLOCK> ... </ANALYZE_BLOCK> region and captures the
+# text between the tags; only tabs or spaces may precede the opening tag.
+my $AnalyzeRe = qr'^[\t ]*<ANALYZE_BLOCK>(.*?)^\s*</ANALYZE_BLOCK>\n?'ms;
+
+# Read a kernel's resource usage from $fileName and derive how many blocks and
+# warps can be active, printing both and storing the warp count in the
+# file-level $activeWarp (read later by CalculateBpipe).
+#
+# The file must start with three lines, in this order:
+#     threads=<n>
+#     shared=<n>
+#     regs=<n>
+#
+# Dies if the file cannot be opened or if the thread count is not positive.
+# A shared-memory or register usage of zero (or a missing line) places no
+# limit on the number of blocks instead of triggering a division by zero.
+sub Occupancy
+{
+    my ($fileName) = @_;
+
+    print "Occupancy\n";
+
+    local $/ = "\n";
+    open my $fh, "<", $fileName or die "Cannot open: $fileName: $!";
+
+    # Each value sits on its own line behind a "name=" prefix.
+    my %used;
+    foreach my $name (qw(threads shared regs))
+    {
+        my $line = <$fh>;
+        $line = '' unless defined $line;    # tolerate a short file
+        chomp $line;
+        $line =~ s/$name=//g;
+        $used{$name} = $line;
+    }
+    close $fh;
+
+    my ($usedThreads, $usedSharedMem, $usedReg) = @used{qw(threads shared regs)};
+
+    # True only for values that start like a number and are greater than zero.
+    my $positive = sub { my ($v) = @_; return $v =~ /^\s*[\d.]/ && $v > 0; };
+
+    die "Invalid thread count in $fileName: '$usedThreads'\n"
+        unless $positive->($usedThreads);
+
+    # NOTE(review): every limit is rounded up with ceil(), exactly as before;
+    # confirm that rounding up rather than down is intended here.
+    my @limits = (ceil($maxThreads / $usedThreads));
+    push @limits, ceil($maxSharedMem / $usedSharedMem)
+        if $positive->($usedSharedMem);
+    push @limits, ceil(ceil($maxReg / $usedReg) / $usedThreads)
+        if $positive->($usedReg);
+
+    my $activeBlock = min(@limits);
+    $activeWarp = $activeBlock * ceil($usedThreads / $warpSize);
+
+    print "Active Blocks: ", $activeBlock, "\n";
+    print "Active Warps: ", $activeWarp, "\n\n\n";
+}
+
+# Return the largest accumulated edge weight reachable in the graph.
+#
+# $instructs is an array ref of hash refs. A node may carry a {children} list
+# whose entries start with [childIndex, weight]. Nodes are visited once, in
+# index order, and every outgoing edge is relaxed against the best distance
+# known so far. The result is 0 for an empty graph or one without edges.
+sub LongestPath
+{
+    my ($instructs) = @_;
+    my $last = $#$instructs;
+
+    # best distance found so far for every node
+    my @dist = (0) x ($last + 1);
+
+    foreach my $from (0 .. $last)
+    {
+        my $node = $instructs->[$from];
+        foreach my $edge (@{$node->{children}})
+        {
+            my ($to, $weight) = @$edge;
+            my $reach = $weight + $dist[$from];
+            $dist[$to] = $reach if $reach > $dist[$to];
+        }
+    }
+
+    # only nodes that exist in $instructs take part in the maximum
+    return max(0, @dist[0 .. $last]);
+}
+
+# Turn the text of one analyze block into a list of instruction hashes.
+#
+# Every line accepted by preProcessLine must be either an instruction (as
+# recognised by processAsmLine) or a label definition; anything else is fatal.
+# Label operands of jump-type instructions are then replaced by hex offsets.
+# Returns the instruction list, whose element 0 is a placeholder entry.
+sub PreprocessBlock
+{
+    my ($analyzeBlock) = @_;
+    my (@instructs, @branches, %labels);
+
+    # slot 0 is a dummy so that real instructions start at index 1
+    push @instructs, {dualCnt=>0, nodual=>1};
+
+    my $lineNum = 0;
+    foreach my $line (split "\n", $analyzeBlock)
+    {
+        # line numbers count every physical line of the block
+        $lineNum++;
+
+        next unless preProcessLine($line);
+
+        my $inst = processAsmLine($line, $lineNum);
+        unless ($inst)
+        {
+            # not an instruction, so it has to be a label definition
+            die "badly formed line at $lineNum: $line\n"
+                unless $line =~ m'^([a-zA-Z]\w*):';
+
+            # the label names the instruction that will be inserted next
+            $labels{$1} = scalar @instructs;
+            next;
+        }
+
+        # Save us from crashing the display driver
+        if (exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0)
+        {
+            die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n";
+        }
+
+        # remember where branches/jumps/calls sit so labels can be remapped
+        push @branches, scalar @instructs if exists $jumpOp{$inst->{op}};
+
+        push @instructs, $inst;
+    }
+
+    # remap labels: relative ops count from the instruction after the branch,
+    # all other jump ops count from the start of the block
+    foreach my $i (@branches)
+    {
+        my $base = exists $relOffset{$instructs[$i]{op}} ? $i + 1 : 0;
+        $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $base) * 8) & 0xffffff/e;
+    }
+
+    return @instructs;
+}
+
+# Compute an {efficiency} value for every instruction and print one table row
+# per grammar rule of each instruction's op.
+#
+# $instructs is an array ref of instruction hashes; element 0 is a placeholder
+# and only gets its dualCnt/nodual fields reset. For the others the fields
+# named in @itypes are copied from the matching grammar rule, dualCnt is set
+# from the rule's dual flag and efficiency is derived from the rule's class.
+# Dies when a rule's class is not one of the classes handled below.
+sub CalculateEfficiency
+{
+    my ($instructs) = @_;
+    print "Instructions\tDispatches\tEcompute\tEcmp\tEmem\n";
+
+    # Analyze efficiency
+    foreach my $i (0 .. $#$instructs) 
+    {
+        my $instruct = $instructs->[$i];
+        $instruct->{dualCnt} = 0;
+        $instruct->{nodual} = 1;
+
+        # index 0 is the placeholder entry; it has no op to analyze
+        next unless $i != 0;
+
+        my ($op, $inst) = @{$instructs->[$i]}{qw(op inst)};
+  
+        # NOTE(review): there is no "last" in this loop, so every grammar rule
+        # that parses the instruction is processed, not just the first one.
+        foreach my $gram (@{$grammar{$op}})
+        {
+            my $capData = parseInstruct($inst, $gram) or next;
+            @{$instruct}{@itypes} = @{$gram->{type}}{@itypes};
+            $instruct->{dualCnt} = $instruct->{dual} ? 1 : 0;
+
+            # Handle P2R and R2P specially
+            if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7})
+            {
+                # These instructions can't be dual issued
+                $instruct->{nodual} = 1;
+            }
+
+            # For pascal and maxwell
+            my $dispatches = 1;
+            my $instructType = $gram->{type}; 
+            if ($instructType->{class} eq 'x32' || $instructType->{class} eq 's2r' ||
+                $instructType->{class} eq 'qtr' || $instructType->{class} eq 'rro' ||
+                $instructType->{class} eq 'vote')
+            {
+                # one over the number of rounds needed to push a warp through the units
+                my $units = $instructType->{units};
+                $instruct->{efficiency} = 1 / ceil(($dispatches * $warpSize) / $units);
+            }
+            elsif ($instructType->{class} eq 'shift' || $instructType->{class} eq 'cmp')
+            {
+                # same as above, additionally scaled by the rule's tput value
+                my $units = $instructType->{units};
+                my $tput = $instructType->{tput};
+                $instruct->{efficiency} = 1 / (ceil(($dispatches * $warpSize) / $units) * $tput);
+            }
+            elsif ($instructType->{class} eq 'mem')
+            {
+                my $units = $instructType->{units};
+                my $memType = $capData->{type};
+                my $issue = 1;
+                # vector instruction
+                # NOTE(review): the suffix is used as a number after the dot is
+                # stripped; a non-numeric suffix would count as 0 and make the
+                # division below fail -- confirm which suffixes can occur.
+                if ($memType =~ s/^\.//g)
+                {
+                    $issue *= $memType / $warpSize;
+                }
+                # TODO(keren): cache instruction ???
+                if ($op eq 'LDG')
+                {
+                    $issue = 1;
+                }
+                $instruct->{efficiency} = 1 / ceil(($dispatches * $warpSize) / $units * $issue);
+            }
+            else
+            {
+                die "No such instruct type: ", Dumper($instruct);
+            }
+            # A dual-flagged instruction shares its slot with the previous one:
+            # both get half of this instruction's efficiency.
+            # NOTE(review): the halving is repeated for every grammar rule of the
+            # previous op that has the same class -- confirm this is intended.
+            if ($i > 1 and $instruct->{dual}) {
+                my ($prevOp) = @{$instructs->[$i - 1]}{qw(op)};
+                foreach my $prevGram (@{$grammar{$prevOp}})
+                {
+                    #TODO(keren): not only same class, but also same units
+                    if ($prevGram->{type}->{class} eq $instructType->{class}) 
+                    {
+                        #TODO(keren): ceil?
+                        $instructs->[$i - 1]->{efficiency} = $instruct->{efficiency} =
+                          1 / 2 * $instruct->{efficiency};
+                    }
+                }
+            }
+        }
+    }
+    # Second pass: print the table. The efficiency goes into the column that
+    # belongs to the rule's class; the other two columns are printed as 0.
+    foreach my $i (0 .. $#$instructs) 
+    {
+        next unless $i > 0;
+
+        my $instruct = $instructs->[$i];
+        my ($op, $inst) = @{$instructs->[$i]}{qw(op inst)};
+  
+        # NOTE(review): rows are printed for every grammar rule of the op,
+        # without checking that the rule actually parses the instruction.
+        foreach my $gram (@{$grammar{$op}})
+        {
+            my $dispatches = 1;
+            # dual-flagged instructions are indented by one tab
+            print "\t" if $instruct->{dualCnt};
+            print $inst, "\t", $dispatches, "\t";
+            my $instructType = $gram->{type}; 
+            if ($instructType->{class} eq 'x32' || $instructType->{class} eq 's2r' ||
+                $instructType->{class} eq 'qtr' || $instructType->{class} eq 'rro' ||
+                $instructType->{class} eq 'vote')
+            {
+                print $instruct->{efficiency}, "\t0\t0";
+            }
+            elsif ($instructType->{class} eq 'shift' || $instructType->{class} eq 'cmp')
+            {
+                print "0\t", $instruct->{efficiency}, "\t0";
+            }
+            elsif ($instructType->{class} eq 'mem')
+            {
+                # TODO(keren): simulate
+                print "0\t0\t", $instruct->{efficiency};
+            }
+            else
+            {
+                die "No such instruct type: ", Dumper($instruct);
+            }
+            print "\n";
+        }
+    }
+}
+
+# Build the dependency graphs used by the analysis passes.
+#
+# Parameters:
+#   $instructs    - array ref of instruction hashes; index 0 is a placeholder.
+#                   Edges are appended to each instruction's {children} list
+#                   as [childIndex, weight].
+#   $effInstructs - array ref of hashes parallel to $instructs; it receives
+#                   only the issue-order ("efficiency") edges, stored as
+#                   [childIndex, weight, class].
+#   $regMap       - register map; its __vectors entry is handed to the
+#                   vector-register helpers.
+#
+# Two kinds of edges are created: issue-order edges weighted by
+# 1 / efficiency, and Read-After-Write edges weighted by the producer's
+# latency. Dies when an instruction matches none of its op's grammar rules.
+sub AnalyzeDAG
+{
+    my ($instructs, $effInstructs, $regMap) = @_;
+    my $vectors = $regMap->{__vectors};
+    my %deps;
+
+    # efficiency dependencies
+    foreach my $i (0 .. $#$instructs)
+    {
+        next unless $i != 0;
+        my $instruct = $instructs->[$i];
+
+        # The first real instruction has no grandparent. Fall back to the
+        # placeholder at index 0: a raw $i - 2 would be -1 there, which Perl
+        # resolves to the LAST instruction and so creates a backward edge.
+        my $grandIdx = $i >= 2 ? $i - 2 : 0;
+
+        foreach my $gram (@{$grammar{$instruct->{op}}})
+        {
+            my $parent = $instructs->[$i - 1];
+            my $effParent = $effInstructs->[$i - 1];
+            my $instructType = $gram->{type};
+            if ($parent->{dualCnt} == 1) # parent dual
+            {
+                if ($instruct->{dualCnt} == 0) # links to parent and grandparent
+                {
+                    my $grandparent = $instructs->[$grandIdx];
+                    my $effGrandparent = $effInstructs->[$grandIdx];
+                    push @{$parent->{children}}, [$i, 1 / $instruct->{efficiency}];
+                    push @{$grandparent->{children}}, [$i, 1 / $instruct->{efficiency}];
+                    # The third element must be the class name, as in every other
+                    # efficiency edge; ConstructEfficiencyDAG compares it as a
+                    # string, so a hash ref here could never match.
+                    push @{$effParent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}];
+                    push @{$effGrandparent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}];
+                }
+                else # not recommend issue pattern, TODO(keren): cannot dual in this way?
+                {
+                    my $grandparent = $instructs->[$grandIdx];
+                    my $effGrandparent = $effInstructs->[$grandIdx];
+                    if ($grandparent->{dualCnt} == 0)
+                    { # links to grandparent and parent
+                        push @{$parent->{children}}, [$i, 1 / $instruct->{efficiency}];
+                        push @{$grandparent->{children}}, [$i, 1 / $instruct->{efficiency}];
+                        push @{$effParent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}];
+                        push @{$effGrandparent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}];
+                    }
+                    else
+                    { # links to parent because it is illegal
+                        push @{$parent->{children}}, [$i, 1 / $instruct->{efficiency}];
+                        push @{$effParent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}];
+                    }
+                }
+            }
+            elsif ($parent->{dualCnt} == 0) # parent single
+            {
+                if ($instruct->{dualCnt} == 0) # links to parent
+                {
+                    push @{$parent->{children}}, [$i, 1 / $instruct->{efficiency}];
+                    push @{$effParent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}];
+                }
+                else # links to grandparent
+                {
+                    my $grandparent = $instructs->[$grandIdx];
+                    my $effGrandparent = $effInstructs->[$grandIdx];
+                    push @{$grandparent->{children}}, [$i, 1 / $instruct->{efficiency}];
+                    push @{$effGrandparent->{children}}, [$i, 1 / $instruct->{efficiency}, $instructType->{class}];
+                }
+            }
+        }
+    }
+
+    # register (Read-After-Write) dependencies
+    foreach my $i (0 .. $#$instructs)
+    {
+        # index 0 is the placeholder entry
+        next unless $i != 0;
+
+        my $instruct = $instructs->[$i];
+
+        # write dependencies
+        my $match = 0;
+        foreach my $gram (@{$grammar{$instruct->{op}}})
+        {
+            my $capData = parseInstruct($instruct->{inst}, $gram) or next;
+            my (@dest, @src);
+
+            # copy over instruction types for easier access
+            @{$instruct}{@itypes} = @{$gram->{type}}{@itypes};
+
+            # A predicate prefix is treated as a source reg
+            push @src, $instruct->{predReg} if $instruct->{pred};
+
+            # Handle P2R and R2P specially
+            if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7})
+            {
+                my $list = $instruct->{op} eq 'R2P' ? \@dest : \@src;
+                my $mask = hex($capData->{i20w7});
+                foreach my $p (0..6)
+                {
+                    if ($mask & (1 << $p))
+                    {
+                        push @$list, "P$p";
+                    }
+                    # make this instruction dependent on any predicates it's not setting
+                    # this is to prevent a race condition for any predicate sets that are pending
+                    elsif ($instruct->{op} eq 'R2P')
+                    {
+                        push @src, "P$p";
+                    }
+                }
+                # These instructions can't be dual issued
+                $instruct->{nodual} = 1;
+            }
+            # Populate our register source and destination lists, skipping any zero or true values
+            foreach my $operand (grep { exists $regops{$_} } sort keys %$capData)
+            {
+                # figure out which list to populate
+                my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src;
+
+                # Filter out RZ and PT
+                my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT';
+
+                if ($capData->{$operand} ne $badVal)
+                {
+                    # add the value to list with the correct prefix
+                    push @$list,
+                        $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) :
+                        $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) :
+                        $operand eq 'CC' ? 'CC' :
+                        $operand eq 'X'  ? 'CC' :
+                        getRegNum($regMap, $capData->{$operand});
+                }
+            }
+
+            # Find Read-After-Write dependencies
+            foreach my $src (grep { exists $deps{$_} } @src)
+            {
+                # the parent should be the most recently added dest op to the stack
+                foreach my $parent (@{$deps{$src}})
+                {
+                    # add this instruction as a child of the parent
+                    # set the edge to the total latency of reg source availability
+                    #print "R $parent->{inst}\n\t\t$instruct->{inst}\n";
+                    my $latency = $src =~ m'^P\d' ? 13 : $parent->{lat};
+
+                    # If an edge to this instruction already exists (from the
+                    # efficiency pass), keep the larger of the two weights.
+                    my $find = 0;
+                    foreach my $child (@{$parent->{children}})
+                    {
+                        my $ins = $instructs->[$child->[0]];
+                        my $weight = $child->[1];
+                        if ($ins eq $instruct)
+                        {
+                            $child->[1] = $weight > $latency ? $weight : $latency;
+                            $find = 1;
+                            last;
+                        }
+                    }
+                    # parent and child do not have an efficiency dependency yet
+                    if ($find == 0)
+                    {
+                         push @{$parent->{children}}, [$i, $latency];
+                    }
+                    $instruct->{parents}++;
+
+                    # if the destination was conditionally executed, we also need to keep going back till it wasn't
+                    last unless $parent->{pred};
+                }
+            }
+
+            # For a dest reg, push it onto the write stack
+            unshift @{$deps{$_}}, $instruct foreach @dest;
+
+            # only the first grammar rule that parses is used
+            $match = 1;
+            last;
+        }
+
+        die "Unable to recognize instruction: $instruct->{inst}\n" unless $match;
+    }
+}
+
+# Project the issue-order graph onto one group of instruction classes.
+#
+# For every edge [childIndex, weight, class] found in $effInstructs, an edge
+# to the same child is appended to the node with the same index in
+# $typeInstructs. The weight is kept when the edge's class is listed in
+# $types and replaced by 0 otherwise.
+sub ConstructEfficiencyDAG
+{
+    my ($effInstructs, $typeInstructs, $types) = @_;
+
+    foreach my $idx (0 .. $#$effInstructs)
+    {
+        my $source = $effInstructs->[$idx];
+        my $target = $typeInstructs->[$idx];
+
+        foreach my $edge (@{$source->{children}})
+        {
+            my ($to, $cost, $class) = @$edge;
+
+            # edges of other classes stay in the graph but cost nothing
+            my $weight = (grep { $class eq $_ } @$types) ? $cost : 0;
+
+            push @{$target->{children}}, [$to, $weight];
+        }
+    }
+}
+
+# Bcomp: print one minus the ratio of lanes used to units offered, summed
+# over every grammar rule (of each instruction's op) whose class is one of
+# x32, s2r, qtr, rro or vote. Prints 0 when no such rule is found.
+sub CalculateBcomp
+{
+    my ($instructs) = @_;
+
+    my %computeClass = map { $_ => 1 } qw(x32 s2r qtr rro vote);
+    my ($unitsSum, $unitsUse) = (0, 0);
+
+    # index 0 is the placeholder entry, so start at 1
+    foreach my $idx (1 .. $#$instructs)
+    {
+        my ($op) = @{$instructs->[$idx]}{qw(op)};
+
+        foreach my $rule (@{$grammar{$op}})
+        {
+            my $ruleType = $rule->{type};
+            next unless exists $computeClass{$ruleType->{class}};
+
+            my $dispatches = 1;
+            $unitsSum += $ruleType->{units};
+            $unitsUse += $dispatches * $warpSize;
+        }
+    }
+
+    print "Bcomp: ", $unitsSum > 0 ? 1.0 - $unitsUse / $unitsSum : 0, "\n";
+}
+
+# Bmem: print, separately for shared and global memory instructions, one
+# minus the ratio of the access width used to the width that was available.
+#
+# $instructs is an array ref of instruction hashes (index 0 is a placeholder).
+# As a side effect the fields named in @itypes are copied from each grammar
+# rule that parses an instruction onto that instruction.
+sub CalculateBmem
+{
+    my ($instructs) = @_;
+    my ($sharedWidthSum, $sharedWidthUse) = (0, 0);
+    my ($globalWidthSum, $globalWidthUse) = (0, 0);
+
+    foreach my $i (1 .. $#$instructs)
+    {
+        my $instruct = $instructs->[$i];
+        my ($op, $inst) = @{$instruct}{qw(op inst)};
+
+        foreach my $gram (@{$grammar{$op}})
+        {
+            my $capData = parseInstruct($inst, $gram) or next;
+            @{$instruct}{@itypes} = @{$gram->{type}}{@itypes};
+
+            my $instructType = $gram->{type};
+            next unless $instructType->{class} eq 'mem';
+
+            # default 32 bit
+            my $insWidth = 4;
+
+            # A type suffix such as ".128" carries the access size in bits.
+            # Only the digits are used, so a suffix that also contains letters
+            # no longer collapses to a width of 0 (which made the divisions
+            # below fail). A suffix without digits keeps the default width.
+            my $memType = $capData->{type};
+            if (defined $memType && $memType =~ /^\./)
+            {
+                my ($bits) = $memType =~ /(\d+)/;
+                $insWidth = $bits / 8 if $bits;
+            }
+
+            if ($instructType->{type} eq 'global')
+            {
+                $globalWidthSum = $globalWidthSum + 16 * $warpSize; # LDG.128
+                #TODO cache
+                if ($op eq 'LDG')
+                {
+                    $globalWidthUse = $globalWidthUse + $insWidth * $warpSize;
+                }
+                else
+                {
+                    $globalWidthUse = $globalWidthUse + ($insWidth / ceil($insWidth / 4)) * $warpSize;
+                }
+            }
+            else #shared
+            {
+                $sharedWidthSum = $sharedWidthSum + $bankWidth * $warpSize;
+                $sharedWidthUse = $sharedWidthUse + ($insWidth / ceil($insWidth / $bankWidth)) * $warpSize;
+            }
+        }
+    }
+    print "Bshared: ", $sharedWidthSum > 0 ? 1.0 - $sharedWidthUse / $sharedWidthSum : 0, "\n";
+    print "Bglobal: ", $globalWidthSum > 0 ? 1.0 - $globalWidthUse / $globalWidthSum : 0, "\n";
+}
+
+# Bilp: compare the longest issue-order path restricted to one group of
+# instruction classes against the longest issue-order path overall ($cweff),
+# and print one minus that ratio (0 when $cweff is not positive).
+# TODO(keren): analyze more units
+sub CalculateBilp
+{
+    my ($effInstructs, $cweff) = @_;
+
+    # instruction classes, grouped the way they are analyzed together
+    my @classGroups = (
+        [qw(s2r x32 shift cmp vote)],   # x32
+        [qw(x64)],                      # x64
+        [qw(qtr rro)],                  # sp
+        [qw(mem)],                      # mem
+    );
+
+    my @groupPaths;
+    foreach my $classes (@classGroups)
+    {
+        # one empty node per instruction, filled in by ConstructEfficiencyDAG
+        my @groupInstructs = map { +{} } 0 .. $#$effInstructs;
+        ConstructEfficiencyDAG($effInstructs, \@groupInstructs, $classes);
+        push @groupPaths, LongestPath(\@groupInstructs);
+    }
+
+    my $maxeff = max(@groupPaths);
+
+    print "Bilp: ", $cweff > 0 ? 1.0 - $maxeff / $cweff : 0, "\n";
+}
+
+# Bpipe
+# Find the longest path through the dependency graph, then walk it backwards
+# and add up the edges whose weight equals the latency of the instruction
+# they start from. The sum is printed relative to
+# $cweff * $activeWarp / $scheduler (0 when that value is not positive).
+#
+# Side effect: a {prev} entry (prevInstruct, prevWeight) is stored on every
+# instruction whose distance was improved.
+sub CalculateBpipe
+{
+    my ($instructs, $cweff) = @_;
+    my $last = $#$instructs;
+
+    # longest known distance to each instruction
+    my @dist = (0) x ($last + 1);
+
+    foreach my $from (0 .. $last)
+    {
+        my $node = $instructs->[$from];
+        foreach my $edge (@{$node->{children}})
+        {
+            my ($to, $weight) = @$edge;
+            my $reach = $weight + $dist[$from];
+            next unless $reach > $dist[$to];
+
+            # remember how this instruction was reached
+            $dist[$to] = $reach;
+            my $target = $instructs->[$to];
+            $target->{prev} = {prevInstruct=>$node, prevWeight=>$weight};
+        }
+    }
+
+    my $longestPath = max(0, @dist[0 .. $last]);
+
+    # only instructions that end a longest path are walked back
+    my $longestLatency = 0;
+    foreach my $end (grep { $dist[$_] == $longestPath } 0 .. $last)
+    {
+        my $latencies = 0;
+        my $node = $instructs->[$end];
+        while (defined(my $link = $node->{prev}))
+        {
+            my ($producer, $weight) = @{$link}{qw(prevInstruct prevWeight)};
+            $latencies += $weight if $producer->{lat} == $weight;
+            $node = $producer;
+        }
+        $longestLatency = $latencies if $latencies > $longestLatency;
+    }
+
+    my $eff = $cweff * $activeWarp / $scheduler;
+    print "Bpipe: ", $eff > 0 ? $longestLatency / $eff : 0, "\n";
+}
+
+# Run the bottleneck analysis on every <ANALYZE_BLOCK> section of a source file.
+#
+# The file is preprocessed first. For each analyze block the instructions are
+# extracted, their efficiencies printed, the dependency graphs built, and the
+# predicted cycle count plus the Bcomp, Bshared/Bglobal, Bilp and Bpipe
+# figures printed.
+sub Analyze
+{
+    my ($file, $include) = @_;
+
+    my $regMap = {};
+    $file = Preprocess($file, $include, 0, $regMap, 1);
+
+    # pull out the text of every analyze block
+    my @analyzeBlocks = $file =~ /$AnalyzeRe/g;
+
+    my $blockNum = 0;
+    foreach my $block (@analyzeBlocks)
+    {
+        print "Analyze block $blockNum\n\n";
+        $blockNum++;
+
+        my @instructs = PreprocessBlock($block);
+
+        # per-instruction efficiency, also printed as a table
+        CalculateEfficiency(\@instructs);
+
+        # one empty node per instruction for the issue-order graph
+        my @effInstructs = map { +{} } @instructs;
+        AnalyzeDAG(\@instructs, \@effInstructs, $regMap);
+
+        # the longest path through the full graph is the cycle estimate
+        my $predictedCycle = LongestPath(\@instructs);
+        print "predict cycles $predictedCycle\n";
+
+        ## bottleneck analysis
+        CalculateBcomp(\@instructs);
+        CalculateBmem(\@instructs);
+        my $cweff = LongestPath(\@effInstructs);
+        CalculateBilp(\@effInstructs, $cweff);
+        CalculateBpipe(\@instructs, $cweff);
+
+        print "\n\n";
+    }
+
+    print "End analyze\n";
+}
+
+# Preprocess and Assemble a source file
+sub Assemble
+{
+    my ($file, $include, $doReuse, $nowarn) = @_;
+
+    my $regMap = {};
+    $file = Preprocess($file, $include, 0, $regMap, 0);
+    my $vectors = delete $regMap->{__vectors};
+    my $regBank = delete $regMap->{__regbank};
+
+    # initialize cubin counts
+    my $regCnt = 0;
+    my $barCnt = 0;
+
+    my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse);
+
+    # initialize the first control instruction
+    push @instructs, $ctrl = {};
+
+    foreach my $line (split "\n", $file)
+    {
+        # keep track of line nums in the physical file
+        $lineNum++;
+
+        next unless preProcessLine($line);
+
+        # match an instruction
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # Save us from crashing the display driver
+            die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n"
+                if exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0;
+
+            # track branches/jumps/calls/etc for label remapping
+            push @branches, @instructs+0 if exists $jumpOp{$inst->{op}};
+
+            # push the control code onto the control instruction
+            push @{$ctrl->{ctrl}}, $inst->{ctrl};
+
+            # now point the instruction to its associated control instruction
+            $inst->{ctrl} = $ctrl;
+
+            # add the op name and full instruction text
+            push @instructs, $inst;
+
+            # add a 4th control instruction for every 3 instructions
+            push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
+        }
+        # match a label
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            # map the label name to the index of the instruction about to be inserted
+            $labels{$1} = @instructs+0;
+        }
+        else
+        {
+            die "badly formed line at $lineNum: $line\n";
+        }
+    }
+    # add the final BRA op and align the number of instructions to a multiple of 8
+    push @{$ctrl->{ctrl}}, 0x007ff;
+    push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' };
+    while (@instructs & 7)
+    {
+        push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
+        push @{$ctrl->{ctrl}}, 0x007e0;
+        push @instructs, { op => 'NOP', inst => 'NOP;' };
+    }
+
+    # remap labels
+    foreach my $i (@branches)
+    {
+        if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1})
+            { die "instruction has invalid label: $instructs[$i]{inst}"; }
+
+        $instructs[$i]{jump} = $labels{$1};
+
+        if (exists $relOffset{$instructs[$i]{op}})
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; }
+        else
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; }
+    }
+
+    # calculate optimal register reuse
+    # This effects register bank decisions so do it before analyzing register use
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            if ($doReuse)
+            {
+                # get any vector registers for r0
+                my @r0 = getVecRegisters($vectors, $capData);
+
+                # There are 2 reuse slots per register slot
+                # The reuse hash points to most recent instruction index where register was last used in this slot
+
+                # For writes to a register, clear any reuse opportunity
+                if (@r0 && !exists $noDest{$op})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        if (my $reuse = $reuse{$slot})
+                        {
+                            # if writing with a vector op, clear all linked registers
+                            delete $reuse->{$_} foreach @r0;
+                        }
+                    }
+                }
+                # clear cache if jumping elsewhere
+                %reuse = () if exists $jumpOp{$op};
+
+                # only track register reuse for instruction types this works with
+                if ($gram->{type}{reuse})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        next unless exists $capData->{$slot};
+
+                        my $r = $capData->{$slot};
+                        next if $r eq 'RZ';
+                        next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction
+
+                        my $reuse = $reuse{$slot} ||= {};
+
+                        # if this register was previously marked for potential reuse
+                        if (my $p = $reuse->{$r})
+                        {
+                            # flag the previous instruction's ctrl reuse array slot
+                            $instructs[$p]{ctrl}{reuse}[($p & 3) - 1] |= $reuseSlots{$slot};
+
+                            #print "reuse $slot $r $instructs[$p]{inst}\n";
+                        }
+                        # list full, delete the oldest
+                        elsif (keys %$reuse > 2)
+                        {
+                            my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0];
+                            delete $reuse->{$oldest};
+                        }
+                        # mark the new instruction for potential reuse
+                        $reuse->{$r} = $i;
+                    }
+                }
+            }
+            # if reuse is disabled then pull value from code.
+            elsif ($gram->{type}{reuse})
+            {
+                $ctrl->{reuse}[($i & 3) - 1] = genReuseCode($capData);
+            }
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    # Assign registers to requested banks if possible
+    foreach my $r (sort keys %$regBank)
+    {
+        my $bank  = $regBank->{$r};
+        my $avail = $regMap->{$r};
+        foreach my $pos (0 .. $#$avail)
+        {
+            if ($bank == ($avail->[$pos] & 3))
+            {
+                # assign it, while removing the assigned register from the pool
+                $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+                last;
+            }
+        }
+    }
+
+    # calculate register live times and preferred banks for non-fixed registers.
+    # LiveTime only half implemented...
+    my (%liveTime, %pairedBanks, %reuseHistory);
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData   = parseInstruct($inst, $gram) or next;
+            my $reuseType = $gram->{type}{reuse};
+
+            # liveTimes and bank conflicts with source operands
+            my (%addReuse, %delReuse);
+            foreach my $slot (qw(r8 r20 r39))
+            {
+                my $r = $capData->{$slot} or next;
+                next if $r eq 'RZ';
+
+                my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r};
+
+                # All registers should be written prior to being read..
+                if (my $liveTime = $liveTime{$liveR})
+                {
+                    # for each read set the current instruction index as the high value
+                    $liveTime->[$#$liveTime][1] = $i;
+                    push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                }
+                else
+                {
+                    warn "register used without initialization ($r): $inst\n" unless $nowarn;
+                    push @{$liveTime{$liveR}}, [$i,$i];
+                }
+
+                # Is this register active in the reuse cache?
+                my $slotHist  = $reuseHistory{$slot} ||= {};
+                my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0;
+
+                #print "IADD3-1: $slot:$r (!$selfReuse && $regMap->{$r})\n" if $op eq 'IADD3';
+
+                # If this is an auto reg, look at the open banks.
+                # No need to look at banks if this register is in the reuse cache.
+                if (!$selfReuse && ref $regMap->{$r})
+                {
+                    # Look at other source operands in this instruction and flag what banks are being used
+                    foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39))
+                    {
+                        my $r2 = $capData->{$slot2};
+                        next if $r2 eq 'RZ' || $r2 eq $r;
+
+                        my $slotHist2 = $reuseHistory{$slot2} ||= {};
+
+                        #print "IADD3-2: $slot:$r $slot2:$r2 (!$reuseType && !$slotHist2->{$r2})\n" if $op eq 'IADD3';
+
+                        # Dont be concerned with non-reuse type instructions or
+                        # If this operand is in the reuse cache, we don't care what bank it's on.
+                        if (!$reuseType || !exists $slotHist2->{$r2})
+                        {
+                            # if the operand is also an auto-allocated register then link them
+                            # Once we choose the bank for one we want to update that choice for the other register.
+                            if (ref $regMap->{$r2})
+                            {
+                                push @{$pairedBanks{$r}{pairs}}, $r2;
+                                $pairedBanks{$r}{banks} ||= [];
+                            }
+                            # For a fixed register, calculate the bank, flag it, and update the count of banks to avoid.
+                            else
+                            {
+                                my $bank = substr($regMap->{$r2},1) & 3;
+                                #print "IADD3-3: $r2:$bank\n" if $op eq 'IADD3';
+
+                                $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++;
+                                $pairedBanks{$r}{pairs} ||= [];
+                            }
+                            # Update the total use count for this register.
+                            # This will be the number of times the register is pulled out of the bank.
+                            $pairedBanks{$r}{useCnt}++;
+                        }
+                    }
+                }
+                # update the reuse history so we know which bank conflicts we can ignore.
+                if ($reuseType)
+                {
+                    # flag these slots for addition or removal from reuseHistory
+                    if ($ctrl->{reuse}[($i & 3) - 1] & $reuseSlots{$slot})
+                        { $addReuse{$slot} = $r; }
+                    else
+                        { $delReuse{$slot} = $r; }
+                }
+            }
+            # update reuse history after we're done with the instruction (when the flag is actually in effect).
+            # we don't want to update it in the middle since that can interfere with the checks.
+            $reuseHistory{$_}{$addReuse{$_}} = 1    foreach keys %addReuse;
+            delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse;
+
+            # liveTimes for destination operands and vector registers
+            foreach my $r0 (getVecRegisters($vectors, $capData))
+            {
+                # fixed register mappings can have aliases so use the actual register value for those.
+                my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0};
+
+                # If not writing treat just like a read
+                if (exists $noDest{$op})
+                {
+                    if (my $liveTime = $liveTime{$liveR})
+                    {
+                        $liveTime->[$#$liveTime][1] = $i;
+                        push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                    }
+                    else
+                    {
+                        warn "register used without initialization ($r0): $inst\n" unless $nowarn;
+                        push @{$liveTime{$liveR}}, [$i,$i];
+                    }
+                }
+                # If writing, push a new bracket on this register's stack.
+                elsif (my $liveTime = $liveTime{$liveR})
+                {
+                    if ($i > $liveTime->[$#$liveTime][1])
+                    {
+                        push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                    }
+                }
+                else
+                {
+                    # Initialize the liveTime stack for this register.
+                    push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                }
+            }
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+    #print Dumper(\%liveTime); exit(1);
+
+    # assign unassigned registers
+    # sort by most restricted, then most used, then name
+    foreach my $r (sort {
+                    $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} ||
+                    $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} ||
+                    $a cmp $b
+                  } keys %pairedBanks)
+    {
+        my $banks = $pairedBanks{$r}{banks};
+        my $avail = $regMap->{$r};
+
+        #printf "%10s: (%d,%d) %d,%d,%d,%d, %s\n", $r, $pairedBanks{$r}{bnkCnt}, $pairedBanks{$r}{useCnt}, @{$banks}[0,1,2,3], join ',', @$avail;
+
+        # Pick a bank with zero or the smallest number of conflicts
+        BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..3))
+        {
+            # pick an available register that matches the requested bank
+            foreach my $pos (0 .. $#$avail)
+            {
+                if ($bank == ($avail->[$pos] & 3))
+                {
+                    # assign it, while removing the assigned register from the pool
+                    $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+
+                    # update bank info for any unassigned pair
+                    $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}};
+                    last BANK;
+                }
+            }
+        }
+    }
+    # Now assign any remaining to first available
+    foreach my $r (sort keys %$regMap)
+    {
+        if (ref($regMap->{$r}) eq 'ARRAY')
+        {
+            $regMap->{$r} = 'R' . shift @{$regMap->{$r}};
+        }
+    }
+    #print map "$regMap->{$_}: $_\n", sort { substr($regMap->{$a},1) <=> substr($regMap->{$b},1) } keys %$regMap;
+
+    # apply the register mapping and assemble the instructions to op codes
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        # save the original and replace the register names with numbers
+        $instructs[$i]{orig} = $instructs[$i]{inst};
+        $instructs[$i]{inst} =~ s/(?<!\.)\b(\w+)\b(?!\[)/ exists($regMap->{$1}) ? $regMap->{$1} : $1 /ge;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            # update the register count
+            foreach my $r (qw(r0 r8 r20 r39))
+            {
+                next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ';
+
+                # get numeric portion of regname
+                my $val = substr $capData->{$r}, 1;
+
+                my @r0 = getVecRegisters($vectors, $capData);
+                my @r8 = getAddrVecRegisters($vectors, $capData);
+
+                # smart enough to count vector registers for memory instructions.
+                my $regInc = $r eq 'r0' ? scalar(@r0) || 1 : 1;
+                my $regInc = $r eq 'r8' ? scalar(@r8) || 1 : 1;
+
+                if ($val + $regInc > $regCnt)
+                {
+                    $regCnt = $val + $regInc;
+                    #print "$val $regCnt $regInc\n";
+                }
+            }
+            # update the barrier resource count
+            if ($op eq 'BAR')
+            {
+                if (exists $capData->{i8w4})
+                {
+                    $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt;
+                }
+                # if a barrier value is a register, assume the maximum
+                elsif (exists $capData->{r8})
+                {
+                    $barCnt = 16;
+                }
+            }
+            # Generate the op code.
+            my ($code, $reuse) = genCode($op, $gram, $capData);
+            $instructs[$i]{code} = $code;
+
+            # cache this for final pass when we want to calculate reuse stats.
+            if ($gram->{type}{reuse})
+                { $instructs[$i]{caps} = $capData; }
+            # use the parsed value of reuse for non-reuse type instructions
+            else
+                { $ctrl->{reuse}[($i & 3) - 1] = $reuse; }
+
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    # final pass to piece together control codes
+    my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed);
+    foreach my $i (0 .. $#instructs)
+    {
+        # op code
+        if ($i & 3)
+        {
+            push @codes, $instructs[$i]{code};
+
+            if ($instructs[$i]{caps})
+            {
+                # calculate stats on registers
+                registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 3) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn);
+            }
+            if ($instructs[$i]{inst} =~ m'EXIT')
+            {
+                push @exitOffsets, (scalar(@codes)-1)*8;
+            }
+            elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)')
+            {
+                push @ctaidOffsets, (scalar(@codes)-1)*8;
+                $ctaidzUsed = 1 if $1 eq 'Z';
+            }
+        }
+        # control code
+        else
+        {
+            my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)};
+            push @codes,
+                ($ctrl->[0] <<  0) | ($ctrl->[1] << 21) | ($ctrl->[2] << 42) | # ctrl codes
+                ($ruse->[0] << 17) | ($ruse->[1] << 38) | ($ruse->[2] << 59);  # reuse codes
+        }
+    }
+
+    # return the kernel data
+    return {
+        RegCnt       => $regCnt,
+        BarCnt       => $barCnt,
+        ExitOffsets  => \@exitOffsets,
+        CTAIDOffsets => \@ctaidOffsets,
+        CTAIDZUsed   => $ctaidzUsed,
+        ConflictCnt  => $reuseHistory{conflicts},
+        ReuseCnt     => $reuseHistory{reuse},
+        ReuseTot     => $reuseHistory{total},
+        ReusePct     => ($reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0),
+        KernelData   => \@codes,
+    };
+}
+
+# Useful for testing op code coverage of existing code, extracting new codes and flags
+#
+# Reads a sass listing from $fh, re-encodes every instruction through the grammar
+# table and compares the generated op code and reuse value with the ones read from
+# the listing.
+#
+# Parameters:
+#   $fh             - input filehandle with the sass listing
+#   $printConflicts - when true the instruction text is handed to registerHealth,
+#                     otherwise an empty string is passed
+#   $all            - when true passing instructions are included in the report too
+#
+# Prints a report grouped by op plus the totals, and returns the number of failures.
+sub Test
+{
+    my ($fh, $printConflicts, $all) = @_;
+
+    my @instructs;
+    my %reuseHistory;
+    my ($pass, $fail) = (0,0);
+
+    while (my $line = <$fh>)
+    {
+        my (@ctrl, @reuse);
+
+        # skip ahead until a control line is found
+        next unless processSassCtrlLine($line, \@ctrl, \@reuse);
+
+        # one further line is read for every reuse value the control line produced
+        foreach my $fileReuse (@reuse)
+        {
+            $line = <$fh>;
+
+            my $inst = processSassLine($line) or next;
+
+            $inst->{reuse} = $fileReuse;
+            my $fileCode = $inst->{code};
+
+            if (exists $relOffset{$inst->{op}})
+            {
+                # these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump
+                $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e;
+            }
+
+            # the first grammar rule that parses the instruction decides the grade
+            my $match = 0;
+            foreach my $gram (@{$grammar{$inst->{op}}})
+            {
+                my $capData = parseInstruct($inst->{inst}, $gram) or next;
+                my @caps;
+
+                # Run in test mode to list what capture groups were captured
+                my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps);
+
+                # Detect register bank conflicts but only for reuse type instructions.
+                # If a bank conflict is avoided by a reuse flag then ignore it.
+                registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? $inst->{inst} : '') if $gram->{type}{reuse};
+
+                $inst->{caps}      = join ', ', sort @caps;
+                $inst->{codeDiff}  = $fileCode  ^ $code;
+                $inst->{reuseDiff} = $fileReuse ^ $reuse;
+
+                # compare calculated and file values
+                if ($code == $fileCode && $reuse == $fileReuse)
+                {
+                    $inst->{grade} = 'PASS';
+                    push @instructs, $inst if $all;
+                    $pass++;
+                }
+                else
+                {
+                    $inst->{grade} = 'FAIL';
+                    push @instructs, $inst;
+                    $fail++;
+                }
+                $match = 1;
+                last;
+            }
+            # no rule parsed the instruction at all: report the file values as the diff
+            unless ($match)
+            {
+                $inst->{grade}     = 'FAIL';
+                $inst->{codeDiff}  = $fileCode;
+                $inst->{reuseDiff} = $fileReuse;
+                push @instructs, $inst;
+                $fail++;
+            }
+        }
+    }
+
+    # widest instruction text per op, used to size the report column
+    my %maxLen;
+    foreach (@instructs)
+    {
+        my $len = length($_->{ins});
+        # "|| 0" avoids comparing against an undefined entry the first time an op is seen
+        $maxLen{$_->{op}} = $len if $len > ($maxLen{$_->{op}} || 0);
+    }
+
+    # start from an empty string so the first comparison does not involve undef
+    my ($lastOp, $template) = ('');
+    foreach my $inst (sort {
+        $a->{op}        cmp $b->{op}        ||
+        $a->{codeDiff}  <=> $b->{codeDiff}  ||
+        $a->{reuseDiff} <=> $b->{reuseDiff} ||
+        $a->{ins}       cmp $b->{ins}
+        } @instructs)
+    {
+        # new op: rebuild the row template and print a header line
+        if ($lastOp ne $inst->{op})
+        {
+            $lastOp   = $inst->{op};
+            $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s   %s\n";
+            printf "\n%s %-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s   %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures);
+        }
+        printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)};
+    }
+    my $reusePct = $reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0;
+
+    # "%%" prints the literal percent sign; a bare "%" followed by " (" is an invalid conversion.
+    # The counters may never have been set, so default them to 0 for "%d".
+    printf "\nRegister Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\nOp Code Coverage Totals: Pass: $pass Fail: $fail\n",
+        $reuseHistory{conflicts} || 0, $reusePct, $reuseHistory{reuse} || 0, $reuseHistory{total} || 0;
+
+    return $fail;
+}
+
+# Convert cuobjdump sass to the working format
+#
+# Reads a sass listing from $in and writes to $out:
+#   - a <CONSTANT_MAPPING> section naming the block/grid dimension constants and
+#     the kernel parameters
+#   - every instruction, with jump targets replaced by generated TARGETn labels and
+#     mapped c[0x0][...] operands replaced by their names
+#
+# Each entry of @$params is a colon separated string; the first three fields
+# (ordinal, offset, size) are used here.
+sub Extract
+{
+    my ($in, $out, $params) = @_;
+
+    # constant address text => symbolic name
+    my %nameOf;
+
+    my %dims =
+    (
+        blockDimX => 'c[0x0][0x8]',
+        blockDimY => 'c[0x0][0xc]',
+        blockDimZ => 'c[0x0][0x10]',
+        gridDimX  => 'c[0x0][0x14]',
+        gridDimY  => 'c[0x0][0x18]',
+        gridDimZ  => 'c[0x0][0x1c]',
+    );
+    print {$out} "<CONSTANT_MAPPING>\n";
+
+    for my $dim (sort keys %dims)
+    {
+        my $addr = $dims{$dim};
+        print {$out} "    $dim : $addr\n";
+        $nameOf{$addr} = $dim;
+    }
+    print {$out} "\n";
+
+    for my $spec (@$params)
+    {
+        my ($ord, $offset, $size) = split ':', $spec;
+
+        # a parameter of size 4 or less gets a single name; the offset text is used as given
+        unless ($size > 4)
+        {
+            my $name = sprintf 'param_%d', $ord;
+            my $addr = sprintf 'c[0x0][%s]', $offset;
+            $nameOf{$addr} = $name;
+            print {$out} "    $name : $addr\n";
+            next;
+        }
+
+        # a larger parameter gets one indexed name for every step of 4 (presumably bytes)
+        $offset = hex $offset;
+        for (my $word = 0; $size > 0; $word++, $size -= 4, $offset += 4)
+        {
+            my $name = sprintf 'param_%d[%d]', $ord, $word;
+            my $addr = sprintf 'c[0x0][0x%x]', $offset;
+            $nameOf{$addr} = $name;
+            print {$out} "    $name : $addr\n";
+        }
+    }
+    print {$out} "</CONSTANT_MAPPING>\n\n";
+
+    # target address => label name, numbered in order of first appearance
+    my %labels;
+    my $labelnum = 1;
+
+    my @data;
+    FILE: while (my $line = <$in>)
+    {
+        my (@ctrl, @ruse);
+        processSassCtrlLine($line, \@ctrl, \@ruse) or next FILE;
+
+        # one instruction line follows for each control value
+        CTRL: foreach my $ctrl (@ctrl)
+        {
+            $line = <$in>;
+
+            my $inst = processSassLine($line);
+            next CTRL unless $inst;
+
+            # Convert branch/jump/call addresses to labels
+            if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)')
+            {
+                my $target = hex($1);
+
+                # a BRA to itself (or to the previous slot) is the final one: stop reading the file
+                if ($inst->{op} eq 'BRA')
+                {
+                    last FILE if $target == $inst->{num} || $target == $inst->{num}-8;
+                }
+
+                # reuse the label already generated for this address, or make the next one
+                my $label = $labels{$target} ||= "TARGET" . $labelnum++;
+
+                # replace address with name
+                $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/;
+            }
+
+            # swap any mapped constant address for its name, leave unknown ones untouched
+            $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $nameOf{$1 . $2} || $1 . $2 /eg;
+
+            $inst->{ctrl} = printCtrl($ctrl);
+
+            push @data, $inst;
+        }
+    }
+
+    # second pass: only now is the address to label mapping complete
+    for my $inst (@data)
+    {
+        my $addr = $inst->{num};
+        print {$out} "$labels{$addr}:\n" if exists $labels{$addr};
+        printf {$out} "%s %5s%s\n", $inst->{ctrl}, $inst->{pred}, $inst->{ins};
+    }
+}
+
+# Pre-compiled patterns for the markup handled by Preprocess.  They are quoted with
+# single-quote delimiters, so nothing inside them is interpolated.  The tag patterns
+# are anchored to a line start (/m) and may span lines (/s).
+
+# <COMMENT> ... </COMMENT> block, up to the first closing tag that starts a line
+my $CommentRe  = qr'^[\t ]*<COMMENT>.*?^\s*</COMMENT>\n?'ms;
+# <INCLUDE file="..."/> tag; captures the file name
+my $IncludeRe  = qr'^[\t ]*<INCLUDE\s+file="([^"]+)"\s*/?>\n?'ms;
+# <CODE> ... </CODE> block; the optional digits after CODE must repeat in the closing tag.
+# Captures the digits and the body.
+my $CodeRe     = qr'^[\t ]*<CODE(\d*)>(.*?)^\s*<\/CODE\1>\n?'ms;
+# <CONSTANT_MAPPING> ... </CONSTANT_MAPPING> section; captures the body
+my $ConstMapRe = qr'^[\t ]*<CONSTANT_MAPPING>(.*?)^\s*</CONSTANT_MAPPING>\n?'ms;
+# <REGISTER_MAPPING> ... </REGISTER_MAPPING> section; captures the body
+my $RegMapRe   = qr'^[\t ]*<REGISTER_MAPPING>(.*?)^\s*</REGISTER_MAPPING>\n?'ms;
+# <SCHEDULE_BLOCK> ... </SCHEDULE_BLOCK> section; captures the body
+my $ScheduleRe = qr'^[\t ]*<SCHEDULE_BLOCK>(.*?)^\s*</SCHEDULE_BLOCK>\n?'ms;
+# inline code written as [+ ... +] or [- ... -]; captures the sign and the code
+my $InlineRe   = qr'\[(\+|\-)(.+?)\1\]'ms;
+
+# Return the whole content of an included file as one string.
+#
+#   $file    - path of the file; it is tried first exactly as given
+#   $include - array ref whose elements are passed to File::Spec->catpath in front of
+#              the bare file name for the second attempt (presumably volume and
+#              directory of the include location -- confirm against the callers)
+#
+# Dies when the file can be opened at neither location.
+sub IncludeFile
+{
+    my ($file, $include) = @_;
+
+    # only the file name part is needed, for the fallback lookup
+    my (undef, undef, $name) = File::Spec->splitpath($file);
+
+    # Three-argument open: the path is always read as a plain file name, so characters
+    # such as '>' or '|' in it can never select a write mode or start a command.
+    my $fh;
+    if (!open $fh, '<', $file)
+    {
+        open $fh, '<', File::Spec->catpath(@$include, $name) or die "Could not open file for INCLUDE: $file ($!)\n";
+    }
+
+    # no record separator: read everything in one go
+    local $/;
+    my $content = <$fh>;
+    close $fh;
+    return $content;
+}
+
+# Expand the markup of a source text and return the resulting text.
+#
+#   $file      - the source text itself (not a file name)
+#   $include   - passed through to IncludeFile for INCLUDE tags
+#   $debug     - passed through to Scheduler
+#   $regMap    - optional hash ref; when given it is filled by setRegisterMap and the
+#                REGISTER_MAPPING section is removed from the text, otherwise a
+#                private map is used and the section is kept
+#   $doAnalyze - when false, everything matching $AnalyzeRe is stripped
+#
+# The passes run in a fixed order: includes, comments, CODE sections, inline code,
+# constant mapping, register mapping, schedule blocks, analyze blocks.
+#
+# NOTE(review): the CODE and inline passes run Perl taken from the source text through
+# string eval -- only preprocess trusted input.  The evaluated code can see the lexicals
+# declared before those passes, so their names are part of the behaviour.
+sub Preprocess
+{
+    my ($file, $include, $debug, $regMap, $doAnalyze) = @_;
+
+    my $constMap = {};
+    my $removeRegMap;
+    $removeRegMap = 1 if $regMap;
+    $regMap ||= {};
+
+    # include nested files; repeat until a pass replaces nothing
+    while ($file =~ s|$IncludeRe| IncludeFile($1, $include) |eg) { }
+
+    # Strip out comments
+    $file =~ s|$CommentRe||g;
+
+    # Execute the CODE sections (old way to run code, to be deprecated)
+    while ($file =~ s|$CodeRe|
+        my $out = eval "package MaxAs::MaxAs::CODE; $2";
+        $@ ? die("CODE:\n$2\n\nError: $@\n") : $out |eg) { }
+
+    # Execute the inline code (new way); only the "+" form leaves its result in the text
+    $file =~ s|$InlineRe|
+        my ($type, $code) = ($1, $2);
+        my $out = eval "package MaxAs::MaxAs::CODE; $code";
+        $@ ? die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg;
+
+    #Pull in the constMap
+    $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg;
+
+    # apply the constant names line by line, leaving lines that start with # or // alone
+    $file = join "\n", map {
+        my $text = $_;
+        unless ($text =~ m'^\s*(?:#|//).*')
+        {
+            $text =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg;
+        }
+        $text;
+    } split "\n", $file;
+
+    # Pull in the reg map first as the Scheduler will need it to handle vector instructions
+    # Remove the regmap if we're going on to assemble
+    $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg;
+
+    # Pick out the SCHEDULE_BLOCK sections
+    my @schedBlocks = $file =~ /$ScheduleRe/g;
+
+    # Schedule them in place; blocks are numbered from 1
+    my $blockNum = 0;
+    foreach my $block (@schedBlocks)
+    {
+        $blockNum++;
+
+        # XMAD macros should only appear in SCHEDULE_BLOCKs
+        $block = replaceXMADs($block);
+
+        $block = Scheduler($block, $blockNum, $regMap, $debug);
+    }
+
+    # Replace the results, in the order the sections were picked out
+    $file =~ s|$ScheduleRe| shift @schedBlocks |eg;
+
+    # Strip out analyzeBlocks
+    # NOTE(review): $AnalyzeRe is not declared next to the other patterns above this
+    # sub -- confirm it is defined elsewhere in the file.
+    unless ($doAnalyze)
+    {
+        $file =~ s|$AnalyzeRe||eg;
+    }
+
+    return $file;
+}
+
+sub Scheduler
+{
+    my ($block, $blockNum, $regMap, $debug) = @_;
+
+    my $vectors = $regMap->{__vectors};
+    my $lineNum = 0;
+
+    my (@instructs, @comments, $ordered, $first);
+    foreach my $line (split "\n", $block)
+    {
+        # keep track of line nums in the physical file
+        $lineNum++;
+
+        unless (preProcessLine($line))
+        {
+            push @comments, $line if $line =~ m'\S';
+            next;
+        }
+
+        # match an instruction
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # if the first instruction in the block is waiting on a dep, it should go first.
+            $inst->{first}   = !$first++ && ($inst->{ctrl} & 0x1f800) ? 0 : 1;
+
+            # if the instruction has a stall of zero set, it's meant to be last (to mesh with next block)
+            #$inst->{first}   = $inst->{ctrl} & 0x0000f ? 1 : 2;
+            $inst->{exeTime} = 0;
+            $inst->{order}   = $ordered++ if $ordered;
+            $inst->{force_stall} = $inst->{ctrl} & 0xf if $inst->{comment} =~ m'FORCE';
+
+            push @instructs, $inst;
+        }
+        # match a label
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n";
+        }
+        # open an ORDERED block
+        elsif ($line =~ m'^<ORDERED>')
+        {
+            die "you cannot use nested <ORDERED> tags" if $ordered;
+            $ordered = 1;
+        }
+        # close an ORDERED block
+        elsif ($line =~ m'^</ORDERED>')
+        {
+            die "missing opening <ORDERED> for closing </ORDERED> tag" if !$ordered;
+            $ordered = 0;
+        }
+        else
+        {
+            die "badly formed line at block: $blockNum line: $lineNum: $line\n";
+        }
+    }
+    my (%writes, %reads, @ready, @schedule, $orderedParent);
+    # assemble the instructions to op codes
+    foreach my $instruct (@instructs)
+    {
+        my $match = 0;
+        foreach my $gram (@{$grammar{$instruct->{op}}})
+        {
+            my $capData = parseInstruct($instruct->{inst}, $gram) or next;
+            my (@dest, @src);
+
+            # copy over instruction types for easier access
+            @{$instruct}{@itypes} = @{$gram->{type}}{@itypes};
+
+            $instruct->{dualCnt} = $instruct->{dual} ? 1 : 0;
+
+            # A predicate prefix is treated as a source reg
+            push @src, $instruct->{predReg} if $instruct->{pred};
+
+            # Handle P2R and R2P specially
+            if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7})
+            {
+                my $list = $instruct->{op} eq 'R2P' ? \@dest : \@src;
+                my $mask = hex($capData->{i20w7});
+                foreach my $p (0..6)
+                {
+                    if ($mask & (1 << $p))
+                    {
+                        push @$list, "P$p";
+                    }
+                    # make this instruction dependent on any predicates it's not setting
+                    # this is to prevent a race condition for any predicate sets that are pending
+                    elsif ($instruct->{op} eq 'R2P')
+                    {
+                        push @src, "P$p";
+                    }
+                }
+                # These instructions can't be dual issued
+                $instruct->{nodual} = 1;
+            }
+
+            # Populate our register source and destination lists, skipping any zero or true values
+            foreach my $operand (grep { exists $regops{$_} } sort keys %$capData)
+            {
+                # figure out which list to populate
+                my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src;
+
+                # Filter out RZ and PT
+                my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT';
+
+                if ($capData->{$operand} ne $badVal)
+                {
+                    # add the value to list with the correct prefix
+                    push @$list,
+                        $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) :
+                        $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) :
+                        $operand eq 'CC' ? 'CC' :
+                        $operand eq 'X'  ? 'CC' :
+                        getRegNum($regMap, $capData->{$operand});
+                }
+            }
+            $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39});
+
+            # Find Read-After-Write dependencies
+            foreach my $src (grep { exists $writes{$_} } @src)
+            {
+                # Memory operations get delayed access to registers but not to the predicate
+                my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat};
+
+                # the parent should be the most recently added dest op to the stack
+                foreach my $parent (@{$writes{$src}})
+                {
+                    # add this instruction as a child of the parent
+                    # set the edge to the total latency of reg source availability
+                    #print "R $parent->{inst}\n\t\t$instruct->{inst}\n";
+                    my $latency = $src =~ m'^P\d' ? 13 : $parent->{lat};
+                    push @{$parent->{children}}, [$instruct, $latency - $regLatency];
+                    $instruct->{parents}++;
+
+                    # if the destination was conditionally executed, we also need to keep going back till it wasn't
+                    last unless $parent->{pred};
+                }
+            }
+
+            # Find Write-After-Read dependencies
+            foreach my $dest (grep { exists $reads{$_} } @dest)
+            {
+                # Flag this instruction as dependent to any previous read
+                foreach my $reader (@{$reads{$dest}})
+                {
+                    # no need to stall for these types of dependencies
+                    #print "W $reader->{inst} \t\t\t $instruct->{inst}\n";
+                    push @{$reader->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                # Once dependence is marked we can clear out the read list (unless this write was conditional).
+                # The assumption here is that you would never want to write out a register without
+                # subsequently reading it in some way prior to writing it again.
+                delete $reads{$dest} unless $instruct->{pred};
+            }
+
+            # Enforce instruction ordering where requested
+            if ($instruct->{order})
+            {
+                if ($orderedParent && $instruct->{order} > $orderedParent->{order})
+                {
+                    push @{$orderedParent->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                $orderedParent = $instruct;
+            }
+            elsif ($orderedParent)
+                {  $orderedParent = 0; }
+
+            # For a dest reg, push it onto the write stack
+            unshift @{$writes{$_}}, $instruct foreach @dest;
+
+            # For a src reg, push it into the read list
+            push @{$reads{$_}}, $instruct foreach @src;
+
+            # if this instruction has no dependencies it's ready to go
+            push @ready, $instruct if !exists $instruct->{parents};
+
+            $match = 1;
+            last;
+        }
+        die "Unable to recognize instruction at block: $blockNum line: $lineNum: $instruct->{inst}\n" unless $match;
+    }
+    %writes = ();
+    %reads  = ();
+
+    if (@ready)
+    {
+        # update dependent counts for sorting heuristic
+        my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" };
+
+        countUniqueDescendants($readyParent, {});
+        updateDepCounts($readyParent, {});
+
+        # sort the initial ready list
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{dualCnt} <=> $b->{dualCnt}  ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall dualCnt mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    # Process the ready list, adding new instructions to the list as we go.
+    my $clock = 0;
+    while (my $instruct = shift @ready)
+    {
+        my $stall = $instruct->{stall};
+
+        # apply the stall to the previous instruction
+        if (@schedule && $stall < 16)
+        {
+            my $prev = $schedule[$#schedule];
+
+            $stall = $prev->{force_stall} if $prev->{force_stall} > $stall;
+
+            # if stall is greater than 4 then also yield
+            # the yield flag is required to get stall counts 12-15 working correctly.
+            $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0;
+            $prev->{ctrl} |= $stall;
+            $clock += $stall;
+        }
+        # For stalls bigger than 15 we assume the user is managing it with a barrier
+        else
+        {
+            $instruct->{ctrl} &= 0x1fff0;
+            $instruct->{ctrl} |= 1;
+            $clock += 1;
+        }
+        print "$clock: $instruct->{inst}\n" if $debug;
+
+        # add a new instruction to the schedule
+        push @schedule, $instruct;
+
+        # update each child with a new earliest execution time
+        if (my $children = $instruct->{children})
+        {
+            foreach (@$children)
+            {
+                my ($child, $latency) = @$_;
+
+                # update the earliest clock value this child can safely execute
+                my $earliest = $clock + $latency;
+                $child->{exeTime} = $earliest if $child->{exeTime} < $earliest;
+
+                print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug;
+
+                # decrement parent count and add to ready queue if none remaining.
+                push @ready, $child if --$child->{parents} < 1;
+            }
+            delete $instruct->{children};
+        }
+
+        # update stall and mix values in the ready queue on each iteration
+        foreach my $ready (@ready)
+        {
+            # calculate how many instructions this would cause the just added instruction to stall.
+            $stall = $ready->{exeTime} - $clock;
+            $stall = 1 if $stall < 1;
+
+            # if using the same compute resource as the prior instruction then limit the throughput
+            if ($ready->{class} eq $instruct->{class})
+            {
+                $stall = $ready->{tput} if $stall < $ready->{tput};
+            }
+            # dual issue with a simple instruction (tput <= 2)
+            # can't dual issue two instructions that both load a constant
+            elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 && !$instruct->{nodual} &&
+                   $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const}))
+            {
+                $stall = 0;
+            }
+            $ready->{stall} = $stall;
+
+            # add an instruction class mixing huristic that catches anything not handled by the stall
+            $ready->{mix} = $ready->{class} ne $instruct->{class} || 0;
+            $ready->{mix} = 2 if $ready->{mix} && $ready->{op} eq 'R2P';
+        }
+
+        # sort the ready list by stall time, mixing huristic, dependencies and line number
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $a->{stall}   <=> $b->{stall}  ||
+            $a->{dualCnt} <=> $b->{dualCnt}  ||
+            $b->{mix}     <=> $a->{mix}    ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "\tf,ext,stl,duc,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(f exeTime stall dualCnt mix deps lineNum inst)} foreach @ready;
+        }
+
+        foreach my $ready (@ready)
+        {
+            $ready->{dualCnt} = 0 if $ready->{dualCnt} && $ready->{stall} == 1;
+        }
+    }
+
+    my $out;
+    #$out .= "$_\n" foreach @comments;
+    $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule;
+    return $out;
+}
+
+sub setConstMap
+{
+    # Fill the hash ref $constMap with the "name : value" pairs found in
+    # $constMapText (one pair per line; '#' and '//' start a comment).
+    my ($constMap, $constMapText) = @_;
+
+    for my $entry (split "\n", $constMapText)
+    {
+        # drop leading space, then any comment, then trailing space
+        s|^\s+||, s{(?:#|//).*}{}, s|\s+$|| for $entry;
+
+        # nothing left to record on blank or comment-only lines
+        next if $entry !~ m'\S';
+
+        # only the first two ':'-separated fields are used
+        my ($key, $val) = split '\s*:\s*', $entry;
+        $constMap->{$key} = $val;
+    }
+
+    return;
+}
+
+sub setRegisterMap # parse register-mapping text into $regMap (name => register, plus __vectors and __regbank sub-hashes)
+{
+    my ($regMap, $regmapText) = @_;
+    # Line forms: "nums : names" fixed 1:1 mapping, "nums ~ names" pool assigned at assembly, "num = names" one shared register.
+    my $vectors = $regMap->{__vectors} ||= {};
+    my $regBank = $regMap->{__regbank} ||= {};
+    my %aliases;
+
+    foreach my $line (split "\n", $regmapText)
+    {
+        # strip leading space
+        $line =~ s|^\s+||;
+        # strip comments
+        $line =~ s{(?:#|//).*}{};
+        # strip trailing space
+        $line =~ s|\s+$||;
+        # skip blank lines
+        next unless $line =~ m'\S';
+
+        my $auto  = $line =~ /~/;   # names draw from the listed registers at assembly time
+        my $share = $line =~ /=/;   # all names alias the single listed register
+
+        my ($regNums, $regNames) = split '\s*[:~=]\s*', $line;
+
+        my (@numList, @nameList);
+        foreach my $num (split '\s*,\s*', $regNums)
+        {
+            my ($start, $stop) = split '\s*\-\s*', $num;
+            die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop;
+            push @numList, ($start .. $stop||$start);
+        }
+        foreach my $fullName (split '\s*,\s*', $regNames)
+        {
+            if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$')
+            {
+                my ($name1, $name2, $bank) = ($1, $3, $4);
+                foreach (split '\s*\|\s*', $2)
+                {
+                    my ($start, $stop) = split '\s*\-\s*';
+                    foreach my $r (map "$name1$_$name2", $start .. $stop||$start)
+                    {
+                        # define an alias for use in vector instructions that omits the number portion
+                        $aliases{$r} = "$name1$name2" unless exists $aliases{$r};
+                        push @nameList, $r;
+                        $regBank->{$r} = $bank if $auto && defined $bank;
+                        warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank;
+                    }
+                }
+            }
+            elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$')
+            {
+                push @nameList, $1;
+                $regBank->{$1} = $2 if $auto && defined $2;
+                warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2;
+            }
+            else
+            {
+                die "Bad register name: '$fullName' at: $line\n";
+            }
+        }
+        die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList;
+        die "Missmatched register mapping at: $line\n" if $share && @numList > 1;
+
+        # detect if this list is monotonically ascending with no gaps (every adjacent pair, including the last, is compared)
+        my $i = 0;
+        while ($i < $#numList)
+        {
+            last if $numList[$i] + 1 != $numList[$i+1];
+            $i++;
+        }
+        my $ascending = @numList > 1 && $i == $#numList;
+
+        foreach my $n (0..$#nameList)
+        {
+            die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]};
+
+            if ($auto)
+            {
+                # assign possible values to be assigned on assembly
+                $regMap->{$nameList[$n]} = \@numList;
+            }
+            elsif ($share)
+            {
+                # each name shares the same single register
+                $regMap->{$nameList[$n]} = 'R' . $numList[0];
+            }
+            else
+            {
+                $regMap->{$nameList[$n]} = 'R' . $numList[$n];
+                # flag any even register as a potential vector
+                if ($ascending && ($numList[$n] & 1) == 0)
+                {
+                    # constrain potential range to vector alignment
+                    my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3);
+                    if ($end <= $#nameList)
+                    {
+                        $vectors->{$nameList[$n]} = [ @nameList[$n .. $end] ];
+                        #setup an alias for the base name without the number
+                        if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}})
+                        {
+                            $regMap->{$aliases{$nameList[$n]}}  = $regMap->{$nameList[$n]};
+                            $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]};
+                            delete $aliases{$nameList[$n]};
+                        }
+                    }
+                }
+            }
+        }
+    }
+    #print Dumper($regMap); exit(1);
+}
+
+sub preProcessLine
+{
+    # Trim leading whitespace from the caller's own line (through the @_
+    # alias), then report whether it holds anything besides a comment.
+    s|^\s+|| for $_[0];
+
+    # work on a private copy so the caller keeps its comment text
+    my ($code) = @_;
+
+    $code =~ s{(?:#|//).*}{};
+
+    # matches only when some non-space, non-comment text is left
+    return $code =~ m'\S';
+}
+
+# Traverse the dependency graph and collect, per node, the set of unique descendants
+# (keyed by lineNum) in $node->{deps}; returns the node's own lineNum followed by those keys.
+sub countUniqueDescendants
+{
+    my ($node, $edges) = @_;
+    # $edges records every "parent^child" lineNum pair already followed, so each edge is walked once
+    #print "P:$node->{inst}\n";
+
+    if (my $children = $node->{children})
+    {
+        foreach my $child (grep $_->[1], @$children) # skip WaR deps and traversed edges
+        {
+            next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++;
+            # every lineNum the child reports (itself and its descendants) becomes a key of this node's deps
+            $node->{deps}{$_}++ foreach countUniqueDescendants($child->[0], $edges);
+        }
+        foreach my $child (grep !$_->[1], @$children) # WaR deps
+        {
+            next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++;
+            # still walk the subtree so its nodes get their own deps, but discard the returned list
+            1 foreach countUniqueDescendants($child->[0], $edges);
+        }
+    }
+    else
+    {
+        return $node->{lineNum}; # no children: the node reports only itself
+    }
+    return ($node->{lineNum}, keys %{$node->{deps}});
+}
+# Collapse each node's {deps} hash into a plain count so it can be sorted on.
+sub updateDepCounts
+{
+    my ($node, $edges) = @_;
+    my $kids = $node->{children} || [];
+
+    # recurse into every child, following each parent^child edge only once
+    for my $edge (@$kids)
+    {
+        my $kid = $edge->[0];
+        my $key = "$node->{lineNum}^$kid->{lineNum}";
+        updateDepCounts($kid, $edges) unless $edges->{$key}++;
+    }
+
+    # a hash ref becomes its key count; anything else is forced numeric
+    $node->{deps} = ref $node->{deps} ? keys %{$node->{deps}} : $node->{deps}+0;
+}
+
+# Detect register bank conflicts among the r8/r20/r39 operand slots of one instruction and update reuse stats; returns the conflict count.
+sub registerHealth
+{
+    my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_;
+    # $reuseHistory is updated in place: {total}, {reuse} and {conflicts} counters plus one reuse set per slot
+    my (@banks, @conflicts);
+    # @banks holds the last register seen in each bank; @conflicts collects the registers that collided
+    foreach my $slot (qw(r8 r20 r39))
+    {
+        my $r = $capData->{$slot} or next;
+        next if $r eq 'RZ';
+
+        my $slotHist = $reuseHistory->{$slot} ||= {};
+
+        $reuseHistory->{total}++;
+
+        # if this register is in active reuse then ignore for bank conflict checking.
+        if (exists $slotHist->{$r})
+        {
+            $reuseHistory->{reuse}++;
+        }
+        else
+        {
+            # extract number from reg and take the modulo-4 value.  This is the bank id.
+            my $bank = substr($r,1) & 3;
+
+            # check for conflict
+            if ($banks[$bank] && $banks[$bank] ne $r)
+            {
+                push @conflicts, $banks[$bank] if !@conflicts; # the first conflict also reports the register already in the bank
+                push @conflicts, $r;
+
+                $reuseHistory->{conflicts}++;
+            }
+            $banks[$bank] = $r;
+        }
+
+        # update the history: remember the register while its reuse flag is set, forget it otherwise (%reuseSlots is declared outside this sub)
+        if ($reuseFlags & $reuseSlots{$slot})
+            { $slotHist->{$r} = 1; }
+        else
+            { delete $slotHist->{$r};  }
+    }
+    if ($inst && @conflicts && !$nowarn) # report only when instruction text was supplied and warnings are not suppressed
+    {
+        printf "CONFLICT at 0x%04x (%s): $inst\n", $instAddr, join(',', @conflicts);
+    }
+    return scalar @conflicts;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+MaxAs::MaxAs - Assembler for NVIDIA Maxwell architecture
+
+=head1 SYNOPSIS
+
+    maxas.pl [opts]
+
+=head1 DESCRIPTION
+
+See the documentation at: https://github.com/NervanaSystems/maxas
+
+=head1 SEE ALSO
+
+See the documentation at: https://github.com/NervanaSystems/maxas
+
+
+=head1 AUTHOR
+
+Scott Gray, E<lt>sgray@nervanasys.comE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Scott Gray
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+=cut
diff --git a/Assembler/MaxAs/lib/MaxAs/MaxAsGrammar.pm b/Assembler/MaxAs/lib/MaxAs/MaxAsGrammar.pm
new file mode 100644
index 0000000..2298442
--- /dev/null
+++ b/Assembler/MaxAs/lib/MaxAs/MaxAsGrammar.pm
@@ -0,0 +1,1478 @@
+package MaxAs::MaxAsGrammar;
+
+use strict;
+use Carp;
+use Exporter;
+use Data::Dumper;
+our @ISA = qw(Exporter);
+
+our @EXPORT = qw(
+    %grammar %flags
+    parseInstruct genCode genReuseCode
+    processAsmLine processSassLine processSassCtrlLine
+    replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters
+);
+
+require 5.10.0;
+
+# Helper functions for operands
+sub getI # encode an integer immediate: parse $orig, range-check it against $mask, return it shifted left by $pos bits
+{
+    my ($orig, $pos, $mask) = @_;
+    my $val = $orig;
+    my $neg = $val =~ s|^\-||;
+
+    # parse out our custom index immediates for addresses
+    if ($val  =~ m'^(\d+)[xX]<([^>]+)>')
+    {
+        # allow any perl expression and multiply result by leading decimal.
+        # also allow global scalar variables in the expression.
+        my $mul = $1;
+        my $exp = $2;
+        # strip leading zeros (don't interpret numbers as octal)
+        $exp =~ s/(?<!\d)0+(?=[1-9])//g;
+        my @globals = $exp =~ m'\$\w+'g;
+        my $our = @globals ? ' our (' . join(',',@globals) . ');' : '';
+        $val = $mul * eval "package MaxAs::MaxAs::CODE;$our $exp"; # NOTE(review): $@ is not checked, so a failing expression evaluates to 0
+        #print "$val = $mul x $exp\n"; # if $our;
+    }
+    # hexadecimal value; the 0X prefix is accepted as well because the $hex operand rule allows it
+    elsif ($val  =~ m'^0[xX][0-9a-zA-Z]+')
+    {
+        $val = hex($val);
+    }
+    # otherwise val is a simple decimal value that doesn't need to be modified
+
+    if ( $neg )
+    {
+        # if the mask removes the sign bit the "neg" flag adds it back on the code somewhere else
+        $val = -$val;
+        $val &= $mask;
+    }
+    if (($val & $mask) != $val) # dies when the value does not fit in the field described by $mask
+    {
+        die sprintf "Immediate value out of range(0x%x): 0x%x ($orig)\n", $mask, $val;
+    }
+    return $val << $pos;
+}
+sub getF # encode a floating point immediate: $type is the pack template ('f' or 'd'), $trunc the number of low bits dropped
+{
+    my ($val, $pos, $type, $trunc) = @_;
+    # hexadecimal value (0x or 0X prefix, as the $hex operand rule allows) is taken as the raw bit pattern
+    if ($val  =~ m'^0[xX][0-9a-zA-Z]+')
+    {
+        $val = hex($val);
+    }
+    # support infinity; the untruncated 32 bit form has no separate neg flag, so it must carry the sign bit itself
+    elsif ($val =~ m'INF'i)
+    {
+        $val = $trunc ? ($type eq 'f' ? 0x7f800 : 0x7ff00) : ($val =~ m'^\-' ? 0xff800000 : 0x7f800000);
+    }
+    else
+    {
+        $val = unpack(($type eq 'f' ? 'L' : 'Q'), pack $type, $val);
+
+        # strip off sign bit if truncating.  It will be added elsewhere in the code by the flag capture.
+        $val = ($val >> $trunc) & 0x7ffff if $trunc;
+    }
+    return $val << $pos;
+}
+sub getR
+{
+    # Turn a register name (R<n> with n below 255, or RZ) into its operand
+    # field: the register number shifted left by $pos bits.
+    my ($val, $pos) = @_;
+
+    my ($id) = $val =~ m'^R(\d+|Z)$';
+    die "Bad register name found: $val\n" unless defined $id && $id < 255;
+
+    # RZ is stored as register number 0xff
+    my $num = $id eq 'Z' ? 0xff : $id;
+    return $num << $pos;
+}
+sub getP
+{
+    # Turn a predicate name (P0-P6, or PT) into its operand field: the
+    # predicate number shifted left by $pos bits.
+    my ($val, $pos) = @_;
+
+    my ($id) = $val =~ m'^P(\d|T)$';
+    die "Bad predicate name found: $val\n" unless defined $id && $id < 7;
+
+    # PT is stored as predicate number 7
+    my $num = $id eq 'T' ? 7 : $id;
+    return $num << $pos;
+}
+sub getC { ((hex($_[0]) >> 2) & 0x7fff) << 20 } # constant operand: hex string divided by 4, kept to 15 bits, placed at bit 20
+
+# Map operand names to subs that turn the captured operand text into its value, already shifted into position in the op code.
+my %operands =
+(
+    p0      => sub { getP($_[0], 0)  },
+    p3      => sub { getP($_[0], 3)  },
+    p12     => sub { getP($_[0], 12) },
+    p29     => sub { getP($_[0], 29) },
+    p39     => sub { getP($_[0], 39) },
+    p45     => sub { getP($_[0], 45) },
+    p48     => sub { getP($_[0], 48) },
+    p58     => sub { getP($_[0], 58) },
+    r0      => sub { getR($_[0], 0)  },
+    r8      => sub { getR($_[0], 8)  },
+    r20     => sub { getR($_[0], 20) },
+    r28     => sub { getR($_[0], 28) },
+    r39s20  => sub { getR($_[0], 39) },
+    r39     => sub { getR($_[0], 39) },
+    r39a    => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to wipe it out, register must be in sequence with r20
+    c20     => sub { getC($_[0])     }, # getC takes no position: the value always lands at bit 20
+    c39     => sub { getC($_[0])     }, # encoded exactly like c20
+    c34     => sub { hex($_[0]) << 34 },
+    c36     => sub { hex($_[0]) << 36 },
+    f20w32  => sub { getF($_[0], 20, 'f')        }, # full 32 bit float, no truncation
+    f20     => sub { getF($_[0], 20, 'f', 12)    }, # float with the low 12 bits dropped
+    d20     => sub { getF($_[0], 20, 'd', 44)    }, # double with the low 44 bits dropped
+    i8w4    => sub { getI($_[0], 8,  0xf)        },
+    i20     => sub { getI($_[0], 20, 0x7ffff)    },
+    i20w6   => sub { getI($_[0], 20, 0x3f)       },
+    i20w7   => sub { getI($_[0], 20, 0x7f)       },
+    i20w8   => sub { getI($_[0], 20, 0xff)       },
+    i20w12  => sub { getI($_[0], 20, 0xfff)      },
+    i20w24  => sub { getI($_[0], 20, 0xffffff)   },
+    i20w32  => sub { getI($_[0], 20, 0xffffffff) },
+    i31w4   => sub { getI($_[0], 31, 0xf)        },
+    i34w13  => sub { getI($_[0], 34, 0x1fff)     },
+    i36w20  => sub { getI($_[0], 36, 0xfffff)    },
+    i39w8   => sub { getI($_[0], 39, 0xff)       },
+    i28w8   => sub { getI($_[0], 28, 0xff)       },
+    i28w20  => sub { getI($_[0], 28, 0xfffff)    },
+    i48w8   => sub { getI($_[0], 48, 0xff)       },
+    i51w5   => sub { getI($_[0], 51, 0x1f)       },
+    i53w5   => sub { getI($_[0], 53, 0x1f)       },
+);
+
+# Rules for operands and their closely tied flags. Operand captures reuse the key names of %operands (r8, p39, i20w24, ...).
+my $hex     = qr"0[xX][0-9a-fA-F]+";
+my $iAddr   = qr"\d+[xX]<[^>]+>";
+my $immed   = qr"$hex|$iAddr|\d+"o;
+my $reg     = qr"[a-zA-Z_]\w*"; # must start with letter or underscore
+my $p       = qr"P[0-6T]";
+my $noPred  = qr"(?<noPred>)"; # matches the empty string; only sets the noPred capture
+my $pred    = qr"\@(?<predNot>\!)?P(?<predNum>[0-6]) ";
+my $p0      = qr"(?<p0>$p)"o;
+my $p3      = qr"(?<p3>$p)"o;
+my $p12     = qr"(?<p12not>\!)?(?<p12>$p)"o;
+my $p29     = qr"(?<p29not>\!)?(?<p29>$p)"o;
+my $p39     = qr"(?<p39not>\!)?(?<p39>$p)"o;
+my $p45     = qr"(?<p45>$p)"o;
+my $p48     = qr"(?<p48>$p)"o;
+my $p58     = qr"(?<p58>$p)"o;
+my $r0      = qr"(?<r0>$reg)";
+my $r0cc    = qr"(?<r0>$reg)(?<CC>\.CC)?";
+my $r8      = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1|F32))?(?<reuse1>\.reuse)?";
+my $r20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse2>\.reuse)?";
+my $r28     = qr"(?<r28>$reg)";
+my $r39s20  = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r39s20>(?<r20>$reg))\|?(?:\.(?<r39part>H0|H1))?(?<reuse2>\.reuse)?"; # the same register is captured as both r39s20 and r20
+my $r39     = qr"(?<r39neg>\-)?(?<r39>$reg)(?:\.(?<r39part>H0|H1|H0_H0|H1_H1|F32))?(?<reuse3>\.reuse)?";
+my $r39a    = qr"(?<r39a>(?<r39>$reg))(?<reuse3>\.reuse)?"; # the same register is captured as both r39a and r39
+my $c20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3))?"o;
+my $c20x    = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20partx>H0|H1|B0|B1|B2|B3))?"o; # as $c20, but the part suffix is captured as r20partx
+my $c20s39  = qr"(?<r39neg>\-)?c\[(?<c34>$hex)\]\s*\[(?<c39>$hex)\]"o;
+my $f20w32  = qr"(?<f20w32>(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))";
+my $f20     = qr"(?<f20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;
+my $d20     = qr"(?<d20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;
+my $i8w4    = qr"(?<i8w4>$immed)"o;
+my $i20     = qr"(?<i20>(?<neg>\-)?$immed)(?<r20neg>\.NEG)?"o;
+my $i20w6   = qr"(?<i20w6>$immed)"o;
+my $i20w7   = qr"(?<i20w7>$immed)"o;
+my $i20w8   = qr"(?<i20w8>$immed)"o;
+my $i20w12  = qr"(?<i20w12>$immed)"o;
+my $i20w24  = qr"(?<i20w24>\-?$immed)"o;
+my $i20w32  = qr"(?<i20w32>\-?$immed)"o;
+my $i39w8   = qr"(?<i39w8>\-?$immed)"o;
+my $i28w8   = qr"(?<i28w8>$immed)"o;
+my $i28w20  = qr"(?<i28w20>\-?$immed)"o;
+my $i31w4   = qr"(?<i31w4>$immed)"o;
+my $i34w13  = qr"(?<i34w13>$immed)"o;
+my $i36w20  = qr"(?<i36w20>$immed)"o;
+my $i48w8   = qr"(?<i48w8>$immed)"o;
+my $i51w5   = qr"(?<i51w5>$immed)"o;
+my $i53w5   = qr"(?<i53w5>$immed)"o;
+my $ir20    = qr"$i20|$r20"o; # the combined rules below try the alternatives in the order written
+my $cr20    = qr"$c20|$r20"o;
+my $icr20   = qr"$i20|$c20|$r20"o;
+my $fcr20   = qr"$f20|$c20|$r20"o;
+my $cr39    = qr"$c20s39|$r39"o;
+my $dr20    = qr"$d20|$r20"o;
+
+# Instruction specific rules for capturing various flags; each rule stores what it matched under the capture name shown.
+my $u32   = qr"(?<U32>\.U32)?";
+my $ftz   = qr"(?<FTZ>\.FTZ)?";
+my $fmz   = qr"(?<FMZ>\.FMZ)?";
+my $sat   = qr"(?<SAT>\.SAT)?";
+my $rnd   = qr"(?:\.(?<rnd>RN|RM|RP|RZ))?";
+my $round = qr"(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?";
+my $fcmp  = qr"(?<cmp>\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)"; # the trailing empty alternative makes the comparison optional
+my $icmp  = qr"\.(?<cmp>LT|EQ|LE|GT|NE|GE)";
+my $bool  = qr"\.(?<bool>AND|OR|XOR|PASS_B)";
+my $bool2 = qr"\.(?<bool2>AND|OR|XOR)";
+my $func  = qr"\.(?<func>COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)";
+my $rro   = qr"\.(?<func>SINCOS|EX2)";
+my $add3  = qr"(?:\.(?<type>X|RS|LS))?";
+my $lopz  = qr"(?:\.(?<z>NZ|Z) $p48,|(?<noz>))"o;
+my $X     = qr"(?<X>\.X)?";
+my $tld   = qr"(?<NODEP>NODEP\.)?(?:(?<reuse1>T)|(?<reuse2>P))";
+my $chnls = qr"(?<chnls>R|RGBA)";
+my $sr    = qr"SR_(?<sr>\S+)";
+my $shf   = qr"(?<W>\.W)?(?:\.(?<type>U64|S64))?(?<HI>\.HI)?";
+my $xmad  = qr"(?:\.(?<type1>U16|S16))?(?:\.(?<type2>U16|S16))?(?:\.(?<mode>MRG|PSL\.CLO|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?";
+my $xmadc = qr"(?:\.(?<type1>U16|S16))?(?:\.(?<type2>U16|S16))?(?:\.(?<modec>MRG|PSL\.CLO|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?"; # as $xmad, but the mode is captured as modec
+my $vmad8 = qr"\.(?<sign1>[SU])(?<size1>8|16)\.(?<sign2>[SU])(?<size2>8|16)(?<PO>\.PO)?(?<SHR_7>\.SHR_7)?(?<SHR_15>\.SHR_15)?(?<SAT>\.SAT)?";
+my $vmad16= qr"\.(?<sign1>[SU])(?<size1>16)\.(?<sign2>[SU])(?<size2>16)";
+my $hilo  = qr"(?:\.(?<mode>XHI|XLO))?";
+my $vaddType = qr"(?:\.(?<UD>UD))?(?:\.(?<SD>SD))?(?:\.(?<sign1>[SU])(?<size1>8|16|32))?(?:\.(?<sign2>[SU])(?<size2>8|16|32))?";
+my $vaddMode = qr"(?:\.(?<mode>MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?";
+my $vmnmx = qr"(?:\.(?<MX>MX))?";
+my $x2x   = qr"\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)";
+my $prmt  = qr"(?:\.(?<mode>F4E|B4E|RC8|ECL|ECR|RC16))?";
+my $shfl  = qr"\.(?<mode>IDX|UP|DOWN|BFLY)";
+my $bar   = qr"\.(?<mode>SYNC|ARV|RED)(?:\.(?<red>POPC|AND|OR))? (?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?(<r20>)|(?<nor20>))(?(<red>), $p39|(?<nop39>))"o;
+my $b2r   = qr"\.RESULT $r0(?:, $p45|(?<nop45>))"o;
+my $dbar  = qr"(?<SB>SB0|SB1|SB2|SB3|SB4|SB5)";
+my $dbar2 = qr"(?<db5>5)?,?(?<db4>4)?,?(?<db3>3)?,?(?<db2>2)?,?(?<db1>1)?,?(?<db0>0)?";
+my $mbar  = qr"\.(?<mode>CTA|GL|SYS)";
+my $addr  = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i20w24)?\]"o; # [reg], [reg + imm] or [imm]; nor8 is set when no register is given
+my $addr2 = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i28w20)?\]"o; # as $addr, with the offset captured as i28w20
+my $ldc   = qr"c\[(?<c36>$hex)\]\s*$addr"o;
+my $atom  = qr"(?<E>\.E)?(?:\.(?<mode>ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?<type>|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)";
+my $vote  = qr"\.(?<mode>ALL|ANY|EQ)"o;
+my $memType  = qr"(?<type>\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)"; # the empty alternative lets the type suffix be omitted
+my $memCache = qr"(?<E>\.E)?(?<U>\.U)?(?:\.(?<cache>CG|CI|CS|CV|IL|WT))?";
+my $dptype = qr"(?:\.(?<type1>U32|S32))?(?:\.(?<type2>U32|S32))?";
+my $dpmode = qr"\.(?<mode>LO|HI)";
+my $hmode  = qr"(?:\.(?<mode>F32|MRG_H0|MRG_H1))?$ftz";
+
+# class: hardware resource that shares characteristics with types
+# lat  : pipeline depth where relevant, placeholder for memory ops
+# blat : barrier latency, typical fetch time for memory operations. Highly variable.
+# rlat : operand read latency for memory ops
+# rhold: clock cycles that a memory op typically holds onto a register before it's free to be written by another op.
+# tput : throughput, clock cycles an op takes when two ops of the same class are issued in succession.
+# dual : whether this instruction type can be dual issued
+# reuse: whether this instruction type accepts register reuse flags.
+# units, type: also set in the tables below but not described in this legend -- TODO(review): document them
+# Some of these values are guesses and need to be updated from micro benchmarks.
+# We may need to split these classes up further.
+# @TODO(keren): what instructions are used by SFUs
+my $s2rT  = {class => 's2r',   lat => 2,   blat => 25,  rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0, units => 32};
+my $smemT = {class => 'mem',   lat => 6,   blat => 30,  rlat => 2, rhold => 20, tput => 1,   dual => 1, reuse => 0, units => 8, type => 'shared'};
+my $gmemT = {class => 'mem',   lat => 200,   blat => 200, rlat => 4, rhold => 20, tput => 1,   dual => 1, reuse => 0, units => 8, type => 'global'};
+my $x32T  = {class => 'x32',   lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 1, units => 32};
+my $x64T  = {class => 'x64',   lat => 2,   blat => 128, rlat => 0, rhold => 0,  tput => 128, dual => 0, reuse => 1, units => 16};
+my $shftT = {class => 'shift', lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1, units => 32};
+my $cmpT  = {class => 'cmp',   lat => 13,  blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1, units => 32};
+my $qtrT  = {class => 'qtr',   lat => 8,   blat => 0,   rlat => 4, rhold => 0,  tput => 1,   dual => 1, reuse => 0, units => 8};
+my $rroT  = {class => 'rro',   lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0, units => 8};
+my $voteT = {class => 'vote',  lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0, units => 32};
+
+
+# Create map of op names to rules
+our %grammar =
+(
+    #Floating Point Instructions
+    FADD     => [ { type => $x32T,  code => 0x5c58000000000000, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $fcr20;"o,               } ],
+    FADD32I  => [ { type => $x32T,  code => 0x0800000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FCHK     => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o,                     } ], #Partial?
+    FCMP     => [ { type => $cmpT,  code => 0x5ba0000000000000, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $fcr20, $r39;"o,            } ],
+    FFMA     => [
+                  { type => $x32T,  code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $fcr20, $r39;"o,         },
+                  { type => $x32T,  code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o,     },
+                ],
+    FMNMX    => [ { type => $shftT, code => 0x5c60000000000000, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $fcr20, $p39;"o,                } ],
+    FMUL     => [ { type => $x32T,  code => 0x5c68000000000000, rule => qr"^$pred?FMUL$ftz$rnd$sat $r0, $r8, $fcr20;"o,               } ],
+    FMUL32I  => [ { type => $x32T,  code => 0x1e00000000000000, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FSET     => [ { type => $shftT, code => 0x5800000000000000, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $fcr20, $p39;"o,       } ],
+    FSETP    => [ { type => $cmpT,  code => 0x5bb0000000000000, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ],
+    MUFU     => [ { type => $qtrT,  code => 0x5080000000000000, rule => qr"^$pred?MUFU$func $r0, $r8;"o,                              } ],
+    RRO      => [ { type => $rroT,  code => 0x5c90000000000000, rule => qr"^$pred?RRO$rro $r0, $r20;"o,                               } ],
+    DADD     => [ { type => $x64T,  code => 0x5c70000000000000, rule => qr"^$pred?DADD$rnd $r0, $r8, $dr20;"o,                        } ],
+    DFMA     => [ { type => $x64T,  code => 0x5b70000000000000, rule => qr"^$pred?DFMA$rnd $r0, $r8, $dr20, $r39;"o,                  } ],
+    DMNMX    => [ { type => $cmpT,  code => 0x5c50000000000000, rule => qr"^$pred?DMNMX $r0, $r8, $dr20, $p39;"o,                     } ],
+    DMUL     => [ { type => $x64T,  code => 0x5c80000000000000, rule => qr"^$pred?DMUL$rnd $r0, $r8, $dr20;"o,                        } ],
+    DSET     => [ { type => $cmpT,  code => 0x5900000000000000, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o,            } ],
+    DSETP    => [ { type => $cmpT,  code => 0x5b80000000000000, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o,      } ],
+    FSWZADD  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o,                                    } ], #TODO
+
+    HADD2     => [ { type => $x32T,  code => 0x5d10000000000000, rule => qr"^$pred?HADD2$hmode$fmz$ftz$sat $r0, $r8, $r20;"o,               } ],
+    HMUL2     => [ { type => $x32T,  code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$hmode$fmz$ftz$sat $r0, $r8, $r20;"o,               } ],
+    HFMA2     => [ { type => $x32T,  code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$hmode$fmz$ftz$sat $r0, $r8, $r20, $r39;"o,         } ],
+    HSETP2    => [ { type => $cmpT,  code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial
+
+    #Integer Instructions
+    BFE       => [ { type => $shftT,  code => 0x5c01000000000000, rule => qr"^$pred?BFE$u32 $r0, $r8, $icr20;"o,                          } ],
+    BFI       => [ { type => $shftT,  code => 0x5bf0000000000000, rule => qr"^$pred?BFI $r0, $r8, $ir20, $cr39;"o,                        } ],
+    FLO       => [ { type => $s2rT,   code => 0x5c30000000000000, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o,                              } ],
+    IADD      => [ { type => $x32T,   code => 0x5c10000000000000, rule => qr"^$pred?IADD$sat$X $r0cc, $r8, $icr20;"o,                         } ],
+    IADD32I   => [ { type => $x32T,   code => 0x1c00000000000000, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o,                         } ],
+    IADD3     => [ { type => $x32T,   code => 0x5cc0000000000000, rule => qr"^$pred?IADD3$add3 $r0cc, $r8, $icr20, $r39;"o,                 } ],
+    ICMP      => [ { type => $cmpT,   code => 0x5b41000000000000, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $icr20, $r39;"o,              } ],
+    IMNMX     => [ { type => $shftT,  code => 0x5c21000000000000, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $icr20, $p39;"o,                  } ],
+    ISET      => [ { type => $shftT,  code => 0x5b51000000000000, rule => qr"^$pred?ISET$icmp$u32$X$bool $r0, $r8, $icr20, $p39;"o,       } ],
+    ISETP     => [ { type => $cmpT,   code => 0x5b61000000000000, rule => qr"^$pred?ISETP$icmp$u32$X$bool $p3, $p0, $r8, $icr20, $p39;"o, } ],
+    ISCADD    => [ { type => $shftT,  code => 0x5c18000000000000, rule => qr"^$pred?ISCADD $r0, $r8, $icr20, $i39w8;"o,                   } ],
+    ISCADD32I => [ { type => $shftT,  code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o,               } ],
+    LEA       => [
+                   { type => $cmpT,   code => 0x5bd0000000000000, rule => qr"^$pred?LEA $p48, $r0cc, $r8, $icr20;"o,                      },
+                   { type => $shftT,  code => 0x5bd7000000000000, rule => qr"^$pred?LEA $r0cc, $r8, $icr20, $i39w8;"o,                    },
+                   { type => $shftT,  code => 0x5bdf004000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $r20, $r39, $i28w8;"o,          },
+                   { type => $shftT,  code => 0x0a07000000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $c20, $r39, $i51w5;"o,          },
+                 ],
+    LOP       => [ { type => $x32T,   code => 0x5c40000000000000, rule => qr"^$pred?LOP$bool$lopz $r0, $r8, (?<INV>~)?$icr20(?<INV>\.INV)?;"o, } ],
+    LOP32I    => [ { type => $x32T,   code => 0x0400000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o,                     } ],
+    LOP3      => [
+                   { type => $x32T,   code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o,            },
+                   { type => $x32T,   code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o,            },
+                 ],
+    POPC      => [ { type => $s2rT,   code => 0x5c08000000000000, rule => qr"^$pred?POPC $r0, $r20;"o,                                    } ],
+    SHF       => [
+                   { type => $shftT,  code => 0x5bf8000000000000, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $ir20, $r39;"o,                  },
+                   { type => $shftT,  code => 0x5cf8000000000000, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $ir20, $r39;"o,                  },
+                 ],
+    SHL       => [ { type => $shftT,  code => 0x5c48000000000000, rule => qr"^$pred?SHL(?<W>\.W)? $r0, $r8, $icr20;"o,                    } ],
+    SHR       => [ { type => $shftT,  code => 0x5c29000000000000, rule => qr"^$pred?SHR$u32 $r0, $r8, $icr20;"o,                          } ],
+    XMAD      => [
+                   { type => $x32T,   code => 0x5b00000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $ir20, $r39;"o,                 },
+                   { type => $x32T,   code => 0x5900000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $r39s20, $c20s39;"o,            },
+                   { type => $x32T,   code => 0x5e00000000000000, rule => qr"^$pred?XMAD$xmadc $r0cc, $r8, $c20x, $r39;"o,                  },
+                 ],
+    # XMAD replaces these
+    IMAD      => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMAD[^;]*;"o,   } ], #TODO
+    IMADSP    => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO
+    IMUL      => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMUL[^;]*;"o,   } ], #TODO
+
+    #Conversion Instructions
+    F2F => [ { type => $qtrT,  code => 0x5ca8000000000000, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ],
+    F2I => [ { type => $qtrT,  code => 0x5cb0000000000000, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o,         } ],
+    I2F => [ { type => $qtrT,  code => 0x5cb8000000000000, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o,               } ],
+    I2I => [ { type => $qtrT,  code => 0x5ce0000000000000, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o,               } ],
+
+    #Movement Instructions
+    MOV    => [ { type => $x32T,  code => 0x5c98078000000000, rule => qr"^$pred?MOV $r0, $icr20;"o,                   } ],
+    MOV32I => [ { type => $x32T,  code => 0x010000000000f000, rule => qr"^$pred?MOV32I $r0, (?:$i20w32|$f20w32);"o,   } ],
+    PRMT   => [ { type => $x32T,  code => 0x5bc0000000000000, rule => qr"^$pred?PRMT$prmt $r0, $r8, $icr20, $cr39;"o, } ],
+    SEL    => [ { type => $x32T,  code => 0x5ca0000000000000, rule => qr"^$pred?SEL $r0, $r8, $icr20, $p39;"o,        } ],
+    SHFL   => [ { type => $smemT, code => 0xef10000000000000, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ],
+
+    #Predicate/CC Instructions
+    PSET   => [ { type => $cmpT,  code => 0x5088000000000000, rule => qr"^$pred?PSET$bool2$bool $r0, $p12, $p29, $p39;"o,       } ],
+    PSETP  => [ { type => $cmpT,  code => 0x5090000000000000, rule => qr"^$pred?PSETP$bool2$bool $p3, $p0, $p12, $p29, $p39;"o, } ],
+    CSET   => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o,  } ], #TODO
+    CSETP  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO
+    P2R    => [ { type => $x32T,  code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o,   } ],
+    R2P    => [ { type => $shftT, code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o,   } ],
+
+    #Texture Instructions
+    # Handle the commonly used 1D texture functions.. but save the others for later
+    TLD    => [ { type => $gmemT, code => 0xdd38000000000000, rule => qr"^$pred?TLD\.B\.LZ\.$tld $r0, $r8, $r20, $hex, \dD, $i31w4;"o, } ], #Partial
+    TLDS   => [ { type => $gmemT, code => 0xda0000000ff00000, rule => qr"^$pred?TLDS\.LZ\.$tld $r28, $r0, $r8, $i36w20, \dD, $chnls;"o,} ], #Partial
+    TEX    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o,   } ], #TODO
+    TLD4   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o,  } ], #TODO
+    TXQ    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o,   } ], #TODO
+    TEXS   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEXS[^;]*;"o,  } ], #TODO
+    TLD4S  => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4S[^;]*;"o, } ], #TODO
+
+    #Compute Load/Store Instructions
+    LD     => [ { type => $gmemT, code => 0x8000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr, $p58;"o,      } ],
+    ST     => [ { type => $gmemT, code => 0xa000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0, $p58;"o,      } ],
+    LDG    => [ { type => $gmemT, code => 0xeed0000000000000, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o,           } ],
+    STG    => [ { type => $gmemT, code => 0xeed8000000000000, rule => qr"^$pred?STG$memCache$memType $addr, $r0;"o,           } ],
+    LDS    => [ { type => $smemT, code => 0xef48000000000000, rule => qr"^$pred?LDS$memCache$memType $r0, $addr;"o,           } ],
+    STS    => [ { type => $smemT, code => 0xef58000000000000, rule => qr"^$pred?STS$memCache$memType $addr, $r0;"o,           } ],
+    LDL    => [ { type => $gmemT, code => 0xef40000000000000, rule => qr"^$pred?LDL$memCache$memType $r0, $addr;"o,           } ],
+    STL    => [ { type => $gmemT, code => 0xef50000000000000, rule => qr"^$pred?STL$memCache$memType $addr, $r0;"o,           } ],
+    LDC    => [ { type => $gmemT, code => 0xef90000000000000, rule => qr"^$pred?LDC$memCache$memType $r0, $ldc;"o,            } ],
+    # Note for ATOM(S).CAS operations the last register needs to be in sequence with the second to last (as it's not encoded).
+    ATOM   => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o,   } ],
+    ATOMS  => [ { type => $smemT, code => 0xec00000000000000, rule => qr"^$pred?ATOMS$atom $r0, $addr2, $r20(?:, $r39a)?;"o,  } ],
+    RED    => [ { type => $gmemT, code => 0xebf8000000000000, rule => qr"^$pred?RED$atom $addr2, $r0;"o,                      } ],
+    CCTL   => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o,  } ], #TODO
+    CCTLL  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO
+    CCTLT  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLT[^;]*;"o, } ], #TODO
+
+    #Surface Memory Instructions (haven't gotten to these yet..)
+    SUATOM => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUATOM[^;]*;"o, } ], #TODO
+    SULD   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o,   } ], #TODO
+    SURED  => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SURED[^;]*;"o,  } ], #TODO
+    SUST   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUST[^;]*;"o,   } ], #TODO
+
+    #Control Instructions
+    BRA    => [
+                { type => $x32T, code => 0xe24000000000000f, rule => qr"^$pred?BRA(?<U>\.U)? $i20w24;"o,         },
+                { type => $x32T, code => 0xe240000000000002, rule => qr"^$pred?BRA(?<U>\.U)? CC\.EQ, $i20w24;"o, },
+              ],
+    BRX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o,                      } ], #TODO
+    JMP    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMP[^;]*;"o,                      } ], #TODO
+    JMX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o,                      } ], #TODO
+    SSY    => [ { type => $x32T, code => 0xe290000000000000, rule => qr"^$noPred?SSY $i20w24;"o,                 } ],
+    SYNC   => [ { type => $x32T, code => 0xf0f800000000000f, rule => qr"^$pred?SYNC;"o,                          } ],
+    CAL    => [ { type => $x32T, code => 0xe260000000000040, rule => qr"^$noPred?CAL $i20w24;"o,                 } ],
+    JCAL   => [ { type => $x32T, code => 0xe220000000000040, rule => qr"^$noPred?JCAL $i20w24;"o,                } ],
+    PRET   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o,                     } ], #TODO
+    RET    => [ { type => $x32T, code => 0xe32000000000000f, rule => qr"^$pred?RET;"o,                           } ],
+    BRK    => [ { type => $x32T, code => 0xe34000000000000f, rule => qr"^$pred?BRK;"o,                           } ],
+    PBK    => [ { type => $x32T, code => 0xe2a0000000000000, rule => qr"^$noPred?PBK $i20w24;"o,                 } ],
+    CONT   => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o,                          } ],
+    PCNT   => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o,                } ],
+    EXIT   => [ { type => $x32T, code => 0xe30000000000000f, rule => qr"^$pred?EXIT;"o,                          } ],
+    PEXIT  => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PEXIT[^;]*;"o,                    } ], #TODO
+    BPT    => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o,           } ],
+
+    #Miscellaneous Instructions
+    NOP    => [ { type => $x32T,  code => 0x50b0000000000f00, rule => qr"^$pred?NOP;"o,                                     } ],
+    CS2R   => [ { type => $x32T,  code => 0x50c8000000000000, rule => qr"^$pred?CS2R $r0, $sr;"o,                           } ],
+    S2R    => [ { type => $s2rT,  code => 0xf0c8000000000000, rule => qr"^$pred?S2R $r0, $sr;"o,                            } ],
+    B2R    => [ { type => $x32T,  code => 0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o,                                 } ],
+    BAR    => [ { type => $gmemT, code => 0xf0a8000000000000, rule => qr"^$pred?BAR$bar;"o,                                 } ],
+    DEPBAR => [
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, },
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o,              },
+              ],
+    MEMBAR => [ { type => $x32T,  code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o,                             } ],
+    VOTE   => [ { type => $voteT, code => 0x50d8000000000000, rule => qr"^$pred?VOTE$vote (?:$r0, |(?<nor0>))$p45, $p39;"o, } ],
+    R2B    => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?R2B[^;]*;"o,                                } ], #TODO
+
+    #Video Instructions... Need to finish
+    VADD   => [   { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMAD   => [
+                  { type => $x32T,  code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, },
+                  { type => $shftT, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, },
+              ],
+    VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMNMX    => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+
+    VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+
+    DP4A => [ { type => $x32T,   code => 0x53f8000000000000, rule => qr"^$pred?DP4A$dptype $r0, $r8, $icr20, $r39;"o, } ],
+    DP2A => [ { type => $x32T,   code => 0x53f9000000000000, rule => qr"^$pred?DP2A$dpmode$dptype $r0, $r8, $icr20, $r39;"o, } ],
+);
+
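+# Create map of capture groups to op code flags that need to be added (or removed).
+# Table format: a header line lists the ops the following entries apply to (comma separated),
+# optionally followed by ": name" when the entries are the possible values of the named capture
+# group "name".  Each "0x... key" line under a header gives the op code bits for that capture
+# group (plain header) or for that value of the named capture group ("ops: name" header).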
+my @flags = grep /\S/, split "\n", q{;
+
+BFE, BFI, FLO, IADD, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD
+0x0100000000000000 neg
+
+FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP
+0x0100000000000000 neg
+
+PSET, PSETP
+0x0000000000008000 p12not
+0x0000000100000000 p29not
+
+FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE
+0x0000040000000000 p39not
+
+IADD, IADD3, XMAD, LEA, IMNMX
+0x0000800000000000 CC
+
+IADD32I
+0x0010000000000000 CC
+
+LEA
+0x0000000000000000 X
+
+SHF
+0x0004000000000000 W
+0x0001000000000000 HI
+
+SHF: type
+0x0000004000000000 U64
+0x0000006000000000 S64
+
+SHR, IMNMX, ISETP, ISET, ICMP, BFE
+0x0001000000000000 U32
+
+SHL
+0x0000008000000000 W
+
+SHFL
+0x0000000010000000 i20w8
+0x0000000020000000 i34w13
+
+SHFL: mode
+0x0000000000000000 IDX
+0x0000000040000000 UP
+0x0000000080000000 DOWN
+0x00000000c0000000 BFLY
+
+IMNMX: mode
+0x0000080000000000 XLO
+0x0000180000000000 XHI
+
+ISETP, ISET, ICMP: cmp
+0x0002000000000000 LT
+0x0004000000000000 EQ
+0x0006000000000000 LE
+0x0008000000000000 GT
+0x000a000000000000 NE
+0x000c000000000000 GE
+
+ISETP, ISET, PSETP, PSET: bool
+0x0000000000000000 AND
+0x0000200000000000 OR
+0x0000400000000000 XOR
+
+PSETP, PSET: bool2
+0x0000000000000000 AND
+0x0000000001000000 OR
+0x0000000002000000 XOR
+
+ISETP, ISET
+0x0000080000000000 X
+
+LOP: bool
+0x0000000000000000 AND
+0x0000020000000000 OR
+0x0000040000000000 XOR
+0x0000060000000000 PASS_B
+
+LOP:
+0x0000010000000000 INV
+
+LOP: z
+0x0000200000000000 Z
+0x0000300000000000 NZ
+
+LOP
+0x0007000000000000 noz
+
+LOP32I: bool
+0x0000000000000000 AND
+0x0020000000000000 OR
+0x0040000000000000 XOR
+
+PRMT: mode
+0x0001000000000000 F4E
+0x0002000000000000 B4E
+0x0003000000000000 RC8
+0x0004000000000000 ECL
+0x0005000000000000 ECR
+0x0006000000000000 RC16
+
+XMAD: type1
+0x0000000000000000 U16
+0x0001000000000000 S16
+
+XMAD: type2
+0x0000000000000000 U16
+0x0002000000000000 S16
+
+XMAD: mode
+0x0000002000000000 MRG
+0x0000001000000000 PSL
+0x0008000000000000 CHI
+0x0004000000000000 CLO
+0x000c000000000000 CSFU
+0x0004001000000000 PSL.CLO
+
+XMAD: modec
+0x0004000000000000 CLO
+0x0008000000000000 CHI
+0x000c000000000000 CSFU
+0x0040000000000000 X
+0x0080000000000000 PSL
+0x0100000000000000 MRG
+0x0084000000000000 PSL.CLO
+
+XMAD
+0x0010000000000000 CBCC
+
+XMAD: r8part
+0x0000000000000000 H0
+0x0020000000000000 H1
+
+XMAD: r20part
+0x0000000000000000 H0
+0x0000000800000000 H1
+
+XMAD: r20partx
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+XMAD: r39part
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part
+0x0000000000000000 B0
+0x0000001000000000 B1
+0x0000002000000000 B2
+0x0000003000000000 B3
+0x0000001000000000 H1
+0x0000000000000000 H0
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part
+0x0000000000000000 B0
+0x0000000010000000 B1
+0x0000000020000000 B2
+0x0000000030000000 B3
+0x0000000010000000 H1
+0x0000000000000000 H0
+
+VMAD
+0x0040000000000000 r8neg
+0x0020000000000000 r39neg
+0x0008000000000000 SHR_7
+0x0010000000000000 SHR_15
+0x0060000000000000 PO
+0x0080000000000000 SAT
+
+VMNMX
+0x0100000000000000 MX
+
+VADD, VABSDIFF, VMNMX
+0x0080000000000000 SAT
+0x0040000000000000 UD
+0x0040000000000000 SD
+
+VSET: cmp
+0x0040000000000000 LT
+0x0080000000000000 EQ
+0x00c0000000000000 LE
+0x0100000000000000 GT
+0x0140000000000000 NE
+0x0180000000000000 GE
+
+VADD, VSET: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VABSDIFF: mode
+0x0003000000000000 ACC
+0x000b000000000000 MIN
+0x0013000000000000 MAX
+0x0023000000000000 MRG_16H
+0x002b000000000000 MRG_16L
+0x0033000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x003b000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMNMX: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1
+0x0000000000000000 U
+0x0001000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign2
+0x0000000000000000 U
+0x0002000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size1
+0x0000000000000000 8
+0x0000004000000000 16
+0x0000006000000000 32
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size2
+0x0000000000000000 8
+0x0000000040000000 16
+0x0000000060000000 32
+
+IADD3: type
+0x0001000000000000 X
+0x0000002000000000 RS
+0x0000004000000000 LS
+
+IADD3: r8part
+0x0000000000000000 H0
+0x0000001000000000 H1
+
+IADD3: r20part
+0x0000000080000000 H0
+
+IADD3: r39part
+0x0000000200000000 H0
+
+IADD3
+0x0008000000000000 r8neg
+0x0004000000000000 r20neg
+0x0002000000000000 r39neg
+
+IADD
+0x0000080000000000 X
+0x0004000000000000 SAT
+
+IADD, ISCADD
+0x0002000000000000 r8neg
+0x0001000000000000 r20neg
+
+IADD32I
+0x0100000000000000 r8neg
+0x0020000000000000 X
+
+DEPBAR: SB
+0x0000000000000000 SB0
+0x0000000004000000 SB1
+0x0000000008000000 SB2
+0x000000000c000000 SB3
+0x0000000010000000 SB4
+0x0000000014000000 SB5
+
+DEPBAR: cmp
+0x0000000020000000 LE
+
+DEPBAR
+0x0000000000000001 db0
+0x0000000000000002 db1
+0x0000000000000004 db2
+0x0000000000000008 db3
+0x0000000000000010 db4
+0x0000000000000020 db5
+
+F2F, F2I, I2F, I2I: destWidth
+0x0000000000000000 8
+0x0000000000000100 16
+0x0000000000000200 32
+0x0000000000000300 64
+
+F2F, F2I, I2F, I2I: srcWidth
+0x0000000000000000 8
+0x0000000000000400 16
+0x0000000000000800 32
+0x0000000000000c00 64
+
+F2F, F2I, I2F, I2I: destSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000001000 S
+
+F2F, F2I, I2F, I2I: srcSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000002000 S
+
+F2I, I2F, I2I: r20part
+0x0000000000000000 H0
+0x0000040000000000 H1
+0x0000000000000000 B0
+0x0000020000000000 B1
+0x0000040000000000 B2
+0x0000060000000000 B3
+
+F2F: r20part
+0x0000000000000000 H0
+0x0000020000000000 H1
+
+F2F: round
+0x0000040000000000 ROUND
+0x0000048000000000 FLOOR
+0x0000050000000000 CEIL
+0x0000058000000000 TRUNC
+
+F2I: round
+0x0000000000000000 ROUND
+0x0000008000000000 FLOOR
+0x0000010000000000 CEIL
+0x0000018000000000 TRUNC
+
+HADD2, HMUL2, HFMA2: r8part
+0x0001000000000000 H0_H0
+0x0001800000000000 H1_H1
+0x0000800000000000 F32
+
+HADD2, HMUL2, HFMA2: r20part
+0x0000000020000000 H0_H0
+0x0000000030000000 H1_H1
+
+HFMA2: r39part
+0x0000000800000000 F32
+0x0000001000000000 H0_H0
+0x0000001800000000 H1_H1
+
+HADD2, HMUL2, HFMA2
+0x0000000080000000 r20neg
+0x0000000040000000 r39neg
+
+HADD2, HMUL2, HFMA2: mode
+0x0002000000000000 F32
+0x0004000000000000 MRG_H0
+0x0006000000000000 MRG_H1
+
+HADD2, HMUL2
+0x0000008000000000 FTZ
+
+HFMA2
+0x0000002000000000 FTZ
+
+HFMA2
+0x0000004000000000 FMZ
+
+HADD2, HMUL2, HFMA2
+0x0000000100000000 SAT
+
+FADD, DADD, FMUL, DMUL, F2F, I2F: rnd
+0x0000000000000000 RN
+0x0000008000000000 RM
+0x0000010000000000 RP
+0x0000018000000000 RZ
+
+DFMA: rnd
+0x0000000000000000 RN
+0x0004000000000000 RM
+0x0008000000000000 RP
+0x000c000000000000 RZ
+
+FFMA: rnd
+0x0000000000000000 RN
+0x0008000000000000 RM
+0x0010000000000000 RP
+0x0018000000000000 RZ
+
+FFMA
+0x0020000000000000 FTZ
+
+F2F, F2I, FADD, FMUL, FMNMX
+0x0000100000000000 FTZ
+
+FADD32I
+0x0080000000000000 FTZ
+
+FMUL32I
+0x0020000000000000 FTZ
+
+FSET
+0x0080000000000000 FTZ
+
+FSETP, FCMP
+0x0000800000000000 FTZ
+
+FADD, FFMA, FMUL, F2F, I2I
+0x0004000000000000 SAT
+
+FADD, DADD, FMNMX, DMNMX, MUFU
+0x0001000000000000 r8neg
+
+FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I
+0x0000200000000000 r20neg
+
+FMUL, DMUL, FFMA, DFMA
+0x0001000000000000 r20neg
+
+FFMA, DFMA
+0x0002000000000000 r39neg
+
+FADD, DADD, FMNMX, DMNMX
+0x0000400000000000 r8abs
+
+FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I
+0x0002000000000000 r20abs
+
+FSETP, DSETP, FSET, DSET
+0x0000080000000000 r8neg
+0x0000000000000040 r20neg
+0x0000000000000080 r8abs
+0x0000100000000000 r20abs
+
+RRO: func
+0x0000000000000000 SINCOS
+0x0000008000000000 EX2
+
+MUFU: func
+0x0000000000000000 COS
+0x0000000000100000 SIN
+0x0000000000200000 EX2
+0x0000000000300000 LG2
+0x0000000000400000 RCP
+0x0000000000500000 RSQ
+0x0000000000600000 RCP64H
+0x0000000000700000 RSQ64H
+
+FSETP, DSETP, FSET, DSET, FCMP: cmp
+0x0001000000000000 .LT
+0x0002000000000000 .EQ
+0x0003000000000000 .LE
+0x0004000000000000 .GT
+0x0004000000000000
+0x0005000000000000 .NE
+0x0006000000000000 .GE
+0x0007000000000000 .NUM
+0x0008000000000000 .NAN
+0x0009000000000000 .LTU
+0x000a000000000000 .EQU
+0x000b000000000000 .LEU
+0x000c000000000000 .GTU
+0x000d000000000000 .NEU
+0x000e000000000000 .GEU
+
+FSETP, DSETP, FSET, DSET: bool
+0x0000000000000000 AND
+0x0000200000000000 OR
+0x0000400000000000 XOR
+
+HSETP2: cmp
+0x0000002800000000 .NE
+
+HSETP2: bool
+0x0000000000000000 AND
+
+S2R: sr
+0x0000000000000000 LANEID
+0x0000000000200000 VIRTCFG
+0x0000000000300000 VIRTID
+0x0000000002100000 TID.X
+0x0000000002200000 TID.Y
+0x0000000002300000 TID.Z
+0x0000000002500000 CTAID.X
+0x0000000002600000 CTAID.Y
+0x0000000002700000 CTAID.Z
+0x0000000003800000 EQMASK
+0x0000000003900000 LTMASK
+0x0000000003a00000 LEMASK
+0x0000000003b00000 GTMASK
+0x0000000003c00000 GEMASK
+
+CS2R: sr
+0x0000000005000000 CLOCKLO
+0x0000000005100000 CLOCKHI
+0x0000000005200000 GLOBALTIMERLO
+0x0000000005300000 GLOBALTIMERHI
+
+B2R
+0x0000e00000000000 nop45
+
+BAR
+0x0000100000000000 i8w4
+0x0000080000000000 nor20
+0x0000038000000000 nop39
+
+BAR: mode
+0x0000000000000000 SYNC
+0x0000000100000000 ARV
+0x0000000200000000 RED
+
+BAR: red
+0x0000000000000000 POPC
+0x0000000800000000 AND
+0x0000001000000000 OR
+
+MEMBAR: mode
+0x0000000000000000 CTA
+0x0000000000000100 GL
+0x0000000000000200 SYS
+
+VOTE: mode
+0x0000000000000000 ALL
+0x0001000000000000 ANY
+0x0002000000000000 EQ
+
+VOTE
+0x00000000000000ff nor0
+
+BRA
+0x0000000000000080 U
+
+TLDS: chnls
+0x0010000000000000 RGBA
+
+TLDS
+0x0002000000000000 NODEP
+
+LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS
+0x000000000000ff00 nor8
+
+LD, ST: type
+0x0000000000000000 .U8
+0x0020000000000000 .S8
+0x0040000000000000 .U16
+0x0060000000000000 .S16
+0x0080000000000000
+0x0080000000000000 .32
+0x00a0000000000000 .64
+0x00c0000000000000 .128
+
+LD, ST: cache
+0x0100000000000000 CG
+0x0200000000000000 CS
+0x0300000000000000 CV
+0x0300000000000000 WT
+
+LDG, STG, LDS, STS, LDL, STL, LDC: type
+0x0000000000000000 .U8
+0x0001000000000000 .S8
+0x0002000000000000 .U16
+0x0003000000000000 .S16
+0x0004000000000000
+0x0004000000000000 .32
+0x0005000000000000 .64
+0x0006000000000000 .128
+
+LDG, STG: cache
+0x0000400000000000 CG
+0x0000800000000000 CI
+0x0000800000000000 CS
+0x0000c00000000000 CV
+0x0000c00000000000 WT
+
+LDL: cache
+0x0000200000000000 CI
+
+LDC: cache
+0x0000100000000000 IL
+
+LDG, STG, LDS, STS, LDL, STL, LDC
+0x0000200000000000 E
+
+LDS
+0x0000100000000000 U
+
+RED: type
+0x0000000000000000
+0x0000000000100000 .S32
+0x0000000000200000 .U64
+0x0000000000300000 .F32.FTZ.RN
+0x0000000000400000 .F16x2.FTZ.RN
+0x0000000000500000 .S64
+
+RED: mode
+0x0000000000000000 ADD
+0x0000000000800000 MIN
+0x0000000001000000 MAX
+0x0000000001800000 INC
+0x0000000002000000 DEC
+0x0000000002800000 AND
+0x0000000003000000 OR
+0x0000000003800000 XOR
+
+ATOM: type
+0x0000000000000000
+0x0002000000000000 .S32
+0x0004000000000000 .U64
+0x0006000000000000 .F32.FTZ.RN
+0x0008000000000000 .F16x2.FTZ.RN
+0x000a000000000000 .S64
+0x0002000000000000 .64
+
+ATOM, RED
+0x0001000000000000 E
+
+ATOM: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x03f0000000000000 CAS
+
+ATOMS: type
+0x0000000000000000
+0x0000000010000000 .S32
+0x0000000020000000 .U64
+0x0000000030000000 .S64
+0x0010000000000000 .64
+
+ATOMS: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x0240000000000000 CAS
+
+DP4A, DP2A: type1
+0x0000000000000000 U32
+0x0002000000000000 S32
+
+DP4A, DP2A: type2
+0x0000000000000000 U32
+0x0000800000000000 S32
+
+DP2A: mode
+0x0000000000000000 LO
+0x0004000000000000 HI
+};
+
+# Build %flags from the @flags table.
+# The existence of a capture group can map directly to an op code adjustment:
+#     $flags{op}{capture}        = bits
+# or the value of a named capture group can select the adjustment from among several options:
+#     $flags{op}{name}{value}    = bits
+our %flags;
+my (@ops, $flag);
+foreach my $entry (@flags)
+{
+    my ($hexText, $key) = $entry =~ m'^(0x[0-9a-z]+)\s*(.*)';
+
+    # Anything that is not a "0x... key" line is a header: "op, op, ..." or "op, op, ...: name".
+    # It replaces the current op list and the current group name (undef when there is none).
+    unless (defined $hexText)
+    {
+        (my $opList, $flag) = split ':\s*', $entry;
+        @ops = split ',\s*', $opList;
+        next;
+    }
+
+    my $val = hex($hexText);
+    foreach my $op (@ops)
+    {
+        # named rules (op: name)
+        if ($flag)
+        {
+            $flags{$op}{$flag}{$key} = $val;
+        }
+        # simple existence check rules
+        else
+        {
+            $flags{$op}{$key} = $val;
+        }
+    }
+}
+
+# Match an instruction string against one grammar entry.
+#   $inst    - instruction text
+#   $grammar - hash ref whose {rule} is the regex to match against
+# Returns a hash ref holding a copy of the named captures on a match, nothing otherwise.
+sub parseInstruct
+{
+    my ($inst, $grammar) = @_;
+    my $rule = $grammar->{rule};
+
+    if ($inst =~ $rule)
+    {
+        # copy %+ now: it only reflects this match until the next successful one
+        my %named = %+;
+        return \%named;
+    }
+    return;
+}
+
+# Lookup tables used when turning captures into op code bits.
+# For immediate or constant operands and a given opcode, bits 56-63 get transformed:
+#   %immedOps   - capture names that denote an immediate operand
+#   %immedCodes - keyed by the base op code's top byte; the value is XORed into that byte
+#   %constCodes - keyed by constant-operand capture name; the value is XORed into the top byte
+my %immedOps = (i20 => 1, f20 => 1, d20 => 1);
+my %immedCodes = (0x5c => 0x64, 0x5b => 0x6d, 0x59 => 0x6b, 0x58 => 0x68);
+my %constCodes = (c20 => 0x10, c39 => 0x08);
+
+# bit contributed to the reuse code by each reuse capture
+my %reuseCodes;
+@reuseCodes{qw(reuse1 reuse2 reuse3)} = (1, 2, 4);
+
+# Pick out the reuse code from the capture data and nothing else.
+#   $capData - hash ref of captures; any true reuse1/reuse2/reuse3 entry contributes its bit
+# Returns the OR of the bits of all reuse captures present.
+sub genReuseCode
+{
+    my ($capData) = @_;
+
+    my $reuse = 0;
+    foreach my $slot (keys %reuseCodes)
+    {
+        next unless $capData->{$slot};
+        $reuse |= $reuseCodes{$slot};
+    }
+    return $reuse;
+}
+
+# Generate an op code from regex capture data.
+#   $op      - instruction name; selects this op's entry in %flags
+#   $grammar - grammar entry; only its {code} (the base op code) is read here
+#   $capData - hash ref of named captures.  NOTE: the predicate keys (noPred, predNum, predNot)
+#              and the reuse keys (reuse1..reuse3) are deleted from it as they are consumed.
+#   $test    - optional array ref; if you pass it in, it is populated with the names of the
+#              capture groups that were acted upon (and the UNUSED warning below is suppressed)
+# Returns the list ($code, $reuse).
+# All adjustments are applied with XOR, so a flag can clear bits preset in the base code.
+sub genCode
+{
+    my ($op, $grammar, $capData, $test) = @_;
+
+    my $flags     = $flags{$op};
+    my $code      = $grammar->{code};
+    my $reuse     = 0;
+    # immediate-form adjustment for this op, looked up by the top byte of the base code
+    # (undef when the op has no entry in %immedCodes)
+    my $immedCode = $immedCodes{$code >> 56};
+
+    #print map "$_: $capData->{$_}\n", keys %capData if $op eq 'I2I';
+
+    # process the instruction predicate (if valid for this instruction)
+    if (exists $capData->{noPred})
+    {
+        delete $capData->{noPred};
+        push @$test, 'noPred' if $test;
+    }
+    else
+    {
+        # predicate number defaults to 7 when none was captured
+        # NOTE(review): 7 is presumably the always-true predicate - confirm
+        my $p = defined($capData->{predNum}) ? $capData->{predNum} : 7;
+        push @$test, 'predNum' if $test;
+        if (exists $capData->{predNot})
+        {
+            # bit 3 of the predicate field marks a negated predicate
+            $p |= 8;
+            push @$test, 'predNot' if $test;
+        }
+        # the 4-bit predicate field sits at bits 16-19
+        $code ^= $p << 16;
+        delete @{$capData}{qw(predNum predNot)};
+
+    }
+    # process the register reuse flags
+    foreach my $rcode (qw(reuse1 reuse2 reuse3))
+    {
+        # the key is removed either way; only a true value contributes its bit
+        if (delete $capData->{$rcode})
+        {
+            $reuse |= $reuseCodes{$rcode};
+            push @$test, $rcode if $test;
+        }
+    }
+
+    # everything still left in $capData is an operand and/or a flag
+    foreach my $capture (keys %$capData)
+    {
+        # change the base code for immediate versions of the op
+        if (exists $immedOps{$capture})
+            { $code ^= $immedCode << 56; }
+        # change the base code for constant versions of the op
+        elsif (exists $constCodes{$capture})
+            { $code ^= $constCodes{$capture} << 56; }
+
+        # if capture group is an operand then process and add that data to code
+        if (exists $operands{$capture})
+        {
+            # don't process the r20 that comes with the r39s20 capture
+            unless ($capture eq 'r20' && exists $capData->{r39s20})
+            {
+                # %operands maps the capture name to a code ref that returns the bits to XOR in
+                $code ^= $operands{$capture}->($capData->{$capture});
+                push @$test, $capture if $test;
+            }
+        }
+
+        # Add matching flags (an operand might also add/remove a flag)
+        if (exists $flags->{$capture})
+        {
+            # a named multivalue flag
+            if (ref $flags->{$capture})
+            {
+                # the captured text selects which bit pattern to apply
+                $code ^= $flags->{$capture}{$capData->{$capture}};
+                push @$test, "$capture:$capData->{$capture}" if $test;
+            }
+            # a simple exists flag
+            else
+            {
+                $code ^= $flags->{$capture};
+                push @$test, $capture if $test;
+            }
+        }
+        elsif (!exists $operands{$capture} && !$test)
+        {
+            # Every capture group should be acted upon.  Missing one is a bug.
+            warn "UNUSED: $op: $capture: $capData->{$capture}\n";
+            warn Dumper($flags);
+        }
+    }
+
+    return $code, $reuse;
+}
+
+
+# Control notation that prefixes an assembly line: five ':'-separated fields which readCtrl()
+# splits, in order, into wait mask (two hex digits or dashes), read barrier, write barrier,
+# yield flag and stall count.
+my $CtrlRe = qr'(?<ctrl>[0-9a-fA-F\-]{2}:[1-6\-]:[1-6\-]:[\-yY]:[0-9a-fA-F])';
+# Instruction predicate such as "@P0 " or "@!P0 "; the trailing whitespace is part of <pred>.
+my $PredRe = qr'(?<pred>@!?(?<predReg>P\d)\s+)';
+# One instruction: optional predicate, the op name, then everything up to and including ';'.
+my $InstRe = qr"$PredRe?(?<op>\w+)(?<rest>[^;]*;)"o;
+# Whatever follows the instruction on the same line.
+my $CommRe = qr'(?<comment>.*)';
+
+# Parse one line of assembly source: control notation, whitespace, instruction, trailing text.
+#   $line    - the source line
+#   $lineNum - stored unchanged in the result
+# Returns a hash ref with lineNum, pred, predReg, space, op, comment, inst (the instruction
+# with normalized spacing) and ctrl (the control notation converted by readCtrl), or undef
+# when the line is not in that form.
+sub processAsmLine
+{
+    my ($line, $lineNum) = @_;
+
+    return undef unless $line =~ m"^$CtrlRe(?<space>\s+)$InstRe$CommRe"o;
+
+    # snapshot the named captures before calling helpers that run their own matches
+    my %cap = %+;
+
+    my %parsed = (lineNum => $lineNum);
+    @parsed{qw(pred predReg space op comment)} = @cap{qw(pred predReg space op comment)};
+    $parsed{inst} = normalizeSpacing($cap{pred} . $cap{op} . $cap{rest});
+    $parsed{ctrl} = readCtrl($cap{ctrl}, $line);
+
+    return \%parsed;
+}
+
+# Parse one instruction line of a SASS listing of the form
+#     /*<hex address>*/  <instruction>;  /* 0x<hex op code> ...
+# Returns a hash ref with num (address as a number), pred, op, ins (instruction without the
+# predicate), inst (instruction with the predicate) and code (op code as a number), or undef
+# when the line is not in that form.
+sub processSassLine
+{
+    my ($line) = @_;
+
+    return undef unless $line =~ m"^\s+/\*(?<num>[0-9a-f]+)\*/\s+$InstRe\s+/\* (?<code>0x[0-9a-f]+)"o;
+
+    # snapshot the named captures before calling helpers that run their own matches
+    my %cap = %+;
+    my $bare = $cap{op} . $cap{rest};
+
+    return {
+        num  => hex($cap{num}),
+        pred => $cap{pred},
+        op   => $cap{op},
+        ins  => normalizeSpacing($bare),
+        inst => normalizeSpacing($cap{pred} . $bare),
+        code => hex($cap{code}),
+    };
+}
+
+# Unpack the 64-bit control word found on a line of the form "   /* 0x<hex> ...".
+#   $line - the listing line
+#   $ctrl - optional array ref; receives the three 17-bit control fields (bits 0, 21 and 42 up)
+#   $ruse - optional array ref; receives the three 4-bit reuse fields (bits 17, 38 and 59 up)
+# Returns 1 when the line held a control word, 0 otherwise (nothing is pushed in that case).
+sub processSassCtrlLine
+{
+    my ($line, $ctrl, $ruse) = @_;
+
+    my ($hexText) = $line =~ m'^\s+\/\* (0x[0-9a-f]+)';
+    return 0 unless defined $hexText;
+
+    my $code = hex($hexText);
+
+    if (ref $ctrl)
+    {
+        push @$ctrl, ($code >> $_) & 0x1ffff foreach (0, 21, 42);
+    }
+    if (ref $ruse)
+    {
+        push @$ruse, ($code >> $_) & 0xf foreach (17, 38, 59);
+    }
+    return 1;
+}
+
+# Expand the XMAD macro forms (XMAD.LO, XMAD.LO2, XMAD.LO2C) found in $file into sequences of
+# plain XMAD instructions.
+#   $file - the assembly text as a single string
+# Returns the rewritten text.  Every generated line repeats the control notation, spacing and
+# predicate of the macro line; the macro's trailing text is kept on the first generated line.
+# Dies when the destination and the first operand of a macro are the same register.
+# NOTE(review): $hex and $immed are patterns defined elsewhere in this file.
+sub replaceXMADs
+{
+    my $file = shift;
+
+    # XMAD.LO d, a, b, c, x;
+    # ----------------------
+    # XMAD.MRG x, a, b.H1, RZ;
+    # XMAD d, a, b, c;
+    # XMAD.PSL.CBCC d, a.H1, x.H1, d;
+    # sprintf arguments: 1=ctrl 2=space 3=pred 4=d 5=a 6=b 7=c 8=x 9=comment
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD\.LO\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<x>\w+)\s*;$CommRe/
+
+        die "XMAD.LO: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s
+%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s;
+%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;',
+                @+{qw(ctrl space pred d a b c x comment)}
+    /egmos;
+
+    # XMAD[.x16.x16].LO2 d, a, b, c;      (b: immediate or word; e.g. 0xffff)
+    # ----------------------
+    # XMAD[.x16.x16] d, a, b, c;
+    # XMAD[.x16.x16].PSL d, a.H1, b, d;
+    # The optional pair of .S16/.U16 type suffixes is carried over to both instructions.
+    # sprintf arguments: 1=ctrl 2=space 3=pred 4=d 5=a 6=b 7=c 8=comment 9=mod
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>-?$immed|\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/
+
+        die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    # XMAD[.x16.x16].LO2C d, a, b, c;     (b: constant c[..][..] or word)
+    # ----------------------
+    # XMAD[.x16.x16] d, a, b, c;
+    # XMAD[.x16.x16].PSL d, a, b.H1, d;
+    # Same as LO2 except that the .H1 selector goes on b instead of a.
+    # sprintf arguments: 1=ctrl 2=space 3=pred 4=d 5=a 6=b 7=c 8=comment 9=mod
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<c>\w+)\s*;$CommRe/
+
+        die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    #TODO: add more XMAD macros
+    return $file;
+}
+# Reduce an instruction string to single spacing to keep our regexes simpler:
+# every tab becomes a space, then each run of two or more whitespace characters
+# collapses to one space.  Returns the normalized copy.
+sub normalizeSpacing
+{
+    my ($inst) = @_;
+    $inst =~ tr/\t/ /;
+    $inst =~ s/\s\s+/ /g;
+    return $inst;
+}
+
+
+# Map a binary control code onto the easier to work with text notation
+# "wait:read:write:yield:stall" (the inverse of readCtrl).
+#   bits 0-3   stall count, printed as one hex digit
+#   bit  4     yield flag, printed as 'Y' when clear and '-' when set
+#   bits 5-7   write dependency barrier, printed 1-based, '-' when 7
+#   bits 8-10  read dependency barrier, printed 1-based, '-' when 7
+#   bits 11-16 wait-on-dependency-barrier mask, two hex digits, '--' when zero
+sub printCtrl
+{
+    my ($code) = @_;
+
+    my $stall = $code & 0xf;
+    my $yield = ($code >> 4)  & 0x1;
+    my $wrtdb = ($code >> 5)  & 0x7;
+    my $readb = ($code >> 8)  & 0x7;
+    my $watdb = ($code >> 11) & 0x3f;
+
+    my @fields = (
+        ($watdb      ? sprintf('%02x', $watdb) : '--'),
+        ($readb == 7 ? '-' : $readb + 1),
+        ($wrtdb == 7 ? '-' : $wrtdb + 1),
+        ($yield      ? '-' : 'Y'),
+        sprintf('%x', $stall),
+    );
+    return join ':', @fields;
+}
+# Convert the text control notation "wait:read:write:yield:stall" into its binary control code
+# (the inverse of printCtrl).
+#   $ctrl    - the notation, e.g. "--:-:-:Y:1"
+#   $context - text appended to the error message
+# Dies when the wait mask does not fit in six bits.
+sub readCtrl
+{
+    my ($ctrl, $context) = @_;
+    my @field = split ':', $ctrl;
+
+    my $watdb = $field[0] eq '--' ? 0 : hex($field[0]);
+    my $readb = $field[1] eq '-'  ? 7 : $field[1] - 1;   # barriers are written 1-based
+    my $wrtdb = $field[2] eq '-'  ? 7 : $field[2] - 1;
+    my $yield = lc($field[3]) eq 'y' ? 0 : 1;            # the bit is clear when yielding
+    my $stall = hex($field[4]);
+
+    if ($watdb != ($watdb & 0x3f))
+    {
+        die sprintf('wait dep out of range(0x00-0x3f): %x at %s', $watdb, $context);
+    }
+
+    return ($watdb << 11) | ($readb << 8) | ($wrtdb << 5) | ($yield << 4) | ($stall << 0);
+}
+
+# Resolve a register name through a register map.
+#   $regMap  - hash ref of name => mapped value
+#   $regName - name to look up
+# Returns the mapped value when the name is in the map and maps to a plain (non-reference)
+# value; otherwise returns $regName unchanged.
+sub getRegNum
+{
+    my ($regMap, $regName) = @_;
+
+    return $regName unless exists $regMap->{$regName};
+
+    my $mapped = $regMap->{$regName};
+    return ref($mapped) ? $regName : $mapped;
+}
+
+# List the registers covered by the r0 operand of an instruction.
+#   $vectors - hash ref of vector name => array ref of member register names
+#   $capData - capture data; reads r0, type and i31w4
+# Returns nothing when r0 is missing or is RZ.  A type of '.64' (or i31w4 of '0x3') yields two
+# registers, '.128' (or i31w4 of '0xf') yields four, anything else yields r0 itself.  A numbered
+# register Rn expands to Rn, Rn+1, ...; any other name must be a vector in $vectors (with
+# exactly four members in the 128 bit case) or the sub confesses.
+sub getVecRegisters
+{
+    my ($vectors, $capData) = @_;
+
+    my $regName = $capData->{r0};
+    return unless $regName;
+    return if $regName eq 'RZ';
+
+    my $count = ($capData->{type} eq '.64'  || $capData->{i31w4} eq '0x3') ? 2
+              : ($capData->{type} eq '.128' || $capData->{i31w4} eq '0xf') ? 4
+              :                                                              1;
+    return $regName if $count == 1;
+
+    # a numbered register stands for the consecutive run starting at that number
+    if ($regName =~ m'^R(\d+)$')
+    {
+        my $first = $1;
+        return map { "R$_" } ($first .. $first + $count - 1);
+    }
+
+    if ($count == 2)
+    {
+        confess "$regName not a 64bit vector register" unless exists $vectors->{$regName};
+        return @{$vectors->{$regName}}[0,1];
+    }
+
+    confess "$regName not a 128bit vector register"
+        unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4;
+    return @{$vectors->{$regName}};
+}
+
+# List the registers covered by the r8 (address) operand of an instruction.
+#   $vectors - hash ref of vector name => array ref of member register names
+#   $capData - capture data; reads r8 and checks for the existence of the E capture
+# Returns nothing when r8 is missing or is RZ, and r8 itself when there is no E capture.
+# With E the address takes two registers: Rn expands to Rn, Rn+1; any other name must be a
+# vector in $vectors, otherwise $vectors is dumped to STDOUT and the sub confesses.
+sub getAddrVecRegisters
+{
+    my ($vectors, $capData) = @_;
+
+    my $regName = $capData->{r8};
+    return unless $regName;
+    return if $regName eq 'RZ';
+
+    return $regName unless exists $capData->{E};
+
+    if ($regName =~ m'^R(\d+)$')
+    {
+        my $first = $1;
+        return map { "R$_" } ($first .. $first + 1);
+    }
+
+    unless (exists $vectors->{$regName})
+    {
+        print Dumper($vectors);
+        confess "$regName not a 64bit vector register";
+    }
+    return @{$vectors->{$regName}}[0,1];
+}
+
+__END__
+
+
+
diff --git a/Assembler/MaxAs/microbench/microbench.cpp b/Assembler/MaxAs/microbench/microbench.cpp
new file mode 100644
index 0000000..7b0187a
--- /dev/null
+++ b/Assembler/MaxAs/microbench/microbench.cpp
@@ -0,0 +1,212 @@
+// microbench.cpp : Defines the entry point for the console application.
+//
+
+// nvcc -l cuda -o microbench microbench.cpp
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <cuda.h>
+#include <cudaProfiler.h>
+
+CUcontext hContext = 0;
+
+// Evaluate a CUDA driver API expression once. If it does not return
+// CUDA_SUCCESS, print the source location, the failing expression, the status
+// code and its description, destroy the global context hContext if one is
+// set, and terminate the process with EXIT_FAILURE.
+// cuGetErrorString can fail and leave no usable string, so a fallback
+// description is substituted rather than passing a bad pointer to printf.
+#define CUDA_CHECK( fn ) do { \
+		CUresult status = (fn); \
+		if ( CUDA_SUCCESS != status ) { \
+			const char* errstr = NULL; \
+			if ( CUDA_SUCCESS != cuGetErrorString(status, &errstr) || !errstr ) \
+				errstr = "unknown error"; \
+			printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \
+			if (hContext) cuCtxDestroy(hContext); \
+			exit(EXIT_FAILURE); \
+		} \
+	} while (0)
+
+
+// Host driver for the "microbench" kernel loaded from microbench.cubin.
+//
+// Command line (all arguments optional, positional):
+//   argv[1]  "i" selects internal timing (per-warp clock values are printed);
+//            anything else selects external timing via CUDA events.
+//   argv[2]  number of blocks (values below 1 become 1).
+//   argv[3]  threads per block (values outside 32..1024 become 128; the
+//            result is rounded down to a multiple of 32).
+//   argv[4]  internal timing: lanes printed per warp (1..32, else 1);
+//            external timing: floating point operation count of one launch.
+//   argv[5]  number of warm-up launches (1..1000, else 1).
+//
+// Returns 0 on success. Exits with EXIT_FAILURE when no suitable device is
+// found, when host allocation fails, or when a driver call fails (CUDA_CHECK).
+int main(int argc, char* argv[])
+{
+	//int iTest = 2896;
+	//while (iTest < 0x7fff)
+	//{
+	//	int iResult = iTest * iTest;
+	//	float fTest = (float)iTest;
+	//	int fResult = (int)(fTest * fTest);
+
+	//	printf("i*i:%08x f*f:%08x\n", iResult, fResult);
+
+	//	iTest += 0x0800;
+	//}
+	//exit(0);
+
+	char deviceName[32];
+	int devCount, ordinal, major, minor;
+	CUdevice  hDevice;
+
+	// Initialize the Driver API and find a device
+	CUDA_CHECK( cuInit(0) );
+	CUDA_CHECK( cuDeviceGetCount(&devCount) );
+	for (ordinal = 0; ordinal < devCount; ordinal++)
+	{
+		CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) );
+		CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) );
+		CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) );
+		CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) );
+		// Compare the capability as a (major, minor) pair: testing the two
+		// fields independently would reject e.g. 6.0 because its minor is 0.
+		if (major > 5 || (major == 5 && minor >= 2))
+		{
+			printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor);
+			break;
+		}
+	}
+	if (ordinal == devCount)
+	{
+		printf("No compute 5.2 device found, exiting.\n");
+		exit(EXIT_FAILURE);
+	}
+
+	// First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing
+	int internalTiming = 1;
+	if (argc > 1)
+		internalTiming = strcmp(argv[1], "i") == 0 ? 1 : 0;
+
+	// Second command line arg is the number of blocks
+	int blocks = 1;
+	if (argc > 2)
+		blocks = atoi(argv[2]);
+	if (blocks < 1)
+		blocks = 1;
+
+	// Third command line arg is the number of threads
+	int threads = 128;
+	if (argc > 3)
+		threads = atoi(argv[3]);
+	if (threads > 1024 || threads < 32)
+		threads = 128;
+	// Round down to a whole number of 32-thread warps
+	threads &= -32;
+
+	// Fourth command line arg:
+	double fops = 1.0;
+	int lanes = 1;
+	if (argc > 4)
+	{
+		if (internalTiming)
+		{
+			// The number of lanes to print for each warp
+			lanes = atoi(argv[4]);
+			if (lanes > 32 || lanes < 1)
+				lanes = 1;
+		}
+		else
+			// The number of floating point operations in a full kernel launch
+			fops = atof(argv[4]);
+	}
+
+	// Fifth command line arg is the repeat count for benchmarking
+	int repeat = 1;
+	if (argc > 5)
+		repeat = atoi(argv[5]);
+	if (repeat > 1000 || repeat < 1)
+		repeat = 1;
+
+	// threads = total number of threads
+	size_t size = sizeof(int) * threads * blocks;
+
+	// Setup our input and output buffers
+	int* dataIn  = (int*)malloc(size);
+	int* dataOut = (int*)malloc(size);
+	int* clocks  = (int*)malloc(size);
+	if (!dataIn || !dataOut || !clocks)
+	{
+		// Bail out before memset / the device copies touch a null buffer
+		printf("Failed to allocate %lu bytes of host memory, exiting.\n", (unsigned long)size);
+		free(dataIn); free(dataOut); free(clocks);
+		exit(EXIT_FAILURE);
+	}
+	memset(dataIn, 0, size);
+
+	CUmodule hModule;
+	CUfunction hKernel;
+	CUevent hStart, hStop;
+	CUdeviceptr devIn, devOut, devClocks;
+
+	// Init our context and device memory buffers
+	CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
+	CUDA_CHECK( cuMemAlloc(&devIn, size) );
+	CUDA_CHECK( cuMemAlloc(&devOut, size) );
+	CUDA_CHECK( cuMemAlloc(&devClocks, size) );
+	CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) );
+	CUDA_CHECK( cuMemsetD8(devOut, 0, size) );
+	CUDA_CHECK( cuMemsetD8(devClocks, 0, size) );
+
+	CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) );
+	CUDA_CHECK( cuEventCreate(&hStop,  CU_EVENT_BLOCKING_SYNC) );
+
+	// Load our kernel
+	CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") );
+	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") );
+
+	// Setup the params
+	void* params[] = { &devOut, &devClocks, &devIn };
+	float ms = 0;
+
+	// Warm up the clock (unless under nsight)
+	if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER
+		for (int i = 0; i < repeat; i++)
+			CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
+
+	// Launch the kernel
+	CUDA_CHECK( cuEventRecord(hStart, NULL) );
+	//CUDA_CHECK( cuProfilerStart() );
+	CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
+	//CUDA_CHECK( cuProfilerStop() );
+	CUDA_CHECK( cuEventRecord(hStop, NULL) );
+	CUDA_CHECK( cuEventSynchronize(hStop) );
+	CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) );
+
+	//CUDA_CHECK( cuCtxSynchronize() );
+
+	// Get back our results from each kernel
+	CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) );
+	CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) );
+
+	// Cleanup and shutdown of cuda
+	CUDA_CHECK( cuEventDestroy(hStart) );
+	CUDA_CHECK( cuEventDestroy(hStop) );
+	CUDA_CHECK( cuModuleUnload(hModule) );
+	CUDA_CHECK( cuMemFree(devIn) );
+	CUDA_CHECK( cuMemFree(devOut) );
+	CUDA_CHECK( cuMemFree(devClocks) );
+	CUDA_CHECK( cuCtxDestroy(hContext) );
+	hContext = 0;
+
+	// When using just one block, print out the internal timing data
+	if (internalTiming)
+	{
+		int count = 0, total = 0, min = 999999, max = 0;
+
+		int* clocks_p  = clocks;
+		int* dataOut_p = dataOut;
+
+		// Loop over and print results
+		for (int blk = 0; blk < blocks; blk++)
+		{
+			for(int tid = 0; tid < threads; tid += 32)
+			{
+				// Sometimes we want data on each thread, sometimes just one sample per warp is fine
+				for (int lane = 0; lane < lanes; lane++)
+					printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u
+
+				// Statistics use the first lane of each warp only
+				count++;
+				total += clocks_p[tid];
+				if (clocks_p[tid] < min) min = clocks_p[tid];
+				if (clocks_p[tid] > max) max = clocks_p[tid];
+			}
+			clocks_p  += threads;
+			dataOut_p += threads;
+		}
+		printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max);
+	}
+	else
+	{
+		// For more than one block we're testing throughput and want external timing data
+		printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0));
+	}
+	// And free up host memory
+	free(dataIn); free(dataOut); free(clocks);
+
+	return 0;
+}
diff --git a/Assembler/MaxAs/microbench/microbench.cu b/Assembler/MaxAs/microbench/microbench.cu
new file mode 100644
index 0000000..7d4cd8f
--- /dev/null
+++ b/Assembler/MaxAs/microbench/microbench.cu
@@ -0,0 +1,69 @@
+
+// Note this file isn't configured to automatically compile
+
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+
+// Build:
+// nvcc -l cuda -o microbench microbench.cpp
+// nvcc -arch sm_50 -cubin microbench.cu
+
+// Inspect a cubin (use nvdisasm from cuda 6.5 for best results):
+// maxas.pl -e microbench.cubin
+
+// Insert new sass into cubin
+// maxas.pl -i microbench.sass microbench.cubin
+
+// run it:
+// ./microbench
+
+// Use extern C so C++ doesn't mangle our kernel name
+//
+// Placeholder kernel with the (out, clocks, in) parameter list that the host
+// program passes. Per the build notes in this file, the compiled cubin is
+// meant to have hand-written SASS inserted with maxas.pl, so this body mainly
+// provides the kernel's name, parameters and shared-memory declaration.
+//
+//   out    - receives, per thread, the shared-memory value written by the
+//            thread whose index differs in the lowest bit
+//   clocks - receives a value packed from the two clock() samples
+//   in     - one element is read, selected by the block indices
+//
+// All stores are indexed by threadIdx.x only.
+extern "C" __global__ void  microbench(int *out, int *clocks, int *in)
+{
+    // One int slot per thread; sized for up to 1024 threads in a block
+    __shared__ int share[1024];
+
+    int tid = threadIdx.x;
+    int bx  = blockIdx.x;
+    int by  = blockIdx.y;
+
+    // First clock sample, taken before the global load
+    int start = clock();
+
+    share[tid] = in[by * 65535 + bx]; //tid + blkDimX + blkDimY + blkDimZ + grdDimX + grdDimY + grdDimZ
+
+    // Make every thread's shared store visible before the neighbour read below
+    __syncthreads();
+
+    // Second clock sample, taken after the barrier
+    int end = clock();
+
+    // Pack both samples into one word: start shifted right by 16 bits, ORed
+    // with the upper 16 bits of end
+    clocks[tid] = (start >> 16) | (end & 0xffff0000); //end - start;
+
+    // tid ^ 1 swaps each even/odd pair of threads
+    out[tid] = share[tid ^ 1];
+}
+
+// A note about using the Cuda Runtime.
+// If that's your preference over the driver API then here's what you'd do:
+
+// In your project properties in the Cuda C/C++ panel:
+//    -Set the "Keep Processed Files" (-keep) option
+//    -Add a -v manually to the command line
+// If compiling on command line just add -keep -v options to nvcc.
+// Rebuild your solution and look in the log for these lines that follow the ptxas step:
+
+// #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
+// #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
+// #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
+
+// You just need to manually run these 3 commands (or add them to a build script)
+// after you've modified the cubin generated from the preceding ptxas command.
+// That will give you a new .cu.obj file which will automatically be linked in for you next time you
+// build your project (or you could manually run the linker step as well).
+
+// Having done that you can call your kernel normally using the <<< >>> syntax.
+// Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
+// With fatbin you can also keep non-maxwell optimized versions of your code.
+
+
+// I just discovered this also works as a shortcut to the above:
+// nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=microbench.cubin -o microbench.lib microbench.cu
+
+// The cu kernel definitions above need to have empty bodies.
+// And, the cu file must be compiled to a lib separately before linking.
\ No newline at end of file
diff --git a/Assembler/MaxAs/microbench/microbench.sass b/Assembler/MaxAs/microbench/microbench.sass
new file mode 100644
index 0000000..609274a
--- /dev/null
+++ b/Assembler/MaxAs/microbench/microbench.sass
@@ -0,0 +1,72 @@
+# Kernel: microbench
+
+// This is a simple micro bench to demonstrate the latency in loading SR_TID.X
+
+<CONSTANT_MAPPING>
+    blockDimX : c[0x0][0x08]
+    blockDimY : c[0x0][0x0c]
+    blockDimZ : c[0x0][0x10]
+    gridDimX  : c[0x0][0x14]
+    gridDimY  : c[0x0][0x18]
+    gridDimZ  : c[0x0][0x1c]
+
+    param_out[0]    : c[0x0][0x140]
+    param_out[1]    : c[0x0][0x144]
+    param_clocks[0] : c[0x0][0x148]
+    param_clocks[1] : c[0x0][0x14c]
+    param_in[0]     : c[0x0][0x150]
+    param_in[1]     : c[0x0][0x154]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     0-1 : out<0-1>
+     2-3 : clocks<0-1>
+     4-5 : in<0-1>
+    6-20 : tid, bid, blockDim, clock1, clock2, result, offset, x
+
+</REGISTER_MAPPING>
+
+// Load in our params (not currently used below)
+--:-:-:-:1      MOV in0, param_in[0];
+--:-:-:-:1      MOV in1, param_in[1];
+
+// Get the first clock value
+--:-:-:-:1      CS2R clock1, SR_CLOCKLO;
+
+// Get the threadId and blockId
+// Set the Read-After-Write dependency barrier 1 and 2
+--:-:1:-:1      S2R tid, SR_TID.X;
+// Add one additional clock stall to allow the barrier time to set prior to next instruction that uses it
+--:-:2:-:2      S2R bid, SR_CTAID.X;
+
+
+// Get the second clock value
+// Wait on the dependency barriers that were set in the prior instructions
+// Stall 6 to allow CS2R time to complete before next instruction
+// CS2R takes a constant 6 clocks to complete unlike S2R which is a variable 22-44 clocks
+// This stall count does not factor into the time calculation at all
+03:-:-:-:6      CS2R clock2, SR_CLOCKLO;
+
+// Take the difference of clocks
+--:-:-:-:1      IADD clock1, clock2, -clock1;
+
+// Setup our output addresses
+// Stall your pipeline dependencies properly
+// Note using a single XMAD assumes blockDimX and bid are 16 bit values, which is reasonable for this test code
+--:-:-:-:6      XMAD offset, bid, blockDimX, tid;
+
+// LEA is "load effective address"
+// The offset param is shifted left 2 and added to the pointers with 64bit math
+--:-:-:-:6      LEA      clocks0.CC, offset, param_clocks[0],     2;
+--:-:-:-:1      LEA.HI.X clocks1,    offset, param_clocks[1], RZ, 2;
+
+--:-:-:-:6      LEA      out0.CC, offset, param_out[0],     2;
+--:-:-:-:1      LEA.HI.X out1,    offset, param_out[1], RZ, 2;
+
+// Output the results.
+// No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values
+--:-:-:-:1      STG.E [clocks], clock1;
+--:-:-:-:1      STG.E [out],    offset; # use this to return whatever you like to inspect the results
+--:-:-:-:5      EXIT;
+
diff --git a/Assembler/MaxAs/microbench/shared.pl b/Assembler/MaxAs/microbench/shared.pl
new file mode 100755
index 0000000..f760664
--- /dev/null
+++ b/Assembler/MaxAs/microbench/shared.pl
@@ -0,0 +1,12 @@
+#!/usr/bin/perl
+# Assemble shared_sts16.sass into microbench.cubin with maxas.pl, then run the
+# microbench executable in internal timing mode ("i") with 1 block of 64
+# threads. The output of both commands is echoed to STDOUT.
+use strict;
+use warnings;
+
+print `maxas.pl -i shared_sts16.sass microbench.cubin`;
+
+# $? is the wait status of the backtick command. Stop here if the assembler
+# failed, and report the failure through a non-zero exit status of our own
+# (a bare "exit" would report success).
+exit(($? >> 8) || 1) if $?;
+
+print `Release\\microbench.exe i 1 64`;
+
+
+__END__
+
diff --git a/Assembler/MaxAs/microbench/shared_lds.sass b/Assembler/MaxAs/microbench/shared_lds.sass
new file mode 100644
index 0000000..5f31dcf
--- /dev/null
+++ b/Assembler/MaxAs/microbench/shared_lds.sass
@@ -0,0 +1,122 @@
+# Kernel: microbench
+# InsCnt: 18
+# RegCnt: 5
+# SharedSize: 4096
+# BarCnt: 1
+# Params(3):
+#   ord:addr:size:align
+#   0:0x140:4:0
+#   1:0x144:4:0
+#   2:0x148:4:0
+
+// This is a simple micro bench to demonstrate the latency in loading SR_TID.X
+
+<REGISTER_MAPPING>
+
+    0-3 : result, a, b, c
+
+    4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid3, tid7, tid96, tid128, readAs, readBs, val<0-20>
+
+</REGISTER_MAPPING>
+
+// Load in our params
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R bid,      SR_CTAID.X;
+
+--:-:-:-:1      MOV result,  c[0x0][0x0];
+--:-:-:-:1      MOV in,      c[0x0][0x100];
+
+--:-:-:-:1      CS2R clock1, SR_CLOCKLO;
+--:-:-:-:1      MOV result,  c[0x0][0x13c];
+--:-:-:-:1      CS2R clock2, SR_CLOCKLO;
+
+--:-:-:-:1      MOV blockDim, c[0x0][0x8];
+--:-:-:-:1      MOV out,      c[0x0][0x140];
+--:-:-:-:1      MOV clocks,   c[0x0][0x144];
+
+
+
+
+<SCHEDULE_BLOCK>
+
+03:-:-:-:1      LOP.AND tid3,   tid, 3;
+--:-:-:-:1      LOP.AND tid7,   tid, 7;
+--:-:-:-:1      LOP.AND tid96,  tid, 96;
+--:-:-:-:1      LOP.AND tid128, tid, 128;
+
+// readAs = ((tid128 >> 4) | tid7) << 4
+--:-:-:-:1      SHR.U32 readAs, tid128, 4;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid7;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = ((tid96 >> 3) | tid3) << 4
+--:-:-:-:1      SHR.U32 readBs, tid96, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid3;
+#--:-:-:-:1      SHL     readBs, readBs, 4;
+#--:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
+
+
+</SCHEDULE_BLOCK>
+
+
+
+#--:-:-:-:1      LDS.U.128 result, [readBs];
+
+
+
+
+01:-:-:-:1      IADD clock1, clock2, -clock1;
+
+
+--:-:-:-:1      XMAD tid, blockDim, bid, tid;
+--:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
+--:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
+--:-:-:Y:6      SHL  tid, tid, 0x2;
+
+--:-:-:-:1      IADD clocks, clocks, tid;
+--:-:-:-:2      IADD out,  out,  tid;
+
+--:-:-:-:1      STG [clocks], clock1;
+--:-:-:-:1      STG [out],    readBs;
+--:-:-:-:5      EXIT;
+
+<COMMENT>
+
+--:-:-:-:4      LOP.AND tid32, tid, -32;
+
+--:-:-:-:1      STS.128 [tid32 + 4x<2048>], RZ;
+
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:1:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+
+
+// readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301;
+--:-:-:-:1      LOP.AND readAs, tid,    0x80;
+--:-:-:-:1      SHR.U32 readAs, readAs, 4;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid7;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096;
+--:-:-:-:1      LOP.AND tid1,   tid,    0x1;
+--:-:-:-:1      LOP.AND readBs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
+
+
+</COMMENT>
\ No newline at end of file
diff --git a/Assembler/MaxAs/microbench/shared_sts16.sass b/Assembler/MaxAs/microbench/shared_sts16.sass
new file mode 100644
index 0000000..2f6eb39
--- /dev/null
+++ b/Assembler/MaxAs/microbench/shared_sts16.sass
@@ -0,0 +1,116 @@
+# Kernel: microbench
+# InsCnt: 18
+# RegCnt: 5
+# SharedSize: 4096
+# BarCnt: 1
+# Params(3):
+#   ord:addr:size:align
+#   0:0x140:4:0
+#   1:0x144:4:0
+#   2:0x148:4:0
+
+// This is a simple micro bench to demonstrate the latency in loading SR_TID.X
+
+<REGISTER_MAPPING>
+
+    0-3 : result, a, b, c
+
+    4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid1, tid31, tid32, readAs, readBs, val<0-20>
+
+</REGISTER_MAPPING>
+
+// Load in our params
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R bid,      SR_CTAID.X;
+
+//--:-:-:-:1      MOV result,  c[0x0][0x0];
+//--:-:-:-:1      MOV in,      c[0x0][0x100];
+--:-:-:-:1      MOV result, 1;
+
+--:-:-:-:1      MOV blockDim, c[0x0][0x8];
+--:-:-:-:1      MOV out,      c[0x0][0x140];
+--:-:-:-:1      MOV clocks,   c[0x0][0x144];
+
+
+// readAs = ((tid >> 1) & 7) << 4;
+03:-:-:-:6      BFE.U32 readAs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:6      SHL     readAs, readAs, 3;
+
+// readBs  = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 1024;
+--:-:-:-:6      LOP.AND tid1,   tid,    1;
+--:-:-:-:6      LOP.AND readBs, tid,    0x30;
+--:-:-:-:6      SHR.U32 readBs, readBs, 3;
+--:-:-:-:6      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:6      ISCADD  readBs, readBs, 0, 3;
+
+
+
+///--:-:-:-:1      STS [tid32], result;
+//--:-:-:-:1      STS.S16 [tid32 + 2x<32>], result;
+//--:-:1:-:2      LDS.U.64 result, [readBs];
+
+--:-:-:-:0      CS2R clock1, SR_CLOCKLO;
+--:-:1:-:6      LDS.U.64 result, [readAs];
+--:-:-:-:6      CS2R clock2, SR_CLOCKLO;
+
+
+01:-:-:-:1      IADD clock1, clock2, -clock1;
+
+
+--:-:-:-:1      XMAD tid, blockDim, bid, tid;
+--:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
+--:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
+--:-:-:Y:6      SHL  tid, tid, 0x2;
+
+--:-:-:-:1      IADD clocks, clocks, tid;
+--:-:-:-:2      IADD out,  out,  tid;
+
+--:-:-:-:1      STG [clocks], clock1;
+--:-:-:-:1      STG [out],    result;
+--:-:-:-:5      EXIT;
+
+<COMMENT>
+
+--:-:-:-:4      LOP.AND tid32, tid, -32;
+
+--:-:-:-:1      STS.128 [tid32 + 4x<2048>], RZ;
+
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:1:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+
+03:-:-:-:6      LOP.AND  tid31, tid, 31;
+--:-:-:-:6      LOP.AND  tid32, tid, 32;
+--:-:-:-:6      SHL  tid32, tid32, 0x2;
+--:-:-:-:6      LOP.OR  tid32, tid32, tid31;
+--:-:-:-:6      SHL  tid32, tid32, 0x2;
+
+// readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301;
+--:-:-:-:1      LOP.AND readAs, tid,    0x80;
+--:-:-:-:1      SHR.U32 readAs, readAs, 4;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid7;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096;
+--:-:-:-:1      LOP.AND tid1,   tid,    0x1;
+--:-:-:-:1      LOP.AND readBs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
+
+
+</COMMENT>
\ No newline at end of file
diff --git a/Assembler/MaxAs/microbench/throughput.pl b/Assembler/MaxAs/microbench/throughput.pl
new file mode 100755
index 0000000..56df6e7
--- /dev/null
+++ b/Assembler/MaxAs/microbench/throughput.pl
@@ -0,0 +1,80 @@
+#!/usr/bin/perl
+# Generate a SASS throughput kernel, assemble it into microbench.cubin with
+# maxas.pl, run the microbench executable with external timing ("e") and
+# print the GFLOPS figure it reports.
+use strict;
+use warnings;
+
+# $loopSize is the instruction count per loop iteration used for the
+# operation count below; the template in writeSassFile hard-codes the
+# matching 0 .. 511 range.
+my $loopSize  = 512;
+my $blocks    = 32;
+my $loops     = 10240000;
+my $fileName  = 'throughput2.sass';
+
+writeSassFile($fileName, $loops);
+
+#print `maxas.pl -p $fileName`;
+#exit;
+
+print `maxas.pl -i $fileName microbench.cubin`;
+# $? is the wait status of the backtick command. Stop if the assembler
+# failed, and exit non-zero so the failure is visible to the caller
+# (a bare "exit" would report success).
+exit(($? >> 8) || 1) if $?;
+
+foreach my $thread128 (2)
+{
+    my $threads   = $thread128 * 128;
+    # Two floating point operations are counted per instruction.
+    my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
+
+    my $data = `Release\\microbench.exe e $blocks $threads $fops`;
+
+    # Only the integer part of the reported GFLOPS value is captured.
+    my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
+
+    printf "%d %d %d\n", $thread128, $threads, $gflops;
+}
+
+exit;
+
+# Write the throughput kernel source to $filename.
+#
+#   $filename - path of the SASS file to create (overwritten if present)
+#   $loops    - loop bound substituted for the %d in "MOV32I stop, %d;"
+#
+# The heredoc is used as a printf format, so a literal percent sign inside it
+# would have to be written as %%. Dies if the file cannot be opened or if
+# closing it fails.
+sub writeSassFile
+{
+    my ($filename, $loops) = @_;
+
+    # Three-argument open: the file name is used literally and can never be
+    # interpreted as part of the open mode.
+    open my $fh, '>', $filename or die "$filename: $!";
+
+    printf $fh <<'EOF', $loops;
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+    0-10 : result, r1, r2, r3
+    20-27 ~ count, stop
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, %d;
+--:-:-:-:1      MOV32I r1, 1.0;
+--:-:-:-:1      MOV32I r2, 1.0;
+--:-:-:-:4      MOV32I r3, 1.0;
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+    foreach my $i (0 .. 511)
+    {
+        my $yield = ($i + 32) & 63 ? '-' : 'Y';
+
+        my $stall = $i == 511 ? 0 : 1;
+
+        $out .= "--:-:-:$yield:$stall      FFMA result, r1, r2, r3;\n";
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
+EOF
+
+    # Buffered write errors only surface when the handle is closed.
+    close $fh or die "$filename: $!";
+}
+
+__END__
+
diff --git a/Assembler/MaxAs/microbench/throughput.sass b/Assembler/MaxAs/microbench/throughput.sass
new file mode 100644
index 0000000..796502f
--- /dev/null
+++ b/Assembler/MaxAs/microbench/throughput.sass
@@ -0,0 +1,95 @@
+# Kernel: microbench
+# InsCnt: 18
+# RegCnt: 5
+# SharedSize: 4096
+# BarCnt: 1
+# Params(3):
+#   ord:addr:size:align
+#   0:0x140:4:0
+#   1:0x144:4:0
+#   2:0x148:4:0
+
+<REGISTER_MAPPING>
+
+    8-20 : count
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV R0, RZ;
+--:-:-:-:1      MOV R1, RZ;
+--:-:-:-:1      MOV R2, RZ;
+--:-:-:-:1      MOV R3, RZ;
+--:-:-:-:1      MOV R4, RZ;
+--:-:-:-:1      MOV R5, RZ;
+--:-:-:-:1      MOV R6, RZ;
+--:-:-:-:1      MOV R7, RZ;
+--:-:-:-:1      MOV R8, RZ;
+--:-:-:Y:6      MOV count, RZ;
+
+// This loop is capable of running at 1700 GFlops on GM107.
+// You can tweak it to see how register bank conflicts or different control codes
+// affect performance.
+// With throughput.pl you can pass params to this code and do some autotuning.
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, 0x19000, PT;
+--:-:-:-:1      IADD count, count, 0x1;
+
+<CODE>
+    my $out;
+
+    foreach my $i (0..511) #511
+    {
+        my $y = ($i + 32) & 63 ? '-' : 'Y';
+
+        $out .= qq|
+--:-:-:$y:1      FFMA R0, R1, R2, R3;|; #c[0x0][$c]
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+
+--:-:-:-:5      EXIT;
+
+<COMMENT>
+
+
+    open my $fh, 'params.txt';
+    my $line = <$fh>;
+    close $fh;
+    my ($r1, $r2, $r3) = split "\t", $line;
+
+    80-95 : out, clocks, in, tid, clock1, clock2, result
+
+
+--:-:1:-:1      S2R tid,   SR_TID.X;
+--:-:-:-:1      MOV out,    c[0x0][0x140];
+--:-:-:-:1      MOV clocks, c[0x0][0x144];
+01:-:-:-:1      MOV in,     c[0x0][0x148];
+
+
+
+--:-:-:-:1      MOV32I f0, 0x3f800000;
+--:-:-:-:1      MOV32I f1, 0x3f800000;
+--:-:-:-:1      MOV32I f2, 0x3f800000;
+--:-:-:-:5      MOV32I f3, 0x3f800000;
+
+--:-:-:-:1      CS2R clock1, SR_CLOCKLO;
+
+
+--:-:-:-:1      CS2R clock2, SR_CLOCKLO;
+
+--:-:-:-:6      MOV32I result, 0x457;
+--:-:-:-:1      IADD clock1, clock2, -clock1;
+
+
+--:-:-:-:6      SHL  tid, tid, 0x2;
+--:-:-:-:1      IADD clocks, clocks, tid;
+--:-:-:-:1      IADD out,  out,  tid;
+
+--:-:-:-:1      STG [clocks], clock1;
+--:-:-:-:1      STG [out],    R24;
+
+
+</COMMENT>
\ No newline at end of file
diff --git a/Assembler/MaxAs/microbench/throughput2.pl b/Assembler/MaxAs/microbench/throughput2.pl
new file mode 100755
index 0000000..ea7e19f
--- /dev/null
+++ b/Assembler/MaxAs/microbench/throughput2.pl
@@ -0,0 +1,158 @@
+#!/usr/bin/perl
+# Derive launch parameters from the settings in %p, generate a SASS
+# throughput kernel, assemble it into microbench.cubin with maxas.pl and run
+# the microbench executable with external timing ("e"), echoing its output.
+use strict;
+use warnings;
+my %p;
+
+# Base settings
+$p{N}         = 8192;
+$p{blocking}  = 8;
+$p{unroll}    = 8;
+$p{threads}   = 64;   #256
+
+# Values derived from the base settings
+$p{csize}     = $p{blocking} * $p{blocking};
+$p{loopSize}  = $p{unroll} * $p{csize};
+$p{width}     = sqrt($p{csize} * $p{threads});
+$p{blocks}    = ($p{N} / $p{width}) * ($p{N} / $p{width});
+$p{loops}     = $p{N} / $p{unroll};
+$p{fops}      = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads};
+
+my $fileName  = 'throughput2.sass';
+
+my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops);
+
+#print join("\t", @params), "\n";
+#print join("\t", @p{@params}), "\n";
+
+print map sprintf("%-9s: %d\n", $_, $p{$_}), @params;
+
+writeSassFile($fileName, $p{loopSize}, $p{loops});
+
+#print `maxas.pl -p $fileName`;
+#exit;
+
+print `maxas.pl -i $fileName microbench.cubin`;
+
+# $? is the wait status of the backtick command. Stop if the assembler
+# failed, and exit non-zero so the failure is visible to the caller
+# (a bare "exit" would report success).
+exit(($? >> 8) || 1) if $?;
+
+# The trailing 50 is the fifth argument of the executable (its repeat count).
+my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`;
+
+my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
+
+print $data;
+
+#printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
+
+
+
+
+# Write the register-blocked throughput kernel source to $filename.
+#
+#   $filename - path of the SASS file to create (overwritten if present)
+#   $loopSize - accepted for call compatibility; not used by the template
+#   $loops    - loop bound substituted for the %d in "MOV32I stop, %d;"
+#
+# The heredoc is used as a printf format: the single %d receives $loops and
+# every literal percent sign in the embedded Perl is written as %% so that it
+# comes out as a single % in the generated file. Dies if the file cannot be
+# opened or if closing it fails.
+sub writeSassFile
+{
+    my ($filename, $loopSize, $loops) = @_;
+
+    # Three-argument open: the file name is used literally and can never be
+    # interpreted as part of the open mode.
+    open my $fh, '>', $filename or die "$filename: $!";
+
+    printf $fh <<'END_SASS', $loops;
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67>
+     7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67>
+     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67>
+     5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67>
+    35,34,43,42,51,50,59,58 : cx64y<00-03|64-67>
+    39,38,47,46,55,54,63,62 : cx65y<00-03|64-67>
+    33,32,41,40,49,48,57,56 : cx66y<00-03|64-67>
+    37,36,45,44,53,52,61,60 : cx67y<00-03|64-67>
+
+    64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67>
+    80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67>
+
+    0-127 : r<0-127>
+
+    100-101 : count, stop
+
+    //102-112 ~ readAs, readBs, writeS
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, %d;
+//--:-:-:-:1      MOV writeS, RZ;
+//--:-:-:-:1      MOV readAs, RZ;
+//--:-:-:-:1      MOV readBs, RZ;
+
+<CODE>
+    return join '', map "--:-:-:-:1      MOV32I r$_, 1.0;\n", 0..95;
+</CODE>
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+
+    my @cOrder;
+    #my @swirl = ([0,1],[0,0],[2,0],[2,1]);
+    my @swirl = ([2,0],[2,1],[0,1],[0,0]);
+    #my @swirl = ([0,1],[0,0],[1,0],[1,1]);
+    my @xVals = (0,1,64,65);
+    #my @xVals = (0,2,64,66);
+
+    my @yVals = (0,2,64,66);
+
+    foreach my $y (@yVals)
+    {
+        foreach my $x (@xVals)
+        {
+            push @cOrder, sprintf('x%%02dy%%02d', $x + $_->[0], $y + $_->[1]) foreach @swirl;
+        }
+        @xVals = reverse @xVals;
+    }
+
+    foreach my $j (0..7)
+    {
+        my $odd  = $j & 1;
+        my $nOdd = !$odd + 0;
+
+		my %%insert;
+
+        #$insert{c62} = "01:-:-:-:5      BAR.SYNC 0;\n" if $j == 6;
+
+        $insert{c62} =
+                "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR writeS, writeS, 0;\n" if $j == 8;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = $cOrder[$c] =~ /^(x\d+)(y\d+)/;
+            my $ins    = $insert{"c$c"} || '';
+            my $stall  = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
+            my $yield  = $c == 32 ? 'Y' : '-';
+            my $wait   = '--'; #$c ? '--' : '01';
+
+            $out .= "$wait:-:-:$yield:$stall      FFMA c$cOrder[$c], j${odd}A$x, j${odd}B$y, c$cOrder[$c];\n$ins";
+        }
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
+END_SASS
+
+    # Buffered write errors only surface when the handle is closed.
+    close $fh or die "$filename: $!";
+}
+
+__END__
+
+        my %%insert = (
+            c0 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n",
+            c2 => "--:-:-:-:1      LDS.U.128 j${nOdd}By00, [readBs+0x10];\n",
+            c4 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n",
+            c6 => "--:-:1:-:1      LDS.U.128 j${nOdd}By64, [readBs+0x10];\n",
+        );
\ No newline at end of file
diff --git a/Assembler/MaxAs/microbench/throughput2.sass b/Assembler/MaxAs/microbench/throughput2.sass
new file mode 100644
index 0000000..3db5130
--- /dev/null
+++ b/Assembler/MaxAs/microbench/throughput2.sass
@@ -0,0 +1,47 @@
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+    0-10 : result, r1, r2, r3
+    20-27 ~ count, stop
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, 102400;
+--:-:-:-:1      MOV32I r1, 1.0;
+--:-:-:-:1      MOV32I r2, 1.0;
+--:-:-:-:4      MOV32I r3, 1.0;
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+    foreach my $i (0 .. 511)
+    {
+        my $yield = ($i + 32) & 63 ? '-' : 'Y';
+
+        my $stall = $i == 511 ? 0 : 1;
+
+        #$out .= "--:-:-:$yield:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:0      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      I2F.F32.S16 result, r1;\n";
+
+        #$out .= "--:-:-:$yield:$stall      VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n";
+        #$out .= "--:-:-:-:1      MOV result, RZ;\n";
+
+        $out .= "--:-:-:$yield:$stall      IADD.SAT result, r1, r2;\n";
+        #$out .= "--:-:-:$yield:$stall      VMAD.S8.S8.SAT result, r1, r2, r3;\n";
+        #$out .= "--:-:-:$yield:$stall      XMAD result, r1, r2, r3;\n";
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
diff --git a/Assembler/MaxAs/microbench/throughput3.pl b/Assembler/MaxAs/microbench/throughput3.pl
new file mode 100755
index 0000000..ff9077a
--- /dev/null
+++ b/Assembler/MaxAs/microbench/throughput3.pl
@@ -0,0 +1,90 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+# Sweep threads per block (128 .. 1024) against unrolled loop size (512 .. 1024)
+# and print the GFLOPS figures reported by microbench as a tab-separated table.
+my %data;    # loop size => GFLOPS readings, in thread-count order
+foreach my $thread128 (1 .. 8)
+{
+    foreach my $size64 (8 .. 16)
+    {
+        my $loopSize  = $size64 * 64;
+        my $loops     = int(2 * 1638400 / ($size64 * $thread128));
+        my $blocks    = 16;
+        my $threads   = $thread128 * 128;
+        my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
+        my $fileName  = 'throughput2.sass';
+
+        #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $fops;
+        #next;
+
+        writeSassFile($fileName, $loopSize, $loops);
+
+        # Keep the assembler output so a failure can be shown, and leave with a
+        # non-zero status (a bare "exit" would report success to the caller).
+        my $asmLog = `maxas.pl -i $fileName microbench.cubin`;
+        if ($?) { print STDERR $asmLog if defined $asmLog; exit(($? >> 8) || 1); }
+
+        my $data = `Release\\microbench.exe e $blocks $threads $fops`;
+        my ($gflops) = defined $data ? $data =~ /GFLOPS: ([0-9]+)/ms : ();
+        unless (defined $gflops) { warn "no GFLOPS figure for loopSize $loopSize, threads $threads\n"; $gflops = 0; }
+
+        printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
+        push @{$data{$loopSize}}, $gflops;
+    }
+}
+print join("\t", 'size', 1 .. 8), "\n";
+foreach my $loopSize (sort {$a <=> $b} keys %data)
+{
+    print join("\t", $loopSize, @{$data{$loopSize}}), "\n";
+}
+
+exit;
+
+sub writeSassFile
+{
+    # Write the microbench kernel to $filename: $loops becomes the "stop" bound, $loopSize the number of unrolled FFMAs.
+    my ($filename, $loopSize, $loops) = @_;
+    open my $fh, '>', $filename or die "$filename: $!";
+    # The three %d fields are, in order: stop, unrolled body size (emitted as 0 .. size - 1), and the size tested against 64.
+    printf $fh <<'EOF', $loops, $loopSize, $loopSize;
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+    0-10 : result, r1, r2, r3, count, stop
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, %d;
+--:-:-:-:1      MOV32I r1, 1.0;
+--:-:-:-:1      MOV32I r2, 1.0;
+--:-:-:-:4      MOV32I r3, 1.0;
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+    foreach my $i (0 .. %d - 1)
+    {
+        my $y = %d > 64 && (($i + 32) & 63) ? '-' : 'Y';
+
+        $out .= "--:-:-:$y:1      FFMA result, r1, r2, r3;\n";
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
+EOF
+    # Buffered write errors only surface at close, so check it.
+    close $fh or die "$filename: $!";
+}
+
+__END__
+
diff --git a/Assembler/MaxAs/microbench/throughput4.pl b/Assembler/MaxAs/microbench/throughput4.pl
new file mode 100755
index 0000000..8f8760c
--- /dev/null
+++ b/Assembler/MaxAs/microbench/throughput4.pl
@@ -0,0 +1,120 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+# Generate and assemble the microbench kernel, run it, and print GFLOPS plus a percentage of 3050.
+my $loopSize  = 512;    # must match the unrolled body (0 .. 511) emitted by writeSassFile
+my $blocks    = 64;
+my $loops     = 102400;
+my $fileName  = 'throughput2.sass';
+
+writeSassFile($fileName, $loops);
+#print `maxas.pl -p $fileName`;
+#exit;
+# $? is a wait status: the child's exit code sits in the high byte, so a bare
+# "exit" or "exit($?)" reports success for an ordinary failure. Shift it out.
+print `maxas.pl -i $fileName microbench.cubin`;
+exit(($? >> 8) || 1) if $?;
+
+foreach my $thread128 (4)
+{
+    my $threads   = $thread128 * 128;
+    my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
+    print "./microbench e $blocks $threads $fops\n\n";
+    my $data = `./microbench e $blocks $threads $fops`;
+    exit(($? >> 8) || 1) if $?;
+    my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
+    die "No GFLOPS figure in microbench output:\n$data" unless defined $gflops;
+    # 3050.0 is presumably the peak GFLOPS of the test device -- TODO confirm.
+    printf "%d %d %d %.2f\n", $thread128, $threads, $gflops, 100 * $gflops / 3050.0;
+}
+
+exit;
+
+sub writeSassFile
+{
+    # Write the microbench kernel source to $filename; $loops becomes the loop bound loaded into "stop".
+    my ($filename, $loops) = @_;
+    open my $fh, '>', $filename or die "$filename: $!";
+    # printf fills in the single %d (the stop value); the rest of the template is written verbatim.
+    printf $fh <<'EOF', $loops;
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+    0-10 : result, r1, r2, r3
+    20-27 ~ count, stop
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, %d;
+--:-:-:-:1      MOV32I r1, 1.0;
+--:-:-:-:1      MOV32I r2, 1.0;
+--:-:-:-:4      MOV32I r3, 1.0;
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+    foreach my $i (0 .. 511)
+    {
+        my $yield = ($i + 32) & 63 ? '-' : 'Y';
+
+        my $stall = $i == 511 ? 0 : 1;
+
+        #$out .= "--:-:-:$yield:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:0      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      I2F.F32.S16 result, r1;\n";
+
+        #$out .= "--:-:-:$yield:$stall      VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n";
+        #$out .= "--:-:-:-:1      MOV result, RZ;\n";
+
+        $out .= "--:-:-:$yield:$stall      IADD.SAT result, r1, r2;\n";
+        #$out .= "--:-:-:$yield:$stall      VMAD.S8.S8.SAT result, r1, r2, r3;\n";
+        #$out .= "--:-:-:$yield:$stall      XMAD result, r1, r2, r3;\n";
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
+EOF
+    # Buffered write errors only surface at close, so check it.
+    close $fh or die "$filename: $!";
+}
+
+__END__
+
+VMAD.U8.U8
+
+dddd 2655 / 4968 = 53.4%
+1d1d 4594 / 4968 = 92.4%
+11d  4746 / 4968 = 95.5%
+111d 4841 / 4968 = 97.4%
+
+block context switches are a little more expensive than thread context switches
+
+stall codes:
+
+f : 13 clocks
+e :  8 clocks
+d :  6 clocks
+c :  8 clocks, no yield
+b : 11 clocks
+a : 10 clocks
+9 :  9 clocks
+8 :  8 clocks
+7 :  7 clocks
+6 :  6 clocks
+5 :  5 clocks
+4 :  4 clocks
+3 :  3 clocks
+2 :  2 clocks
+1 :  1 clocks,  no yield
+0 :  0 clocks,  no yield, dual issue
\ No newline at end of file
diff --git a/Assembler/MaxAs/microbench/throughput5.pl b/Assembler/MaxAs/microbench/throughput5.pl
new file mode 100755
index 0000000..f9bda8e
--- /dev/null
+++ b/Assembler/MaxAs/microbench/throughput5.pl
@@ -0,0 +1,164 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+my %p;    # benchmark parameters; the first four are chosen, the rest are derived from them
+$p{N}         = 8192;
+$p{blocking}  = 8;
+$p{unroll}    = 8;
+$p{threads}   = 64;   #256
+# Derived sizes: per-thread tile, unrolled loop length, block tile width, grid size, loop count, total ops.
+$p{csize}     = $p{blocking} * $p{blocking};
+$p{loopSize}  = $p{unroll} * $p{csize};
+$p{width}     = sqrt($p{csize} * $p{threads});
+$p{blocks}    = ($p{N} / $p{width}) * ($p{N} / $p{width});
+$p{loops}     = $p{N} / $p{unroll};
+$p{fops}      = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads};
+
+my $fileName  = 'throughput2.sass';
+
+my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops);
+
+#print join("\t", @params), "\n";
+#print join("\t", @p{@params}), "\n";
+
+print map sprintf("%-9s: %d\n", $_, $p{$_}), @params;
+
+writeSassFile($fileName, $p{loopSize}, $p{loops});
+
+#print `maxas.pl -p $fileName`;
+#exit;
+
+print `maxas.pl -i $fileName microbench.cubin`;
+# $? is a wait status; a bare "exit" would report success even though assembly failed.
+exit(($? >> 8) || 1) if $?;
+
+my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`;
+
+my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
+
+print $data;
+
+#printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
+
+
+
+
+sub writeSassFile
+{
+    # Write the XMAD microbench kernel to $filename; $loops becomes the "stop" bound. $loopSize is accepted but unused.
+    my ($filename, $loopSize, $loops) = @_;
+    open my $fh, '>', $filename or die "$filename: $!";
+    # The template is a printf format: only the stop value is substituted, and every literal % is written as %%.
+    printf $fh <<'END_SASS', $loops;
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+     1, 9, 2,10,17,25,18,26 : cy0x<0-7>
+     5,13, 6,14,21,29,22,30 : cy1x<0-7>
+     3,11, 0, 8,19,27,16,24 : cy2x<0-7>
+     7,15, 4,12,23,31,20,28 : cy3x<0-7>
+    35,43,32,40,51,59,48,56 : cy4x<0-7>
+    39,47,36,44,55,63,52,60 : cy5x<0-7>
+    33,41,34,42,49,57,50,58 : cy6x<0-7>
+    37,45,38,46,53,61,54,62 : cy7x<0-7>
+
+    64-71   : j0Ax<0-3>, j0By<0-3>
+    72-79   : j1Ax<0-3>, j1By<0-3>
+
+    0-79 : r<0-79>
+
+    100-101 : count, stop
+
+    //102-112 ~ readAs, readBs, writeS
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, %d;
+//--:-:-:-:1      MOV writeS, RZ;
+//--:-:-:-:1      MOV readAs, RZ;
+//--:-:-:-:1      MOV readBs, RZ;
+
+<CODE>
+    return join '', map "--:-:-:-:1      MOV r$_, RZ;\n", 0..63;
+</CODE>
+
+<CODE>
+    return join '', map "--:-:-:-:1      MOV32I r$_, 0x00010001;\n", 64..79;
+</CODE>
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+    my @swirl1 = ([0,0],[0,4],[4,4],[4,0]);
+    my @swirl2 = ([0,0],[1,0],[1,1],[0,1]);
+    my @swirl3 = ([0,2],[2,2],[2,0],[0,0]);
+
+    my @cOrder;
+    foreach my $s1 (@swirl1)
+    {
+        foreach my $s2 (@swirl2)
+        {
+            foreach my $s3 (@swirl3)
+            {
+                push @cOrder, [$s1->[0] + $s2->[0] + $s3->[0], $s1->[1] + $s2->[1] + $s3->[1]];
+            }
+        }
+    }
+
+    foreach my $j (0..7)
+    {
+        my $odd  = $j & 1;
+        my $nOdd = !$odd + 0;
+
+        my %%insert;
+
+        #$insert{c62} = "01:-:-:-:5      BAR.SYNC 0;\n" if $j == 6;
+
+        $insert{c62} =
+                "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR writeS, writeS, 0;\n" if $j == 8;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            my $ins    = $insert{"c$c"} || '';
+            my $stall  = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
+            my $yield  = $c == 32 ? 'Y' : '-';
+            my $wait   = '--'; #$c ? '--' : '01';
+
+            my $xReg  = $x >> 1;
+            my $yReg  = $y >> 1;
+            my $xPart = $x & 1 ? '.H1' : '';
+            my $yPart = $y & 1 ? '.H1' : '';
+
+            $out .= sprintf "$wait:-:-:$yield:$stall      XMAD cy%%dx%%d, j%%dAx%%d%%s, j%%dBy%%d%%s, cy%%dx%%d;\n%%s", $y,$x,  $odd,$xReg,$xPart,  $odd,$yReg,$yPart,  $y,$x,  $ins;
+        }
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
+END_SASS
+    # Buffered write errors only surface at close, so check it.
+    close $fh or die "$filename: $!";
+}
+
+__END__
+
+        my %%insert = (
+            c0 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n",
+            c2 => "--:-:-:-:1      LDS.U.128 j${nOdd}By00, [readBs+0x10];\n",
+            c4 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n",
+            c6 => "--:-:1:-:1      LDS.U.128 j${nOdd}By64, [readBs+0x10];\n",
+        );
\ No newline at end of file
diff --git a/Assembler/MaxAs/microbench/xmad.pl b/Assembler/MaxAs/microbench/xmad.pl
new file mode 100755
index 0000000..6aadb89
--- /dev/null
+++ b/Assembler/MaxAs/microbench/xmad.pl
@@ -0,0 +1,12 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+# Assemble xmad2.sass into microbench.cubin, then run ./microbench and echo its output.
+print `maxas.pl -i xmad2.sass microbench.cubin`;
+# $? is a wait status; a bare "exit" would report success even though assembly failed.
+exit(($? >> 8) || 1) if $?;
+print `./microbench i 1 128`;
+
+
+__END__
+
diff --git a/Assembler/MaxAs/microbench/xmad2.sass b/Assembler/MaxAs/microbench/xmad2.sass
new file mode 100644
index 0000000..f0ce936
--- /dev/null
+++ b/Assembler/MaxAs/microbench/xmad2.sass
@@ -0,0 +1,144 @@
+# Kernel: microbench
+# InsCnt: 18
+# RegCnt: 5
+# SharedSize: 4096
+# BarCnt: 1
+# Params(3):
+#	ord:addr:size:align
+#	0:0x140:8:0
+#	1:0x148:8:0
+#	2:0x150:8:0
+#
+# Instructions:
+
+<CONSTANT_MAPPING>
+    blockDimX : c[0x0][0x8]
+    blockDimY : c[0x0][0xc]
+    blockDimZ : c[0x0][0x10]
+    gridDimX : c[0x0][0x14]
+    gridDimY : c[0x0][0x18]
+    gridDimZ : c[0x0][0x1c]
+
+    param_out[0] : c[0x0][0x140]
+    param_out[1] : c[0x0][0x144]
+    param_clocks[0] : c[0x0][0x148]
+    param_clocks[1] : c[0x0][0x14c]
+    param_in[0] : c[0x0][0x150]
+    param_in[1] : c[0x0][0x154]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+	0-1 : out<0-1>
+	2-3 : clocks<0-1>
+    4-15  : result, result2, tid, bid, blockDim, clock1, clock2, scale, s
+    16-24 : a, b, c, x
+
+</REGISTER_MAPPING>
+
+// Load in our params (each 8-byte pointer parameter is copied as two 32-bit words)
+--:-:-:-:1      MOV out0,      param_out[0];
+--:-:-:-:1      MOV out1,      param_out[1];
+--:-:-:-:1      MOV clocks0,   param_clocks[0];
+--:-:-:-:1      MOV clocks1,   param_clocks[1];
+//--:-:-:-:1      MOV in,       c[0x0][0x148];
+--:-:-:-:1      MOV blockDim, blockDimX;
+// Give P0 an initial value; it is written again by LOP.AND.NZ before its only use.
+--:-:-:-:1      PSETP.AND.AND P0, PT, !PT, PT, PT;
+// Seed the result and operand registers with constants for the timed sequence.
+--:-:-:-:6      MOV32I result,  0xffffffff;
+--:-:-:-:6      MOV32I result2, 0x0;
+--:-:-:-:1      MOV32I a, 1;
+--:-:-:-:1      MOV32I b, 1;
+--:-:-:-:6      MOV32I c, 0x0;
+
+// (127 - scale) << 23
+//--:-:-:-:6      MOV32I scale, 28;
+//--:-:-:-:6      IADD scale, -scale, 127;
+//--:-:-:-:6      SHL  scale, scale, 23;
+
+
+//--:-:-:-:6      MOV32I c, 0x4f765432;
+
+//--:-:1:-:2      LDG.CI.128 a, [in];
+
+//01:-:-:-:6      VMAD.S16.S16 result, a, b, c;
+
+//--:-:-:-:6      MOV result, a;
+
+// a >> 16 | (b & 0xffff0000)
+
+//--:-:-:-:6      SHR.U32 result, a, 16;
+//--:-:-:-:6      LOP3.LUT result, result, b, c, 0xf8;
+
+//--:-:-:-:6      I2I.S32.S16 result, a.H1;
+
+//--:-:-:Y:d      IADD result.CC, a, -c;
+//--:-:-:Y:2      IADD.X result2, b, -RZ;
+
+//--:-:-:-:6      SHR result, a, 1;
+
+//--:-:-:-:6      BFI result, b, 0x1010, a;
+
+--:-:-:-:1      CS2R clock1, SR_CLOCKLO;
+// Timed region: clock1 and clock2 bracket the instructions under test.
+//--:-:-:-:6      XMAD.S16.S16 c, a, b, RZ;
+//--:-:-:-:6      ISET.LT.AND s, c, RZ, PT;
+//--:-:-:-:6      IADD result.CC, c, result;
+//--:-:-:-:6      IADD.X result2, s, result2;
+
+//--:-:-:-:6      XMAD.S16.S16 result.CC, a, b, result;
+//--:-:-:-:6      IADD.X result2, result2, RZ;
+
+//--:-:-:-:6      SHF.R.S64 result, result, 1, result2;
+//--:-:-:-:6      MOV32I result2, 0;
+// Live sequence: LOP.AND.NZ writes P0 from (result AND 1), which then predicates the VADD.
+--:-:-:-:f      LOP.AND.NZ P0, RZ, result, 1;
+
+--:-:-:-:6  @P0 VADD.S16.S16.SAT.MRG_16H result, a, b, result;
+
+//--:-:1:-:d      I2F.F32.S32 result2, a;
+//01:-:-:-:6      FMUL result2, result2, scale;
+//01:-:2:-:d      F2I.S32.F32 result, result2;
+// NOTE(review): the 02 wait field below looks left over from the commented-out F2I above (the only earlier line with a 2 in its third field) -- confirm.
+02:-:-:-:6      CS2R clock2, SR_CLOCKLO;
+
+//F2I   = "^$pred?F2I$ftz$x2x$round $r0, $cr20;"
+//I2F   = "^$pred?I2F$x2x$rnd $r0, $cr20;"
+//x2x   = "\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)"
+//rnd   = "(?:\.(?<rnd>RN|RM|RP|RZ))?"
+//round = "(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?"
+//r8    = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B1|B2|B3))?(?<reuse1>\.reuse)?"
+//r20   = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B1|B2|B3))?(?<reuse2>\.reuse)?"
+
+
+//--:-:-:-:1      XMAD.MRG x, a, b.H1, RZ;
+//--:-:-:-:6      XMAD result, a.H1, b.H1, c;
+//--:-:-:-:1      XMAD.PSL.CBCC result, a.H1, x.H1, result;
+
+// Read the thread index and block index
+
+--:-:1:-:1      S2R tid, SR_TID.X;
+--:-:2:-:2      S2R bid, SR_CTAID.X;
+
+
+
+// Take the difference of clocks
+--:-:-:-:1      IADD clock1, clock2, -clock1;
+// NOTE(review): clock1 now holds the clock difference, but the store below writes result2 -- confirm intent.
+// Setup our output addresses
+// Stall your pipeline dependencies properly
+03:-:-:-:1      XMAD tid, blockDim, bid, tid;
+--:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
+--:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
+--:-:-:Y:6      SHL  tid, tid, 0x2;
+// NOTE(review): the register map in this file declares out<0-1> and clocks<0-1>; plain "out"/"clocks" are not declared -- confirm they resolve.
+--:-:-:-:1      IADD clocks, clocks, tid;
+--:-:-:-:1      IADD out,  out,  tid;
+
+// Output the results.
+// No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values
+--:-:-:-:1      STG.E [clocks], result2;
+--:-:-:-:1      STG.E [out],    result;
+--:-:-:-:5      EXIT;
+
diff --git a/Assembler/MaxAs/pm_to_blib b/Assembler/MaxAs/pm_to_blib
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/MaxAs/sgemm/batched_gemm.xlsx b/Assembler/MaxAs/sgemm/batched_gemm.xlsx
new file mode 100644
index 0000000..c88f0a7
Binary files /dev/null and b/Assembler/MaxAs/sgemm/batched_gemm.xlsx differ
diff --git a/Assembler/MaxAs/sgemm/cublas_sgemm.ptx b/Assembler/MaxAs/sgemm/cublas_sgemm.ptx
new file mode 100644
index 0000000..8edec86
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/cublas_sgemm.ptx
@@ -0,0 +1,65 @@
+.version 4.1
+.target sm_50
+.address_size 64
+
+// ptxas -v -arch=sm_50 -m 32 --opt-level 4 -o cublas_sgemm.cubin cublas_sgemm.ptx
+
+// You can use maxas to insert cublas_device.lib code into a cubin built from this ptx:
+
+// From C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32\cublas_device.lib
+
+// cuobjdump -lelf cublas_device.lib | find "sm_50"
+
+// cuobjdump -xelf maxwell_sgemm.asm.sm_50.cubin cublas_device.lib
+
+// maxas -l maxwell_sgemm.asm.sm_50.cubin
+
+// maxas -e -k maxwell_sgemm_128x128_nt maxwell_sgemm_128x128_nt.sass
+// maxas -e -k maxwell_sgemm_128x64_nt  maxwell_sgemm_128x64_nt.sass
+
+// maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin
+// maxas -i maxwell_sgemm_128x64_nt.sass  cublas_sgemm.cubin
+
+// The sgemm.cpp code makes use of this cubin to benchmark the kernels outside of cublas.
+
+.visible .entry maxwell_sgemm_128x128_nt(
+	.param .u64 .ptr.global.align 8 param_A,
+	.param .u64 .ptr.global.align 8 param_B,
+	.param .u64 .ptr.global.align 8 param_C,
+	.param .s32 param_lda,
+	.param .s32 param_ldb,
+	.param .s32 param_ldc,
+	.param .s32 param_k,
+	.param .u64 .ptr.global.align 8 param_Alpha,
+	.param .u64 .ptr.global.align 8 param_Beta,
+	.param .s32 param_alpha,
+	.param .s32 param_beta,
+	.param .s32 param_flag
+)
+.reqntid 256
+{
+	.shared .align 16 .b8 share[16384];
+	// Stub body: it only fixes the parameter list, thread count and shared size; the real code is inserted with maxas -i (see the notes at the top of this file).
+	ret;
+}
+
+.visible .entry maxwell_sgemm_128x64_nt(
+	.param .u64 .ptr.global.align 8 param_A,
+	.param .u64 .ptr.global.align 8 param_B,
+	.param .u64 .ptr.global.align 8 param_C,
+	.param .s32 param_lda,
+	.param .s32 param_ldb,
+	.param .s32 param_ldc,
+	.param .s32 param_k,
+	.param .u64 .ptr.global.align 8 param_Alpha,
+	.param .u64 .ptr.global.align 8 param_Beta,
+	.param .s32 param_alpha,
+	.param .s32 param_beta,
+	.param .s32 param_flag
+)
+.reqntid 128
+{
+	.shared .align 16 .b8 share[12288];
+	// Stub body: it only fixes the parameter list, thread count and shared size; the real code is inserted with maxas -i (see the notes at the top of this file).
+	ret;
+}
diff --git a/Assembler/MaxAs/sgemm/new.cubin b/Assembler/MaxAs/sgemm/new.cubin
new file mode 100644
index 0000000..6a1572b
Binary files /dev/null and b/Assembler/MaxAs/sgemm/new.cubin differ
diff --git a/Assembler/MaxAs/sgemm/sgemm.cpp b/Assembler/MaxAs/sgemm/sgemm.cpp
new file mode 100644
index 0000000..f2127d8
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm.cpp
@@ -0,0 +1,480 @@
+// sgemm.cpp : Defines the entry point for the console application.
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <cuda.h>
+#include <cublas_v2.h>
+
+CUcontext      hContext = 0;
+cublasHandle_t hCublas  = 0;
+
+float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat = 1, int printVars = 0);
+float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat);
+void gflops(const char* ident, int N, float ms, int repeat);
+void test(float* C, float* T, int N, size_t size);
+
+#define REPEAT_BLOCK 2000
+
+#define CUDA_CHECK( fn ) do { /* run a driver API call; on failure report it, release cuBLAS and the context, and exit */ \
+		CUresult status = (fn); \
+		if ( CUDA_SUCCESS != status ) { \
+			const char* errstr = NULL; /* may stay NULL if the lookup below fails, so never pass it to %s unchecked */ \
+			if (cuGetErrorString(status, &errstr) != CUDA_SUCCESS || errstr == NULL) errstr = "unknown error"; \
+			printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \
+			if (hCublas)  cublasDestroy(hCublas); \
+			if (hContext) cuCtxDestroy(hContext); \
+			exit(EXIT_FAILURE); \
+		} \
+	} while (0)
+
+#define CUBLAS_CHECK( fn ) do { /* run a cuBLAS call; on failure report it, release cuBLAS and the context, and exit */ \
+		const cublasStatus_t cublasRc = (fn); \
+		if ( cublasRc != CUBLAS_STATUS_SUCCESS ) { \
+			printf("Cublas Failure (line %d of file %s):\n\t%s returned %d\n", __LINE__, __FILE__, #fn, cublasRc); \
+			if (hCublas)  cublasDestroy(hCublas); \
+			if (hContext) cuCtxDestroy(hContext); \
+			exit(EXIT_FAILURE); \
+		} \
+	} while (0)
+
+int main(int argc, char* argv[])
+{
+	char deviceName[32];
+	int count, ordinal, major, minor;
+	CUdevice  hDevice;
+	CUevent hStart, hStop;
+	CUdeviceptr devA, devB, devC, devT, otherDevA, otherDevB;
+	// Usage: sgemm [N/128 (1..64)] [repeat (1..10000)] [CUarray_format value] [printVars (1..100)]
+	// Initialize the Driver API and find a device
+	CUDA_CHECK( cuInit(0) );
+	CUDA_CHECK( cuDeviceGetCount(&count) );
+	for (ordinal = 0; ordinal < count; ordinal++)
+	{
+		CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) );
+		CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) );
+		CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) );
+		CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) );
+		if (major == 5 && minor >= 2) // sm_5x cubins only load on compute 5.x; testing major and minor independently would also accept e.g. 7.5
+		{
+			//printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor);
+			break;
+		}
+	}
+	if (ordinal == count)
+	{
+		printf("No compute 5.2+ (Maxwell) device found, exiting.\n");
+		exit(EXIT_FAILURE);
+	}
+
+	// First command line arg is the size of N divided by 128
+	int thread128 = 64;
+	if (argc > 1)
+		thread128 = atoi(argv[1]);
+	if (thread128 > 64 || thread128 < 1)
+		thread128 = 64;
+
+	// Second command line arg is the repeat count for benchmarking
+	int repeat = 1;
+	if (argc > 2)
+		repeat = atoi(argv[2]);
+	if (repeat > 10000 || repeat < 1)
+		repeat = 1;
+
+	// Third command line arg is the normalized float size
+	CUarray_format format = CU_AD_FORMAT_FLOAT;
+	if (argc > 3)
+		format = (CUarray_format)atoi(argv[3]);
+	if (format != CU_AD_FORMAT_FLOAT && format != CU_AD_FORMAT_UNSIGNED_INT16 && format != CU_AD_FORMAT_UNSIGNED_INT8)
+		format = CU_AD_FORMAT_FLOAT;
+
+	// Fourth command line arg is for printf debugging
+	int printVars = 0;
+	if (argc > 4)
+		printVars = atoi(argv[4]);
+	if (printVars > 100 || printVars < 1)
+		printVars = 0;
+
+	int N = thread128 * 128;
+	float alpha = 1, beta = 0, ms = 1;
+	size_t sizeOther = N * N;
+	size_t sizeFloat = sizeOther * 4;
+
+	float* A = (float*)malloc(sizeFloat);
+	float* B = (float*)malloc(sizeFloat);
+	float* C = (float*)malloc(sizeFloat);
+	float* T = (float*)malloc(sizeFloat);
+	float *otherA, *otherB;
+
+	//int counter = 0;
+	//srand((unsigned int)time(0));
+	for(int i = 0; i < N * N; i++) //
+	{
+		//A[i] = (float)rand() / (float)RAND_MAX;
+		//B[i] = (float)rand() / (float)RAND_MAX;
+		A[i] = B[i] = 1.0f; // * (i & 3) + 1.0f;
+		//A[i] = 1.0f;
+		//B[i * N + counter++] = 1.0f; // identity matrix
+	}
+
+	if (format == CU_AD_FORMAT_FLOAT)
+	{
+		sizeOther *= 4;
+		otherA = A;
+		otherB = B;
+	}
+	else if (format == CU_AD_FORMAT_UNSIGNED_INT16)
+	{
+		sizeOther *= 2;
+		unsigned short* othera = (unsigned short*)malloc(sizeOther);
+		unsigned short* otherb = (unsigned short*)malloc(sizeOther);
+		for(int i = 0; i < N * N; i++)
+			othera[i] = otherb[i] = 65535;
+
+		otherA = reinterpret_cast<float*>(othera);
+		otherB = reinterpret_cast<float*>(otherb);
+	}
+	else // (format == CU_AD_FORMAT_UNSIGNED_INT8)
+	{
+		otherA = (float*)malloc(sizeOther);
+		otherB = (float*)malloc(sizeOther);
+		memset(otherA, 255, sizeOther);
+		memset(otherB, 255, sizeOther);
+	}
+
+	CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
+	//CUBLAS_CHECK( cublasCreate(&hCublas) );
+
+	CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); // CU_EVENT_DEFAULT
+	CUDA_CHECK( cuEventCreate(&hStop,  CU_EVENT_BLOCKING_SYNC) );
+
+	CUDA_CHECK( cuMemAlloc(&devA, sizeFloat) );
+	CUDA_CHECK( cuMemAlloc(&devB, sizeFloat) );
+	CUDA_CHECK( cuMemAlloc(&devC, sizeFloat) );
+	CUDA_CHECK( cuMemAlloc(&devT, sizeFloat) );
+
+	CUDA_CHECK( cuMemcpyHtoD(devA, A, sizeFloat) );
+	CUDA_CHECK( cuMemcpyHtoD(devB, B, sizeFloat) );
+	CUDA_CHECK( cuMemsetD8(devC, 0, sizeFloat) );
+	CUDA_CHECK( cuMemsetD8(devT, 0, sizeFloat) );
+
+	if (format == CU_AD_FORMAT_FLOAT)
+	{
+		otherDevA = devA;
+		otherDevB = devB;
+	}
+	else
+	{
+		CUDA_CHECK( cuMemAlloc(&otherDevA, sizeOther) );
+		CUDA_CHECK( cuMemAlloc(&otherDevB, sizeOther) );
+		CUDA_CHECK( cuMemcpyHtoD(otherDevA, otherA, sizeOther) );
+		CUDA_CHECK( cuMemcpyHtoD(otherDevB, otherB, sizeOther) );
+	}
+
+	// Warm up the clock (unless under nsight)
+	//if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER
+	//	for (int i = 0; i < 3; i++)
+	//		CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast<float*>(devA), N, reinterpret_cast<float*>(devB), N, &beta, reinterpret_cast<float*>(devT), N) );
+
+	// Launch our kernel
+	ms = assemblySgemm("sgemm_kernel_64", format, sizeOther, devC, otherDevA, otherDevB, N, hStart, hStop, repeat, printVars);
+	gflops("Max64 ", N, ms, repeat);
+
+	ms = assemblySgemm("sgemm_kernel_128", format, sizeOther, devC, otherDevA, otherDevB, N, hStart, hStop, repeat, printVars);
+	gflops("Max128", N, ms, repeat);
+
+	//ms = cublasSgemm("maxwell_sgemm_128x64_nt", devT, devA, devB, N, hStart, hStop, repeat);
+	//gflops("Cub64 ", N, ms, repeat);
+
+	//ms = cublasSgemm("maxwell_sgemm_128x128_nt", devT, devA, devB, N, hStart, hStop, repeat);
+	//gflops("Cub128", N, ms, repeat);
+
+	// Run cublas again for the same repeat count for comparison
+	//CUDA_CHECK( cuEventRecord(hStart, NULL) );
+	//for (int i = 0; i < repeat; i++)
+	//	CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast<float*>(devA), N, reinterpret_cast<float*>(devB), N, &beta, reinterpret_cast<float*>(devT), N) );
+	//CUDA_CHECK( cuEventRecord(hStop, NULL) );
+	//CUDA_CHECK( cuEventSynchronize(hStop) );
+	//CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) );
+	//gflops("Cublas", N, ms, repeat);
+
+	// Get back our results from each kernel
+	CUDA_CHECK( cuMemcpyDtoH(C, devC, sizeFloat) );
+	CUDA_CHECK( cuMemcpyDtoH(T, devT, sizeFloat) );
+
+	// Cleanup and shutdown of cuda
+	CUDA_CHECK( cuMemFree(devA) );
+	CUDA_CHECK( cuMemFree(devB) );
+	CUDA_CHECK( cuMemFree(devC) );
+	CUDA_CHECK( cuMemFree(devT) );
+	if (format != CU_AD_FORMAT_FLOAT)
+	{
+		CUDA_CHECK( cuMemFree(otherDevA) );
+		CUDA_CHECK( cuMemFree(otherDevB) );
+	}
+
+	CUDA_CHECK( cuEventDestroy(hStart) );
+	CUDA_CHECK( cuEventDestroy(hStop) );
+
+	//CUBLAS_CHECK( cublasDestroy(hCublas) );
+	//hCublas  = 0;
+	CUDA_CHECK( cuCtxDestroy(hContext) );
+	hContext = 0;
+
+	// compare C and T for accuracy (T stays all zeros while the cublas runs above are commented out)
+	test(C, T, N, sizeFloat);
+
+	// And free up host memory
+	free(A); free(B); free(C); free(T);
+
+	if (format != CU_AD_FORMAT_FLOAT)
+	{
+		free(otherA);
+		free(otherB);
+	}
+
+	return 0;
+}
+
+// Our kernel wrapper function
+float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat, int printVars)
+{
+	// Configure our x and y grid dimensions (assume nice square matrixes).
+	// Each block gets 128 tracks from A and 128 tracks from B.
+	// Each of the 256 threads calculates 64 elements of that 128x128 sub matrix of C.
+	// See Figure 2 here to get the gist of things (we use a different mapping to maximize LDS.128 usage):
+	// http://icl.cs.utk.edu/projectsfiles/magma/pubs/fermi_gemm.pdf
+	// Returns the elapsed milliseconds summed over all launches; with printVars set it also prints per-block debug values.
+	int threads, width;
+	if (strcmp(kernel, "sgemm_kernel_64") == 0)
+	{
+		threads = 64;
+		width   = 64;
+	}
+	else
+	{
+		threads = 256;
+		width   = 128;
+	}
+
+	int gridDimXY = N / width + (N % width != 0);
+	int blocks    = gridDimXY * gridDimXY;
+
+	// Setup our debug printf output buffer: printVars ints per thread, zero-filled on the device
+	CUdeviceptr devD = 0; // CUdeviceptr is an integer handle, so 0 rather than NULL
+	int* D = NULL;
+	size_t sizeD = 0;
+
+	if (printVars)
+	{
+		sizeD = (size_t)blocks * threads * printVars * sizeof(int);
+		D = (int*)malloc(sizeD);
+		if (D == NULL) { printf("Out of host memory for the debug buffer\n"); exit(EXIT_FAILURE); }
+		CUDA_CHECK( cuMemAlloc(&devD, sizeD) );
+		CUDA_CHECK( cuMemsetD8(devD, 0, sizeD) );
+	}
+
+	// Load the cubin
+	CUmodule hModule;
+	CUDA_CHECK( cuModuleLoad(&hModule, "sgemm.cubin") );
+
+	// Load the textures
+	CUtexref texA, texB;
+	CUDA_CHECK( cuModuleGetTexRef(&texA, hModule, "texA") );
+	CUDA_CHECK( cuModuleGetTexRef(&texB, hModule, "texB") );
+
+	// Configure the textures
+	CUDA_CHECK( cuTexRefSetFormat(texA, format, 4) );
+	CUDA_CHECK( cuTexRefSetFormat(texB, format, 4) );
+
+	CUDA_CHECK( cuTexRefSetAddress(NULL, texA, devA, size) );
+	CUDA_CHECK( cuTexRefSetAddress(NULL, texB, devB, size) );
+
+	// Load the kernel function
+	CUfunction hKernel;
+	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) );
+
+	// Setup the params
+	float alpha = 1.0f;
+	void* params[] = { &devC, &N, &N, &N, &N, &N, &N, &alpha, &devD };
+
+	float totalTime = 0;
+	// Launch the kernel repeat times.. but break it up into pieces so as not to lock things up.
+	while (repeat > 0)
+	{
+		float ms;
+		int r = repeat > REPEAT_BLOCK ? REPEAT_BLOCK : repeat;
+		CUDA_CHECK( cuEventRecord( hStart, NULL ) );
+
+		for (int i = 0; i < r; i++)
+			CUDA_CHECK( cuLaunchKernel(hKernel, gridDimXY, gridDimXY, 1, threads, 1, 1, 0, 0, params, 0) );
+
+		CUDA_CHECK( cuEventRecord( hStop, NULL ) );
+		CUDA_CHECK( cuEventSynchronize( hStop ) );
+		CUDA_CHECK( cuEventElapsedTime( &ms, hStart, hStop ) );
+		totalTime += ms;
+		repeat -= r;
+	}
+
+
+	CUDA_CHECK( cuModuleUnload(hModule) );
+
+	// And here we print out the debug info if requested:
+	if (printVars)
+	{
+		CUDA_CHECK( cuMemcpyDtoH(D, devD, sizeD) );
+		CUDA_CHECK( cuMemFree(devD) );
+		int   *iD = D;
+		float *fD = reinterpret_cast<float*>(D);
+		unsigned int *uD = reinterpret_cast<unsigned int*>(D);
+		// One output line per block: word 0 of the block's last thread, the smallest word 1 over its threads, then by and bx.
+		for (int by = 0; by < gridDimXY; by++)
+		{
+			for (int bx = 0; bx < gridDimXY; bx++)
+			{
+				unsigned int clock = 0xffffffff, sm = 0;
+
+				for (int tid = 0; tid < threads; tid++)
+				{
+					//printf("by: %3d, bx: %3d, tid:%3d, rA:%5d, rB:%5d, wr:%5d, rd:%5d, cx:%5d, cy:%5d, ci:%5d, c:%.2f\n", 
+					//printf("by: %3d, bx: %3d, tid:%3d, t0:%5d, end:%5d, k:%5d, tid2:%5d, tid15:%5d, ldx:%5d, t2:%5d, t4:%5d\n", 
+					//	    by,      bx,      tid,     iD[0],  iD[1],   iD[2], iD[3],    iD[4],     iD[5],   iD[6],  iD[7]
+					//);
+					if (printVars > 1 && uD[1] < clock) clock = uD[1]; // word 1 only exists when each thread has at least two slots
+					sm = uD[0];
+
+					iD += printVars;
+					fD += printVars;
+					uD += printVars;
+				}
+				printf("%02u %08u %d %d\n", sm, clock, by, bx);
+			}
+		}
+		free(D);
+	}
+
+	return totalTime;
+}
+
+typedef struct dPointer // pointer-argument wrapper; cublasSgemm() in this file builds each one as { devicePtr, 0 }
+{
+	CUdeviceptr lo; // receives the device address
+	CUdeviceptr hi; // always initialised to 0 by the code in this file
+} dPointer;
+
+float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat)
+{
+	// Launches the named kernel from cublas_sgemm.cubin `repeat` times and returns the elapsed milliseconds summed over all launches.
+	int threads, gridX, gridY;
+	if (strcmp(kernel, "maxwell_sgemm_128x64_nt") == 0)
+	{
+		threads = 128;
+		gridX = N / 128 + (N % 128 != 0);
+		gridY = N / 64  + (N % 64  != 0);
+	}
+	else
+	{
+		threads = 256;
+		gridX = gridY = N / 128 + (N % 128 != 0);
+	}
+
+	// Load the cubin
+	// See cublas_sgemm.ptx for info on how to build this.
+	CUmodule hModule;
+	CUDA_CHECK( cuModuleLoad(&hModule, "cublas_sgemm.cubin") );
+
+	// Load the kernel function
+	CUfunction hKernel;
+	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) );
+
+	// Setup the params
+	// I should probably be working in 64 bits...
+	dPointer dA = { devA, 0 };
+	dPointer dB = { devB, 0 };
+	dPointer dC = { devC, 0 };
+
+	int   flag  = 0;
+	float alpha = 1.0f;
+	float beta  = 0.0f;
+	// Order follows the .param list in cublas_sgemm.ptx: A, B, C, lda, ldb, ldc, k, Alpha, Beta, alpha, beta, flag (dA is reused for Alpha and Beta).
+	void* params[] = { &dA, &dB, &dC, &N, &N, &N, &N, &dA, &dA, &alpha, &beta, &flag };
+
+	float totalTime = 0;
+	// Launch the kernel repeat times.. but break it up into pieces so as not to lock things up.
+	while (repeat > 0)
+	{
+		float ms;
+		int r = repeat > REPEAT_BLOCK ? REPEAT_BLOCK : repeat;
+		CUDA_CHECK( cuEventRecord( hStart, NULL ) );
+
+		for (int i = 0; i < r; i++)
+			CUDA_CHECK( cuLaunchKernel(hKernel, gridX, gridY, 1, threads, 1, 1, 0, 0, params, 0) );
+
+		CUDA_CHECK( cuEventRecord( hStop, NULL ) );
+		CUDA_CHECK( cuEventSynchronize( hStop ) );
+		CUDA_CHECK( cuEventElapsedTime( &ms, hStart, hStop ) );
+		totalTime += ms;
+		repeat -= r;
+	}
+
+
+	CUDA_CHECK( cuModuleUnload(hModule) );
+
+	return totalTime;
+}
+
+void gflops(const char* ident, int N, float ms, int repeat)
+{
+	// Average the time over the launches, then apply the usual 2*N^3 + N^2 operation count for a square sgemm.
+	const float msPerRun = ms / repeat;
+	printf("%s GFLOPS: %.2f (size: %d, iterations: %d)\n", ident, ((double)N * N * N * 2.0 + N * N) / (msPerRun * 1000000.0), N, repeat);
+}
+
+void test(float* C, float* T, int N, size_t size)
+{
+	// Compare our implementation with the cublas result and print the number of mismatching elements.
+	// For N <= 512 a per-element map is also written to data.txt ('=' marks equal elements).
+	int errors = 0;
+	if (memcmp(C, T, size) == 0)
+	{
+		printf("%d errors\n", errors);
+		return;
+	}
+	if (N > 512) // The element dump gets too big and slow for large N
+	{
+		// memcmp() only reports that the buffers differ; its return value is not a count.
+		for (size_t i = 0; i < size / sizeof(float); i++)
+			if (C[i] != T[i])
+				errors++;
+		printf("%d errors\n", errors);
+		return;
+	}
+	FILE* file;
+	if (fopen_s(&file, "data.txt", "w") != 0)
+	{
+		printf("Cannot open data.txt for writing\n");
+		return;
+	}
+	for (int y = 0; y < N; ++y)
+	{
+		for (int x = 0; x < N; ++x)
+		{
+			float c = C[x*N + y];
+			float t = T[x*N + y];
+			if (c != t)
+			{
+				errors++;
+				fprintf(file, "%.8f!%.8f\t", c , t);
+			}
+			else
+			{
+				fprintf(file, "=");
+			}
+		}
+		fprintf(file, "\n");
+	}
+	fclose(file);
+	printf("%d errors\n", errors);
+}
\ No newline at end of file
diff --git a/Assembler/MaxAs/sgemm/sgemm.cu b/Assembler/MaxAs/sgemm/sgemm.cu
new file mode 100644
index 0000000..ce8b2a6
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm.cu
@@ -0,0 +1,105 @@
+
+// Note this file isn't configured to automatically compile.
+// Here's how:
+
+// If you want to look at the ptx first:
+// nvcc -arch sm_50 -m 32 -ptx sgemm.cu
+
+// Manually compile your kernel to a cubin.
+// You should only have to do this once, unless you change params or shared size or globals:
+// nvcc -arch sm_50 -m 32 -cubin sgemm.cu
+
+// If tweaking a kernel or writing a new one based on this shell code you would then do this:
+// maxas.pl -e kernel.cubin kernel.sass
+
+// I've already included a modified kernel (sgemm.sass) so the next step is..
+
+// Splice the manually assembled code back into the cubin:
+// maxas.pl -i sgemm.sass sgemm.cubin
+
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+#include <cuda_texture_types.h>
+#include <texture_fetch_functions.h>
+
+typedef texture<float4, cudaTextureType1D, cudaReadModeElementType> floatTex;
+
+floatTex  texA(0, cudaFilterModePoint, cudaAddressModeBorder);
+floatTex  texB(0, cudaFilterModePoint, cudaAddressModeBorder);
+
+// Shell kernel for the 128-wide sgemm variant.
+// Its body is placeholder code: per the notes at the top of this file, this source is compiled
+// to a cubin and the hand-written sass is then spliced in with maxas.pl. What matters here is
+// the parameter list, the shared memory size and the texture references.
+//
+// Use extern C so C++ doesn't mangle our kernel name
+extern "C"
+// This kernel requires 256x1x1 threads per block
+__global__ void __launch_bounds__(256) sgemm_kernel_128(
+	float *C,
+	const int m,   const int n,   const int k,
+	const int lda, const int ldb, const int ldc,
+	float alpha, int *D)
+{
+	// Declare any shared memory your kernel requires
+	// Or you could just pass the amount in as a param to cuLaunchKernel
+	// (1024 float4 values = 16384 bytes)
+	__shared__ float4 share[1024];
+
+	int tid = threadIdx.x;
+
+	// If you use indirect texture references, they will be passed as params at the end of the param list
+	// So set that up here to make sure they're available in your kernel
+	// Threads 0-127 select texA, threads 128-255 select texB.
+	floatTex tex = tid > 127 ? texB : texA;
+
+	// Make use of shared and your textures so it doesn't get optimized away
+	share[tid] = tex1Dfetch(tex, tid);
+
+	__syncthreads();
+
+	// output something so your setup isn't optimized away.
+	// Each thread reads the entry written by thread (255 - tid), hence the barrier above.
+	C[tid] = share[255-tid].x;
+}
+
+// Shell kernel for the 64-wide sgemm variant; same parameter list as sgemm_kernel_128.
+// As with the 128 version, the body is placeholder code that only exists so the compiled
+// cubin carries the parameters, shared memory and texture references.
+// This kernel is bounded to 64 threads per block.
+extern "C"
+__global__ void __launch_bounds__(64) sgemm_kernel_64(
+	float *C,
+	const int m,   const int n,   const int k,
+	const int lda, const int ldb, const int ldc,
+	float alpha, int *D)
+{
+	// 512 float4 values = 8192 bytes of shared memory
+	__shared__ float4 share[512];
+
+	int tid = threadIdx.x;
+
+	// NOTE(review): with at most 64 threads per block, tid > 127 is never true, so only
+	// texA is ever selected at run time - confirm the compiled cubin still references texB.
+	floatTex tex = tid > 127 ? texB : texA;
+
+	// Touch shared memory and the texture so neither is optimized away
+	share[tid] = tex1Dfetch(tex, tid);
+
+	__syncthreads();
+
+	// Output something so the setup above isn't optimized away.
+	// NOTE(review): 255-tid is carried over from the 256-thread kernel; with 64 threads it
+	// reads entries 192..255, which no thread wrote. Presumably harmless because this body
+	// is replaced by the hand-written sass - verify.
+	C[tid] = share[255-tid].x;
+}
+
+// A note about using the Cuda Runtime.
+// If that's your preference over the driver API then here's what you'd do:
+
+// In your project properties in the Cuda C/C++ panel:
+//    -Set the "Keep Processed Files" (-keep) option
+//    -Add a -v manually to the command line
+// If compiling on command line just add -keep -v options to nvcc.
+// Rebuild your solution and look in the log for these lines that follow the ptxas step:
+
+// #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
+// #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
+// #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
+
+// You just need to manually run these 3 commands (or add them to a build script)
+// after you've modified the cubin generated from the preceding ptxas command.
+// That will give you a new .cu.obj file which will automatically be linked in for you next time you
+// build your project (or you could manually run the linker step as well).
+
+// Having done that you can call your kernel normally using the <<< >>> syntax.
+// Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
+// With fatbin you can also keep non-maxwell optimized versions of your code.
+
+
+// I just discovered this also works as a shortcut to the above:
+// nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=sgemm.cubin -o sgemm.lib sgemm.cu
+
+// The cu kernel definitions above need to have empty bodies.
+// And, the cu file must be compiled to a lib separately before linking.
\ No newline at end of file
diff --git a/Assembler/MaxAs/sgemm/sgemm.pl b/Assembler/MaxAs/sgemm/sgemm.pl
new file mode 100644
index 0000000..9b1661b
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm.pl
@@ -0,0 +1,102 @@
+#!/usr/bin/perl
+use strict;
+
+# Texture data formats passed to sgemm.exe as its third argument
+# (values match the CUDA driver API CU_AD_FORMAT_* constants of the same names).
+my $CU_AD_FORMAT_UNSIGNED_INT8  = 0x01;
+my $CU_AD_FORMAT_UNSIGNED_INT16 = 0x02;
+my $CU_AD_FORMAT_FLOAT          = 0x20;
+
+# Re-run the maxas pipeline for one kernel variant (128 or 64).
+# The work is skipped unless the preprocessed file is missing or older than the
+# hand-written sass source.  If a maxas step fails, the script exits with a
+# non-zero status so that callers can detect the failed build.
+sub rebuild_kernel
+{
+    my ($size) = @_;
+
+    my $source = "sgemm${size}.sass";
+    my $pre    = "sgemm_pre_${size}.sass";
+
+    # Element 9 of stat() is the modification time.
+    if (!-f $pre || (stat $source)[9] > (stat $pre)[9])
+    {
+        # Write the preprocessed source (useful for inspecting the expanded code)
+        print `maxas.pl -p $source $pre`;
+        exit 1 if $?;
+        # Splice the assembled kernel into the cubin
+        print `maxas.pl -i $source sgemm.cubin`;
+        exit 1 if $?;
+        # Extract the final kernel back out of the cubin for reference
+        print `maxas.pl -e -k sgemm_kernel_${size} sgemm.cubin sgemm_final_${size}.sass`;
+    }
+    return;
+}
+
+rebuild_kernel($_) foreach (128, 64);
+
+#print `Release\\sgemm.exe $_ 20` foreach (80,60,40,30,20,10,9,8,7,6,5,4,3,2);
+
+# First run: output is discarded (presumably a warm-up).
+`Release\\sgemm.exe 64 5 $CU_AD_FORMAT_FLOAT`;
+
+print `Release\\sgemm.exe 64 20 $CU_AD_FORMAT_UNSIGNED_INT8`;
+
+# NOTE: this exit disables the full size sweep below; remove it to run the sweep.
+exit;
+
+# GFLOPS results keyed by matrix size, in the order sgemm.exe printed them.
+my %data;
+foreach my $thread128 (4 .. 64)
+{
+    my $N = $thread128 * 128;
+
+    # Scale the iteration count inversely with N^3: 20 iterations at the largest size, capped at 10000.
+    my $iterations = int(20 * (64 * 128)**3 / $N**3);
+    $iterations = 10000 if $iterations > 10000;
+
+    print "$N $iterations\n";
+
+    my $data = `Release\\sgemm.exe $thread128 $iterations $CU_AD_FORMAT_UNSIGNED_INT16`;
+
+    foreach my $bench (split "\n", $data)
+    {
+        if ($bench =~ /^(\w+)\s+GFLOPS: ([0-9.]+) /)
+        {
+            push @{$data{$N}}, $2;
+            print "$1 $2\n";
+        }
+    }
+}
+print join("\t", qw(size Max64 Max128 Cub64 Cub128)), "\n";
+
+foreach my $N (sort { $a <=> $b } keys %data)
+{
+    # Lead each row with the size so the values line up with the header columns.
+    print join("\t", $N, @{$data{$N}}), "\n";
+}
+
+
+#print $data;
+
+__END__
+
+
+64 * 128 * 16 * 1.620 * .931 / 520
+
+Max64  GFLOPS: 1377.38 (size: 256, iterations: 2000)
+Max128 GFLOPS: 973.70 (size: 256, iterations: 2000)
+Cub64  GFLOPS: 1272.42 (size: 256, iterations: 2000)
+Cub128 GFLOPS: 948.15 (size: 256, iterations: 2000)
+
+my @data = grep /\S/, split "\n", $data;
+
+my $min;
+my %smData;
+my @sdata;
+foreach (@data)
+{
+    next if /GFLOPS/;
+
+    my ($sm, $clock, $by, $bx) = split /\s+/;
+
+    $smData{$sm} = $clock if !$smData{$sm} || $clock < $smData{$sm};
+
+    $min = $clock if !$min || $clock < $min;
+
+    push @sdata, [$sm, $clock, $by, $bx];
+}
+
+foreach (@sdata)
+{
+    $_->[1] -= $smData{$_->[0]};
+}
+
+foreach (sort {$a->[1] <=> $b->[1] || $a->[0] <=> $b->[0]} @sdata)
+{
+    printf "%02d %8u  by: %2d bx: %2d\n", @$_;
+
+}
+
+
diff --git a/Assembler/MaxAs/sgemm/sgemm.sln b/Assembler/MaxAs/sgemm/sgemm.sln
new file mode 100644
index 0000000..bcbee09
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm.sln
@@ -0,0 +1,20 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sgemm", "sgemm.vcxproj", "{D571379D-3653-43CB-BE83-A6C68D392A05}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.ActiveCfg = Debug|Win32
+		{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.Build.0 = Debug|Win32
+		{D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.ActiveCfg = Release|Win32
+		{D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/Assembler/MaxAs/sgemm/sgemm.vcxproj b/Assembler/MaxAs/sgemm/sgemm.vcxproj
new file mode 100644
index 0000000..6d28ced
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm.vcxproj
@@ -0,0 +1,92 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{D571379D-3653-43CB-BE83-A6C68D392A05}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>sgemm</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalLibraryDirectories>$(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>cuda.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalLibraryDirectories>$(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>cuda.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="sgemm.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="sgemm128.sass" />
+    <None Include="sgemm64.sass" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/Assembler/MaxAs/sgemm/sgemm128.sass b/Assembler/MaxAs/sgemm/sgemm128.sass
new file mode 100644
index 0000000..038d2f3
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm128.sass
@@ -0,0 +1,613 @@
+# Kernel: sgemm_kernel_128
+#
+# SharedSize: 16384
+# Params(8):
+#   0:0x140:4:4 param_C,
+#   1:0x144:4:0 param_m,
+#   2:0x148:4:0 param_n,
+#   3:0x14c:4:0 param_k,
+#   4:0x150:4:0 param_lda,
+#   5:0x154:4:0 param_ldb,
+#   6:0x158:4:0 param_ldc
+#   7:0x15c:4:0 param_alpha
+#   8:0x160:4:4 param_D // for diagnostic printf output
+#
+# Globals:
+#   c[0x0][0x164]: texA (the value is 1)
+#   c[0x0][0x168]: texB (the value is 0)
+
+<REGISTER_MAPPING>
+
+    // Temporary registers to calculate the state registers. Reuse the C output registers.
+    // These can be dynamically allocated (~) in the available register space to eliminate any register bank conflicts.
+    0-63    ~ blk, ldx, ldx2, ldx4, k, tid1, tid4, tid7, tid31_4, xmad_t0, xmad_end, bxOrig, byOrig, loy
+
+    // Aliases for the C registers we use for initializing C (used as vectors)
+    0-63    : cz<00-63>
+
+    // The offset we store our zero value for initializing C. Reuse a register from the second blocking registers
+    80      : zOffset
+
+    // 64 C matrix output registers.
+    // Use special mapping to avoid register bank conflicts between these registers and the blocking registers.
+     3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67>
+     7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67>
+     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67>
+     5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67>
+    35,34,43,42,51,50,59,58 : cx64y<00-03|64-67>
+    39,38,47,46,55,54,63,62 : cx65y<00-03|64-67>
+    33,32,41,40,49,48,57,56 : cx66y<00-03|64-67>
+    37,36,45,44,53,52,61,60 : cx67y<00-03|64-67>
+
+    // Double buffered register blocking used in vector loads.
+    // Any bank conflicts that we can't avoid in these registers we can hide with .reuse flags
+    64-79   : j0Ax<00-03|64-67>, j0By<00-03|64-67>
+    80-95   : j1Ax<00-03|64-67>, j1By<00-03|64-67>
+
+    // Registers to load A or B
+    96-103  : loadX<0-7>
+
+    // Key global state registers for main loop and some we reuse for outputting C.
+    // Note, tweaking the register banks of track<0|4>, tex, writeS, readBs, readAs impacts performance because of
+    // delayed bank conflicts between memory operations and ffmas.
+    // The array index bracket notation can be used to request a bank in a dynamically allocated range.
+    104-127 ~ track<0|4>[0], tex[2], readAs[2], readBs[3], writeS[3], end, ldx8, tid, bx, by, tid31, tid96, tid128 //, clock, smId, nSMs
+
+    // Registers to store the results back to global memory. Reuse any register not needed after the main loop.
+    // Statically allocate cs0-7 because they're vector registers.
+    64-71   : cs<0-7>
+
+    // dynamically allocated C output registers(~)
+    72-103  ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc60, writeCs, readCs, cx, ci, alpha, xmad_ci //, xmad_D, D, blckDimX, gridDimX
+
+</REGISTER_MAPPING>
+
+// Note the absence of the loading of the stack pointer into R1.
+// No idea why ptxas does that anyway when it's not used for register spilling.
+// Such a waste of a perfectly good register.
+
+// Scheduler doesn't handle the dependency flags yet,
+// so move these first instructions outside the block that's auto scheduled
+//--:-:-:-:1      CS2R clock, SR_CLOCKLO;
+//--:-:-:-:1      S2R smId, SR_VIRTID;
+//--:-:-:-:1      S2R nSMs, SR_VIRTCFG;
+--:-:1:-:1      S2R tid, SR_TID.X;   // Set Dep 1
+--:-:2:-:1      S2R bx,  SR_CTAID.X; // Set Dep 2
+--:-:3:-:1      S2R by,  SR_CTAID.Y; // Set Dep 3
+
+// Instructions in a SCHEDULE_BLOCK are automatically reordered and appropriately stalled for simple dependencies
+// Memory dependencies are left up to the author to deal with manually for now.
+<SCHEDULE_BLOCK>
+
+// First 128 threads load A to shared, 2nd 128 loads B to shared
+// Note this technique is not possible in cuda or ptx as there's no way to
+// efficiently specify a warp-uniform predicate for a memory op.
+// Compile sgemm.cu and inspect the sass to see what I'm talking about.
+
+// blk = tid >= 128 ? by   : bx;
+// ldx = tid >= 128 ? ldb  : lda;
+// tex = tid >= 128 ? texB : texA;
+01:-:-:Y:1      ISETP.GE.AND P0, PT, tid, 128, PT; // Wait Dep 1
+06:-:-:-:1      SEL blk, by, bx, P0;               // Wait Dep 2 & 3
+--:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150];
+--:-:-:-:1  @P0 MOV ldx4, c[0x0][0x154];
+--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA
+--:-:-:-:1  @P0 MOV32I tex, 0x80000000; // texB
+
+// Initialize the portion of shared we use to zero our C registers
+// Give each warp its own address to write to.
+// All threads write to the same address, but we don't care because only one needs to take.
+// There is no bank conflict on writing to the same address, just indeterminacy in which thread will get its value stored.
+--:-:-:-:1      LOP.AND zOffset, tid, -32;
+--:-:-:-:1      STS.128 [zOffset + 4x<16*128>], RZ;
+
+// tid4   = (tid >> 5) & 3
+// tid31  = tid & 31
+// tid96  = tid & 96
+// tid128 = tid & 128
+--:-:-:-:1      BFE.U32 tid4,   tid, 0x205; // 2 bits at position 5
+--:-:-:-:1      LOP.AND tid31,  tid, 31;
+--:-:-:-:1      LOP.AND tid96,  tid, 96;
+--:-:-:-:1      LOP.AND tid128, tid, 128;
+
+// ldx4  = ldx * 4;
+// ldx8  = ldx * 8;
+--:-:-:-:1      SHR.U32 ldx, ldx4, 2;
+--:-:-:-:1      IADD ldx8, ldx4, ldx4;
+
+// track0 = blk*128/4 + tid31 + (ldx * tid4)
+--:-:-:-:1      ISCADD  track0, blk, tid31, 5;
+--:-:-:-:1      XMAD.LO track0, ldx, tid4,  track0, xmad_t0; // XMAD.LO is a macro that is expanded out into the 3 XMADs
+--:-:-:-:1      IADD track4, track0, ldx4;
+
+// writeS  = tid31*4*4 + tid4*128*4
+// writeS += 4096 if tid >= 128
+--:-:-:-:1      SHL    tid31_4, tid31, 4;
+--:-:-:-:1      ISCADD writeS, tid4, tid31_4, 9;
+--:-:-:-:1  @P0 IADD   writeS, writeS, 4x<8*128>;
+
+// int end = track0 + (k-8)*ldx;
+--:-:-:-:1      MOV k, c[0x0][0x14c];
+--:-:-:-:1      IADD k, k, -8;
+--:-:-:-:1      XMAD.LO end, k, ldx, track0, xmad_end;
+
+// readAs and readBs are carefully constructed to avoid any bank conflicts while loading from shared
+// readAs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readAs, tid128, 4;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid7;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4 + 4096;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readBs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<8*128>, 4;
+
+// Preload the first 8 lines from texture memory
+// Keep these instructions in this order (but allow others to interleave).
+// Normally the scheduler tries to preserve source order by default, but this demonstrates how you enforce
+// an ordering if you need to.
+// Note: these are the 4 element vector load versions (last param: 0xf=vec4, 0x3=vec2, 0x1=single)
+<ORDERED>
+--:-:1:-:1      TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1
+--:-:2:-:1      TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 2
+</ORDERED>
+
+</SCHEDULE_BLOCK>
+
+// Initialize C registers to zero
+// Using LDS.U.128 is a neat trick to save a few clock cycles
+// (when you have enough warps to hide the latency.)
+<CODE>
+    # Emit 16 vector loads, one per group of four cz registers (cz00, cz04, ... cz60),
+    # all reading from the same shared memory address at zOffset.
+    my $czLoads = '';
+    foreach my $quad (0 .. 15)
+    {
+        $czLoads .= sprintf("--:-:3:-:1      LDS.U.128 cz%02d, [zOffset + 4x<16*128>];\n", $quad * 4);
+    }
+    return $czLoads;
+</CODE>
+
+// These instructions need to occur after the textures load so put them in a new block
+// that starts with a dependency barrier wait.
+<SCHEDULE_BLOCK>
+
+01:-:-:-:1      STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 1
+02:-:-:-:1      STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 2
+
+// Increment tracks after the loads are complete to avoid needing write-after-read dependencies
+--:-:-:-:1      IADD track0, track0, ldx8;
+--:-:-:-:1      IADD track4, track4, ldx8;
+
+// Wait for all threads to finish loading shared
+04:-:-:-:5      BAR.SYNC 0;
+
+</SCHEDULE_BLOCK>
+
+// The next store to shared goes to high area.
+// Having 2 share buffers allows us to eliminate a bar.sync in the main loop.
+// This way we don't have to wait for all threads to arrive before writing fresh data to shared.
+// Other threads can continue reading from the last batch while the new data is being written.
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<16*128>;
+
+// Preload the first lines of A and B from shared
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1
+
+
+// The main loop
+// While calculating the first line, load in the next line from shared.
+// Shared memory stores enough to do this 8 times per loop.
+// Also pull in the next block of memory from global and store it to shared.
+
+// Efficiency:
+// ffma: 512
+// lds:  32 dual issued
+// sts:  2  dual issued
+// tex:  2  dual issued
+// add:  2
+// xor:  3
+// setp: 1
+// bar:  1  dual issued
+// bra:  1  dual issued
+// Total: 524 (512/518 = 98.8% FFMA)
+
+// Memory Throughput Upper Bound:
+// 2 * 4 * 4 bytes per thread per 518 clocks
+// 128 threads per SM
+// 16 SM's (GM204)
+// 1640Mhz (boost overclock)
+// .931 GiB/GB  (1000^3 / 1024^3)
+// 193 GiB/sec
+// Available: 224 GiB/sec (or 256 GiB/sec overclocked at 8GHz)
+
+LOOP:
+
+// Loop end condition
+--:-:-:-:1      ISETP.LE.AND P0, PT, track0, end, PT;
+
+<CODE>
+
+    # We eliminated bank conflicts with our C registers and the blocking registers,
+    # but there are still 16 bank conflicts between the blocking registers themselves.
+    # By ordering the FFMA's in a swirling zigzag pattern we can completely hide those conflicts
+    # behind register reuse.  This pattern also maximizes that reuse (47%) and minimizes the bandwidth
+    # out of the register bank, thereby reducing power consumption and allowing the chip to
+    # stay at a higher sustained clock speed.  One other constraint is that we want each successive
+    # instruction to pull its third operand from alternating banks.  We space the swirl by 2 in the x
+    # direction to achieve this.  This has the effect of making it easier to avoid delayed bank conflicts
+    # with the memory operations.  Finally, for the very first ffma, don't choose one of the 16 bank conflicts
+    # as we have no way of hiding that conflict behind a reuse (cublas makes this mistake).
+
+    # Alternating banks (1320 Hz, full speed)
+    my @swirl = ([2,0],[2,1],[0,1],[0,0]);
+    my @xVals = (0,1,64,65);
+
+    # Repeating banks (1320Hz, 83 Gflops slower, but lower power draw probably becuase of increased stalls)
+    # Only explanation I can think of is increased delayed register bank conflicts with memory ops.
+    #my @swirl = ([0,1],[0,0],[1,0],[1,1]);
+    #my @xVals = (0,2,64,66);
+
+    my @cOrder;
+    foreach my $y (0,2,64,66)
+    {
+        # apply the swirl
+        foreach my $x (@xVals)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        # apply the zigzag
+        @xVals = reverse @xVals;
+    }
+
+    # This ordering (a simple zigzag) eliminates the bank conflicts but only achieves 39% reuse.
+    # It runs 20 GFlops slower since the register bank draws more power and the clock slows down to 1306 Hz.
+    # There may be more delayed bank conflicts with memory operations as the slowdown is 4 Glops more than
+    # the reduced clock accounts for.
+    #my @cOrder2;
+    #my @xVals = (0..3,64..67);
+    #foreach my $y (0..3,64..67)
+    #{
+    #    @xVals = reverse @xVals;
+    #    push @cOrder2, [$_, $y] foreach @xVals;
+    #}
+    #@cOrder = @cOrder2;
+
+    my %insert =
+    (
+        # Don't start the first TLD before 12 to let ISETP to write P0
+        # These global reads and shared writes we put exactly in the middle of the LDS ops
+        # This is to not overwhelm the memory units with instructions (and because these were tested faster here).
+        # The 4 spacing seems to work best for vec4 instructions.
+        # It's odd that these two textures loads can drive 512 FFMA's all by themselves.. but 256 threads can load 8 128 F32 wide lines.
+        # So we only need 2 to get 8 lines from both matrices.
+
+        j0c31 => "--:-:2:-:1  \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 2\n",
+        j0c33 => "--:-:3:-:1  \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3\n",
+
+        j6c30 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 2\n",
+        j6c34 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 3\n",
+
+        # We need one barrier in the main loop after writing shared memory.
+        # The barrier is needed even if this is our last loop because we need to protect the warp shuffle step.
+        # Note, BAR.SYNCs do not sync memory read access automatically, you still need to flag the barriers (writes are sync'd).
+        # After the BAR, swap our share buffer location.  We don't need an additional barrier because of these swaps.
+        # Note, this doubles our shared memory usage but this kernel's occupancy is entirely bound by registers.
+        # LOP.XOR readAs needs to be 4 clocks prior to the LDS.U.128 for readAs (but push this as far down as possible)
+        j6c62 =>
+                "01:-:-:-:5      BAR.SYNC 0;                            // Wait Dep 1\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<16*128>;\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<16*128>;\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<16*128>;\n",
+
+        # Note having 2 IADDs slightly hits our FFMA performance (1/518 = .2%), but TLD doesn't take an offset.
+        # LDG.CI doesn't have this issue, but doesn't give you the nice features of texture loads:
+        #   -Boundry Clamping:  simplifies our matrix load logic so we don't need to worry about loading out of bounds
+        #   -Normalized Floats: if we don't need full 32 bits of precision we could store our matrices using 16 or 8 bit values
+        j7c63 =>
+                "--:-:-:-:1  \@P0 IADD track0, track0, ldx8;\n" .
+                "--:-:-:-:0  \@P0 IADD track4, track4, ldx8;\n" .
+                "--:-:-:Y:5  \@P0 BRA LOOP;\n",
+    );
+
+    my $out;
+    # We unroll our main loop 8 iterations.
+    # This gives us a loop instruction count of 556.  Add the control instructions and that makes it 741 opcodes sized 8 bytes.
+    # This is 5928 bytes, nicely fitting inside the 8kb instruction cache.  Going to the next biggest size would be 12 lines.
+    # That would be 768 ffmas and not leaving enough room for the other instructions and control codes.
+    # So by staying inside the instruction cache size, we avoid hitting any instruction fetch latencies.
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        # Our rolling blocking registers stay one load ahead off the FFMA's (rs: read share)
+        my $rsOffset = ($j + 1) % 8;
+        # No need to load on last loop iteration
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        # You can experiment here with different vector load sizes
+        my $vec = 128;
+
+        if ($vec == 128)
+        {
+            # Roll up our LDS ops here to keep them easier to manage and tune
+            # Space at every other clock to maximize throughput.
+            $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBy64, [readBs + 4x<%d*128 + 64>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
+        }
+        elsif ($vec == 64)
+        {
+            # LDS.64 runs about 22 Gflops slower than LDS.128 (GM107).  Not a huge difference since our latencies are so well hidden.
+            # I think LDS.128 is implemented internally as a pair of LDS.64 ops which could be another reason for the comparable performance.
+            # I think the big benefit with 128 is being able to issue all our LDS ops earlier, allowing more FFMA's prior to reading out the results.
+            # There could also be additional opportunity for delayed bank conflicts.
+            $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c6"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c8"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c10"} = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c12"} = sprintf "--:-:-:-:1  %s LDS.U.64 j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c14"} = sprintf "--:-:1:-:1  %s LDS.U.64 j%dBy66, [readBs + 4x<%d*128 + 66>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
+        }
+        else
+        {
+            # This one drops performance by over 200 Gflops.  So you want to at least use LDS.64 if you can.
+            # We don't even have room to properly space these at half throuput.
+            $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c1"}  = sprintf "--:-:-:-:1  %s LDS j%dAx01, [readAs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c3"}  = sprintf "--:-:-:-:1  %s LDS j%dAx03, [readAs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c5"}  = sprintf "--:-:-:-:1  %s LDS j%dBy01, [readBs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c6"}  = sprintf "--:-:-:-:1  %s LDS j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c7"}  = sprintf "--:-:-:-:1  %s LDS j%dBy03, [readBs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c8"}  = sprintf "--:-:-:-:1  %s LDS j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c9"}  = sprintf "--:-:-:-:1  %s LDS j%dAx65, [readAs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c10"} = sprintf "--:-:-:-:1  %s LDS j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c11"} = sprintf "--:-:-:-:1  %s LDS j%dAx67, [readAs + 4x<%d*128 + 67>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c12"} = sprintf "--:-:-:-:1  %s LDS j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c13"} = sprintf "--:-:-:-:1  %s LDS j%dBy65, [readBs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c14"} = sprintf "--:-:-:-:1  %s LDS j%dBy66, [readBs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c15"} = sprintf "--:-:1:-:1  %s LDS j%dBy67, [readBs + 4x<%d*128 + 67>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
+        }
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            # Grab an instruction for insertion if one exists for this j and c combination
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            # Scatter some yields in there to better balance the workload and reduce sync stalls
+            # Don't pair a yield with the dual issued ffmas as that kills performance for some reason
+            ##### This no longer offers extra performance on GM204 as it did on GM107.  It still does for the 64 thread version. Keeping since it doesn't hurt. ####
+            my $yield  = $c == 32 ? 'Y' : '-';
+
+            # The first FFMA needs to wait on the prior loop's LDS.U.128 ops to finish (except if the barrier does the wait for us)
+            my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--','');
+
+            # Dual issue these ops
+            my $stall  = $ins =~ /LDS|TLD|STS|BAR/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            # output our FFMA and also any inserted ops
+            $out .= sprintf "%s      FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $comment,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+// Main loop is done, time to write C to global memory.
+<SCHEDULE_BLOCK>
+
+// Remove the high bits if present from the last loop's xor.
+// Also remove the 4096 added onto readBs.
+// This gives us the x and y coordinates of the start of this thread's data in C.
+--:-:-:-:1      LOP.AND readAs, readAs, 0xfff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0xfff;
+
+// Remap readAs and readBs onto writeCs so we can shuffle the output for coalesced global writes.
+// readAs stays constant, readBs collapses down from stride 4 to 1
+// writeCs = (readBs / 4) * 128 + readAs;
+--:-:-:-:1      ISCADD  writeCs, readBs, readAs, 5;
+
+// Read out the C values from shared in a simple tid mapped pattern but
+// offset by the position of this warp's collapsed data in shared.
+
+// cx = tid31 | (tid128 >> 2);
+--:-:-:-:1      SHR.U32  cx, tid128, 2;
+--:-:-:-:1      LOP.OR   cx, tid31,  cx;
+
+// readCs = ((tid96 << 4) | cx) << 2;
+--:-:-:-:1      SHL      readCs, tid96,  4;
+--:-:-:-:1      LOP.OR   readCs, readCs, cx;
+--:-:-:-:1      SHL      readCs, readCs, 2;
+
+// cx += bx*128;
+--:-:-:-:1      ISCADD  cx, bx, cx, 7;
+
+// cy = by*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+--:-:-:-:1      ISCADD  cy00, by, cy00, 7;
+
+// C += (cy*ldc + cx) * 4;
+--:-:-:-:1      MOV ldc, c[0x0][0x158];
+--:-:-:-:1      XMAD.LO ci, cy00, ldc, cx, xmad_ci;
+--:-:-:-:1      ISCADD  Cy00, ci, c[0x0][0x140], 2;
+
+// When writing in assembly, being able to 'printf' is sometimes easier than stepping through the debugger.
+// Here's how it's done.  Drop something like this in your code. Then modify the c code to accept this
+// many params per thread to printf (see assemblySgemm function).
+
+//--:-:-:-:1      SHR.U32  smId, smId, 20;
+
+// D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4
+// D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4
+//--:-:-:-:1      MOV gridDimX, c[0x0][0x14];
+//--:-:-:-:1      MOV blckDimX, c[0x0][0x8];
+//--:-:-:-:1      XMAD.LO D, by, gridDimX, bx, xmad_D;
+//--:-:-:-:1      XMAD.LO D, D, blckDimX, tid, xmad_D;
+//--:-:-:-:1      ISCADD D, D, c[0x0][0x160], 3; // 4 bytes * 2 vars = 8 or shift 3
+
+//--:-:-:-:1      STG.CS [D + 4x<0>], readAs;
+//--:-:-:-:1      STG.CS [D + 4x<1>], readBs;
+//--:-:-:-:1      STG.CS [D + 4x<2>], writeCs;
+//--:-:-:-:1      STG.CS [D + 4x<3>], readCs;
+//--:-:-:-:1      STG.CS [D + 4x<4>], cx;
+//--:-:-:-:1      STG.CS [D + 4x<5>], cy00;
+//--:-:-:-:1      STG.CS [D + 4x<6>], ci;
+//--:-:-:-:1      STG.CS [D + 4x<7>], cx67y67;
+
+//--:-:-:-:1      STG.CS [D + 4x<0>], smId;
+//--:-:-:-:1      STG.CS [D + 4x<1>], clock;
+
+
+// Setup our matrix bounds checking vars and preds
+// Bounds checking is what allows this code to work on matrix sizes not a multiple of 128
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx +  0 < m
+--:-:-:-:1      IADD cx, cx, 64;
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 64 < m
+
+--:-:-:-:1      IADD cy00, cy00, -1;
+--:-:-:-:1      IADD cy04, cy00,  4;
+--:-:-:-:1      IADD cy08, cy00,  8;
+--:-:-:-:1      IADD cy12, cy00,  12;
+
+// Setup our C output addresses and increments.
+--:-:-:-:1      SHL  ldc1,  ldc, 2;
+--:-:-:-:1      SHL  ldc4,  ldc, 4;
+--:-:-:-:1      SHL  ldc8,  ldc, 5;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
+
+// Load the first set of the STORE_C subroutine params in the scheduled block.
+# This is also a good time to apply alpha.
+--:-:-:-:1      MOV alpha, c[0x0][0x15c];
+
+--:-:-:-:1      FMUL cs0, cx00y00, alpha;
+--:-:-:-:1      FMUL cs1, cx01y00, alpha;
+--:-:-:-:1      FMUL cs2, cx02y00, alpha;
+--:-:-:-:1      FMUL cs3, cx03y00, alpha;
+--:-:-:-:1      FMUL cs4, cx64y00, alpha;
+--:-:-:-:1      FMUL cs5, cx65y00, alpha;
+--:-:-:-:1      FMUL cs6, cx66y00, alpha;
+--:-:-:-:1      FMUL cs7, cx67y00, alpha;
+
+// We pre-increment the output addresses so they can be dual issued with memory ops
+// So start with a -1 instead of 0 value.
+--:-:-:-:1      IADD Cy00, Cy00, -ldc1;
+--:-:-:-:1      IADD Cy04, Cy00, ldc4;
+--:-:-:-:1      IADD Cy08, Cy00, ldc8;
+--:-:-:-:0      IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering)
+
+</SCHEDULE_BLOCK>
+
+// There's nothing yet in place to handle dependencies with subroutines.
+// So don't schedule this block.
+<CODE>
+
+    my $out;
+    foreach my $y (0..3, 64..67)
+    {
+        my ($wait, $comment) = $y == 64 ? ('--', '') : ('02',' // Wait Dep 2'); # Dep 2 is set by STORE_C's last STG; at y == 64 the Cy00 IADD below already waits on it
+
+        # Jump ahead 60 units (to get to the values at y=64); the four STORE_C calls so far advanced by 1 each.
+        $out .=
+            "--:-:-:-:1      IADD cy00, cy00, 60;\n" .
+            "--:-:-:-:1      IADD cy04, cy04, 60;\n" .
+            "--:-:-:-:1      IADD cy08, cy08, 60;\n" .
+            "--:-:-:-:1      IADD cy12, cy12, 60;\n\n" .
+
+            "02:-:-:-:1      IADD Cy00, Cy00, ldc60; // Wait Dep 2\n" .
+            "--:-:-:-:1      IADD Cy04, Cy04, ldc60;\n" .
+            "--:-:-:-:1      IADD Cy08, Cy08, ldc60;\n" .
+            "--:-:-:-:1      IADD Cy12, Cy12, ldc60;\n\n"  if $y == 64;
+
+        # We need to move the C values to the param registers (cs0-cs7) of the STORE_C subroutine.
+        # This is also a good time to apply alpha.  y == 0 is skipped: the schedule block above already filled cs0-cs7.
+        $out .= sprintf(
+            "%s:-:-:-:1      FMUL cs0, cx00y%02d, alpha;%s\n" .
+            "--:-:-:-:1      FMUL cs1, cx01y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs2, cx02y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs3, cx03y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs4, cx64y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs5, cx65y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs6, cx66y%02d, alpha;\n" .
+            "--:-:-:-:0      FMUL cs7, cx67y%02d, alpha; // Dual Issue\n",
+            $wait, $y, $comment, ($y) x 7) if $y;
+
+        # Call the subroutine once per y value (8 calls in total).
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+// And we're done.  The remainder is the STORE_C subroutine that's defined at the end of the kernel.
+--:-:-:-:5      EXIT;
+
+// This routine does warp synchronous shuffling of our output data so as to be able
+// to have coalesced writes to global memory.  This is actually faster because the shared
+// memory latencies can be hidden by other warps and we're only adding a few extra clocks
+// to this thread.  Global memory here is the bottleneck and being able to halve the needed
+// bandwidth at the expense of a few clocks is a modest win.  This also keeps power lower
+// and our chip running faster.
+
+// Note, the SHFL instruction doesn't help us here because we're swapping different registers
+// from different threads.
+STORE_C:
+
+<SCHEDULE_BLOCK>
+
+// Each warp writes to its own region of memory so we don't need to bar.sync the access.
+// There are some bank conflicts here on the STS.128s but no way to avoid them, and the hit just means a few extra clocks.
+// Note here that the scheduler is able to handle the dependencies between vector and non-vector instructions.
+// It knows from the instruction type and the register map that cs0 here includes cs1, cs2 and cs3 as well.
+--:-:-:-:1      STS.128 [writeCs+4x<00>], cs0;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], cs4;
+
+// In a single warp, loads naturally occur after the store to shared completes, no sync required.
+--:-:-:-:1      LDS cs0, [readCs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS cs1, [readCs + 4x<0*128 + 64>];
+--:-:-:-:1      LDS cs2, [readCs + 4x<1*128 + 00>];
+--:-:-:-:1      LDS cs3, [readCs + 4x<1*128 + 64>];
+--:-:-:-:1      LDS cs4, [readCs + 4x<2*128 + 00>];
+--:-:-:-:1      LDS cs5, [readCs + 4x<2*128 + 64>];
+--:-:-:-:1      LDS cs6, [readCs + 4x<3*128 + 00>];
+--:-:1:-:1      LDS cs7, [readCs + 4x<3*128 + 64>]; // Set Dep 1
+
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:1      IADD cy12, cy12, 1;
+
+--:-:-:-:1      IADD Cy00, Cy00, ldc1;
+--:-:-:-:1      IADD Cy04, Cy04, ldc1;
+--:-:-:-:1      IADD Cy08, Cy08, ldc1;
+--:-:-:-:1      IADD Cy12, Cy12, ldc1;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 64 < m
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 64 < m
+
+01:-:-:-:1  @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1
+--:-:-:-:1  @P1 STG.CG [Cy00 + 4x<64>], cs1;
+--:-:-:-:1  @P2 STG.CG [Cy04 + 4x<00>], cs2;
+--:-:-:-:1  @P3 STG.CG [Cy04 + 4x<64>], cs3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 64 < m
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 64 < m
+
+--:-:-:-:1  @P0 STG.CG [Cy08 + 4x<00>], cs4;
+--:-:-:-:1  @P1 STG.CG [Cy08 + 4x<64>], cs5;
+--:-:-:-:1  @P2 STG.CG [Cy12 + 4x<00>], cs6;
+--:2:-:-:1  @P3 STG.CG [Cy12 + 4x<64>], cs7; // Set Dep 2
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
diff --git a/Assembler/MaxAs/sgemm/sgemm64.sass b/Assembler/MaxAs/sgemm/sgemm64.sass
new file mode 100644
index 0000000..f037b3e
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm64.sass
@@ -0,0 +1,398 @@
+# Kernel: sgemm_kernel_64
+#
+# SharedSize: 8192
+# Params(9):
+#   0:0x140:4:4 param_C,
+#   1:0x144:4:0 param_m,
+#   2:0x148:4:0 param_n,
+#   3:0x14c:4:0 param_k,
+#   4:0x150:4:0 param_lda,
+#   5:0x154:4:0 param_ldb,
+#   6:0x158:4:0 param_ldc
+#   7:0x15c:4:0 param_alpha
+#   8:0x160:4:4 param_D // for diagnostic printf output
+#
+# Globals:
+#   c[0x0][0x164]: texA (the value is 1)
+#   c[0x0][0x168]: texB (the value is 0)
+
+<REGISTER_MAPPING>
+
+    0-63    ~ blk, ldx, ldx4, k, tid1, tid2, tid15, tid15_4, xmad_t0, xmad_end
+
+    80      : zOffset
+    0-63    : cz<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx00y<00-03|32-35>
+     7, 6,15,14,23,22,31,30 : cx01y<00-03|32-35>
+     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|32-35>
+     5, 4,13,12,21,20,29,28 : cx03y<00-03|32-35>
+    35,34,43,42,51,50,59,58 : cx32y<00-03|32-35>
+    39,38,47,46,55,54,63,62 : cx33y<00-03|32-35>
+    33,32,41,40,49,48,57,56 : cx34y<00-03|32-35>
+    37,36,45,44,53,52,61,60 : cx35y<00-03|32-35>
+
+    64-79   : j0Ax<00-03|32-35>, j0By<00-03|32-35>
+    80-95   : j1Ax<00-03|32-35>, j1By<00-03|32-35>
+
+    64-71   : cs<0-7>
+
+    96-111  : loadX0<0-3>, loadX2<0-3>, loadX4<0-3>, loadX6<0-3>
+
+    112-127 ~ track<0|2|4|6>[0], tex[1], readAs[2], readBs[3], writeS[2], end, ldx8, tid, bx, by, tid31, tid32
+
+    72-111  ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc28, writeCs, readCs, cx, ci, xmad_ci, alpha, xmadD, D, blckDimX, gridDimX
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid, SR_TID.X;   // Set Dep 1
+--:-:2:-:1      S2R bx,  SR_CTAID.X; // Set Dep 2
+--:-:3:-:1      S2R by,  SR_CTAID.Y; // Set Dep 3
+
+<SCHEDULE_BLOCK>
+
+// blk = tid >= 32 ? by   : bx;
+// ldx = tid >= 32 ? ldb  : lda;
+// tex = tid >= 32 ? texB : texA;
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT; // Wait Dep 1
+06:-:-:-:1      SEL blk, by, bx, P0;              // Wait Dep 2 & 3
+--:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150];
+--:-:-:-:1  @P0 MOV ldx4, c[0x0][0x154];
+--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA
+--:-:-:-:1  @P0 MOV32I tex, 0x80000000; // texB
+
+--:-:-:-:1      LOP.AND zOffset, tid, -32;
+--:-:-:-:1      STS.128 [zOffset + 4x<16*64>], RZ;
+
+// tid2   = (tid >> 4) & 1
+// tid15  = tid & 15
+// tid31 = tid & 31
+// tid32 = tid & 32
+--:-:-:-:1      BFE.U32 tid2,  tid, 0x104; // 1 bit at position 4
+--:-:-:-:1      LOP.AND tid15, tid, 15;
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      LOP.AND tid32, tid, 32;
+
+// ldx4  = ldx * 4;
+// ldx8  = ldx * 8;
+--:-:-:-:1      SHR.U32 ldx, ldx4, 2;
+--:-:-:-:1      IADD ldx8, ldx4, ldx4;
+
+// track0 = blk*64/4 + tid15 + (ldx * tid2)
+--:-:-:-:1      ISCADD  track0, blk, tid15, 4;
+--:-:-:-:1      XMAD.LO track0, ldx, tid2,  track0, xmad_t0;
+--:-:-:-:1      IADD3 track2, track0, ldx, ldx;
+--:-:-:-:1      IADD track4, track0, ldx4;
+--:-:-:-:1      IADD track6, track2, ldx4;
+
+// writeS = tid15*4*4 + tid2*64*4
+--:-:-:-:1      SHL    tid15_4, tid15, 4;
+--:-:-:-:1      ISCADD writeS, tid2, tid15_4, 8;
+
+// writeS += 2048 if tid >= 32
+--:-:-:-:1  @P0 IADD   writeS, writeS, 4x<8*64>;
+
+// int end = track0 + (k-8)*ldx;
+--:-:-:-:1      MOV k, c[0x0][0x14c];
+--:-:-:-:1      IADD k, k, -8;
+--:-:-:-:1      XMAD.LO end, k, ldx, track0, xmad_end;
+
+// readAs = ((tid >> 1) & 7) << 4;
+--:-:-:-:1      BFE.U32 readAs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 2048;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readBs, tid,    0x30;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<8*64>, 4;
+
+<ORDERED>
+--:-:1:-:1      TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1
+--:-:2:-:1      TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2
+--:-:3:-:1      TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3
+--:-:4:-:1      TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 4
+</ORDERED>
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+    return join '', map sprintf("--:-:5:-:1      LDS.U.128 cz%02d, [zOffset + 4x<16*64>];\n", $_ * 4), 0..15; # 16 loads (cz00, cz04 .. cz60) from the address the earlier STS.128 filled with RZ; each sets Dep 5
+</CODE>
+
+<SCHEDULE_BLOCK>
+
+01:-:-:-:1      STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 1
+02:-:-:-:1      STS.128 [writeS + 4x<2*64>], loadX2; // Wait Dep 2
+04:-:-:-:1      STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3
+08:-:-:-:1      STS.128 [writeS + 4x<6*64>], loadX6; // Wait Dep 4
+
+--:-:-:-:1      IADD track0, track0, ldx8; // advance each track index by ldx8 (ldx * 8)
+--:-:-:-:1      IADD track2, track2, ldx8;
+--:-:-:-:1      IADD track4, track4, ldx8;
+--:-:-:-:1      IADD track6, track6, ldx8;
+
+10:-:-:-:5      BAR.SYNC 0; // Wait Dep 5 (set by the cz LDS.U.128 loads emitted before this block)
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<16*64>; // flip the 4x<16*64> bit of writeS; stall 0 dual issues with the next op
+
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>]; // row 0 operands for the j0 register set, read by the loop's first FFMAs
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1
+
+// Efficiency:
+// ffma: 512
+// lds:  32 dual issued
+// sts:  4  dual issued
+// tex:  4  dual issued
+// add:  4
+// xor:  3
+// setp: 1
+// bar:  1  dual issued
+// bra:  1  dual issued
+// Total: 520 (512/520 = 98.5% FFMA)
+
+LOOP:
+
+// Loop end condition
+--:-:-:-:1      ISETP.LE.AND P0, PT, track0, end, PT;
+
+<CODE>
+
+    my @cOrder; # [x,y] accumulator coordinates, one per FFMA, in emission order (4 y * 4 x * 4 swirl = 64 entries)
+    my @swirl = ([2,0],[2,1],[0,1],[0,0]); # [dx,dy] offsets visited around each base (x,y)
+    my @x = (0,1,32,33);
+    foreach my $y (0,2,32,34)
+    {
+        foreach my $x (@x)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @x = reverse @x; # walk x in the opposite direction for the next y
+    }
+
+    my %insert = # extra instructions keyed "j<J>c<C>"; each is emitted right after FFMA number C of unroll step J
+    (
+        j0c31 => "--:-:-:-:1  \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf;\n", # texture loads into loadX0/loadX2, only when P0 (loop continues)
+        j0c33 => "--:-:2:-:1  \@P0 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2\n",
+
+        j1c31 => "--:-:-:-:1  \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf;\n", # same for loadX4/loadX6, tracked by Dep 3
+        j1c33 => "--:-:3:-:1  \@P0 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 3\n",
+
+        j5c30 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 2\n", # store loadX0/loadX2 to shared once Dep 2 clears
+        j5c34 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<2*64>], loadX2;\n",
+
+        j6c30 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3\n", # store loadX4/loadX6 to shared once Dep 3 clears
+        j6c34 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*64>], loadX6;\n",
+
+        j6c62 => # barrier (also waits Dep 1), then flip the 4x<16*64> bit of all three shared-memory pointers when P0
+                "01:-:-:-:5      BAR.SYNC 0;                            // Wait Dep 1\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<16*64>;\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<16*64>;\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<16*64>;\n",
+
+        j7c63 => # after the very last FFMA: advance the track indexes by ldx8 and branch back to LOOP while P0 holds
+                "--:-:-:-:1  \@P0 IADD track0, track0, ldx8;\n" .
+                "--:-:-:-:1  \@P0 IADD track2, track2, ldx8;\n" .
+                "--:-:-:-:1  \@P0 IADD track4, track4, ldx8;\n" .
+                "--:-:-:-:0  \@P0 IADD track6, track6, ldx8;\n" .
+                "--:-:-:Y:5  \@P0 BRA LOOP;\n",
+    );
+
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1; # register set (j0 or j1) read by this step's FFMAs
+        my $nOdd     = !$odd + 0; # the other set, filled by this step's LDS for the next step
+        my $rsOffset = ($j + 1) % 8; # shared-memory row (in units of 64) loaded for the next step; wraps to 0 after j == 7
+        my $rsPred   = $j == 7 ? '@P0' : '   '; # in the last step the loads for the next pass only run when P0 is set
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx00, [readAs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBy00, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx32, [readAs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBy32, [readBs + 4x<%d*64 + 32>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || ''; # instruction(s) to place after this FFMA, if any exist for this j and c
+
+            my $yield  = $c == 32 ? 'Y' : '-'; # yield flag only on FFMA number 32
+
+            my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--',''); # first FFMA waits for the previous step's LDS; for j == 7 the BAR.SYNC at j6c62 already waited on Dep 1
+
+            my $stall  = $ins =~ /LDS|TLD|STS|BAR/ ? 0 : 1; # stall 0 dual issues the FFMA with an inserted memory or barrier op
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $comment,  $ins; # the FFMA line followed by any inserted ops
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      LOP.AND readAs, readAs, 0x7ff; // drop the 4x<16*64> bit that the loop's xor may have left set
+--:-:-:-:1      LOP.AND readBs, readBs, 0x7ff; // same, and also drop the 4x<8*64> (2048) added when readBs was computed
+
+// writeCs = (readBs / 4) * 64 + readAs;
+--:-:-:-:1      ISCADD  writeCs, readBs, readAs, 4;
+
+// readCs = ((tid32 << 3) + tid31) << 2;
+--:-:-:-:1      ISCADD  readCs, tid32,  tid31, 3;
+--:-:-:-:1      SHL     readCs, readCs, 2;
+
+// cx = bx*64 + tid31;
+--:-:-:-:1      ISCADD  cx, bx, tid31, 6;
+
+// cy = by*64 + (tid32 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid32, 1;
+--:-:-:-:1      ISCADD  cy00, by, cy00, 6;
+
+// C += (cy*ldc + cx) * 4;
+--:-:-:-:1      MOV ldc, c[0x0][0x158];
+--:-:-:-:1      XMAD.LO ci, cy00, ldc, cx, xmad_ci;
+--:-:-:-:1      ISCADD  Cy00, ci, c[0x0][0x140], 2;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx +  0 < m
+--:-:-:-:1      IADD cx, cx, 32;
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 32 < m
+
+// D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4
+// D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4
+//--:-:-:-:1      MOV gridDimX, c[0x0][0x14];
+//--:-:-:-:1      MOV blckDimX, c[0x0][0x8];
+//--:-:-:-:1      XMAD.LO D, by, gridDimX, bx, xmadD;
+//--:-:-:-:1      XMAD.LO D, D, blckDimX, tid, xmadD;
+//--:-:-:-:1      ISCADD D, D, c[0x0][0x160], 5; // 4 bytes * 8 vars = 32 or shift 5
+
+//--:-:-:-:1      STG.CS [D + 4x<0>], readAs;
+//--:-:-:-:1      STG.CS [D + 4x<1>], readBs;
+//--:-:-:-:1      STG.CS [D + 4x<2>], writeCs;
+//--:-:-:-:1      STG.CS [D + 4x<3>], readCs;
+//--:-:-:-:1      STG.CS [D + 4x<4>], cx;
+//--:-:-:-:1      STG.CS [D + 4x<5>], cy00;
+//--:-:-:-:1      STG.CS [D + 4x<6>], ci;
+//--:-:-:-:1      STG.CS [D + 4x<7>], cx35y35;
+
+--:-:-:-:1      IADD cy00, cy00, -1; // start one below; STORE_C adds 1 to cy00-cy12 before testing them
+--:-:-:-:1      IADD cy04, cy00,  4;
+--:-:-:-:1      IADD cy08, cy00,  8;
+--:-:-:-:1      IADD cy12, cy00,  12;
+
+--:-:-:-:1      SHL  ldc1,  ldc, 2; // ldc1 = ldc * 4  (1 row of C, in bytes)
+--:-:-:-:1      SHL  ldc4,  ldc, 4; // ldc4 = ldc * 16 (4 rows)
+--:-:-:-:1      SHL  ldc8,  ldc, 5; // ldc8 = ldc * 32 (8 rows)
+--:-:-:-:1      ISCADD ldc28, ldc, -ldc4, 7; // ldc28 = ldc * 128 - ldc4 = ldc * 112 (28 rows)
+
+--:-:-:-:1      MOV alpha, c[0x0][0x15c];
+--:-:-:-:1      FMUL cs0, cx00y00, alpha; // first set of STORE_C inputs (y = 00), scaled by alpha
+--:-:-:-:1      FMUL cs1, cx01y00, alpha;
+--:-:-:-:1      FMUL cs2, cx02y00, alpha;
+--:-:-:-:1      FMUL cs3, cx03y00, alpha;
+--:-:-:-:1      FMUL cs4, cx32y00, alpha;
+--:-:-:-:1      FMUL cs5, cx33y00, alpha;
+--:-:-:-:1      FMUL cs6, cx34y00, alpha;
+--:-:-:-:1      FMUL cs7, cx35y00, alpha;
+
+--:-:-:-:1      IADD Cy00, Cy00, -ldc1; // start one row back; STORE_C adds ldc1 to Cy00-Cy12 before storing
+--:-:-:-:1      IADD Cy04, Cy00, ldc4;
+--:-:-:-:1      IADD Cy08, Cy00, ldc8;
+--:-:-:-:0      IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering)
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+
+    my $out;
+    foreach my $y (0..3, 32..35)
+    {
+        my ($wait, $comment) = $y == 32 ? ('--', '') : ('02',' // Wait Dep 2'); # Dep 2 is set by STORE_C's last STG; at y == 32 the Cy00 IADD below already waits on it
+
+        $out .= # at y == 32 only: jump ahead 28 rows (the four STORE_C calls so far advanced by 1 each)
+            "--:-:-:-:1      IADD cy00, cy00, 28;\n" .
+            "--:-:-:-:1      IADD cy04, cy04, 28;\n" .
+            "--:-:-:-:1      IADD cy08, cy08, 28;\n" .
+            "--:-:-:-:1      IADD cy12, cy12, 28;\n\n" .
+
+            "02:-:-:-:1      IADD Cy00, Cy00, ldc28; // Wait Dep 2\n" .
+            "--:-:-:-:1      IADD Cy04, Cy04, ldc28;\n" .
+            "--:-:-:-:1      IADD Cy08, Cy08, ldc28;\n" .
+            "--:-:-:-:1      IADD Cy12, Cy12, ldc28;\n\n"  if $y == 32;
+
+        $out .= sprintf( # scale this y's eight accumulators by alpha into cs0-cs7, the registers STORE_C stores; y == 0 was already done in the schedule block above
+            "%s:-:-:-:1      FMUL cs0, cx00y%02d, alpha;%s\n" .
+            "--:-:-:-:1      FMUL cs1, cx01y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs2, cx02y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs3, cx03y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs4, cx32y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs5, cx33y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs6, cx34y%02d, alpha;\n" .
+            "--:-:-:-:0      FMUL cs7, cx35y%02d, alpha; // Dual Issue\n",
+            $wait, $y, $comment, ($y) x 7) if $y;
+
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n"; # call the subroutine once per y value (8 calls in total)
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      STS.128 [writeCs+4x<00>], cs0; // 128-bit store starting at cs0 (presumably cs0-cs3, as consecutive registers)
+--:-:-:-:1      STS.128 [writeCs+4x<32>], cs4; // 128-bit store starting at cs4 (presumably cs4-cs7)
+
+--:-:-:-:1      LDS cs0, [readCs + 4x<0*64 + 00>]; // read the values back through readCs: 4 rows of 64, two columns (+00, +32) per row
+--:-:-:-:1      LDS cs1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:1      LDS cs2, [readCs + 4x<1*64 + 00>];
+--:-:-:-:1      LDS cs3, [readCs + 4x<1*64 + 32>];
+--:-:-:-:1      LDS cs4, [readCs + 4x<2*64 + 00>];
+--:-:-:-:1      LDS cs5, [readCs + 4x<2*64 + 32>];
+--:-:-:-:1      LDS cs6, [readCs + 4x<3*64 + 00>];
+--:-:1:-:1      LDS cs7, [readCs + 4x<3*64 + 32>]; // Set Dep 1
+
+--:-:-:-:1      IADD cy00, cy00, 1; // step the four row indexes by one for this call
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:1      IADD cy12, cy12, 1;
+
+--:-:-:-:1      IADD Cy00, Cy00, ldc1; // step the four output addresses by ldc1 (one row)
+--:-:-:-:1      IADD Cy04, Cy04, ldc1;
+--:-:-:-:1      IADD Cy08, Cy08, ldc1;
+--:-:-:-:1      IADD Cy12, Cy12, ldc1;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 32 < m
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 32 < m
+
+01:-:-:-:1  @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1
+--:-:-:-:1  @P1 STG.CG [Cy00 + 4x<32>], cs1;
+--:-:-:-:1  @P2 STG.CG [Cy04 + 4x<00>], cs2;
+--:-:-:-:1  @P3 STG.CG [Cy04 + 4x<32>], cs3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 32 < m
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 32 < m
+
+--:-:-:-:1  @P0 STG.CG [Cy08 + 4x<00>], cs4;
+--:-:-:-:1  @P1 STG.CG [Cy08 + 4x<32>], cs5;
+--:-:-:-:1  @P2 STG.CG [Cy12 + 4x<00>], cs6;
+--:2:-:-:1  @P3 STG.CG [Cy12 + 4x<32>], cs7; // Set Dep 2
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
diff --git a/Assembler/MaxAs/sgemm/sgemm_final_128.sass b/Assembler/MaxAs/sgemm/sgemm_final_128.sass
new file mode 100644
index 0000000..ce7b0e7
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm_final_128.sass
@@ -0,0 +1,793 @@
+# Kernel: sgemm_kernel_128
+# Arch: sm_50
+# InsCnt: 770
+# RegCnt: 118
+# SharedSize: 16384
+# BarCnt: 1
+# Params(9):
+#	ord:addr:size:align
+#	0:0x140:4:0
+#	1:0x144:4:0
+#	2:0x148:4:0
+#	3:0x14c:4:0
+#	4:0x150:4:0
+#	5:0x154:4:0
+#	6:0x158:4:0
+#	7:0x15c:4:0
+#	8:0x160:4:0
+#
+# Instructions:
+
+--:-:1:-:1      S2R R112, SR_TID.X; // Prologue start. R112 = thread index x; sets dependency barrier 1
+--:-:2:-:1      S2R R113, SR_CTAID.X; // R113 = block index x; sets barrier 2
+--:-:3:-:1      S2R R114, SR_CTAID.Y; // R114 = block index y; sets barrier 3
+01:-:-:Y:1      ISETP.GE.AND P0, PT, R112.reuse, 0x80, PT; // waits on barrier 1; P0 = (tid >= 128), selects which half of the threads loads which operand
+--:-:-:-:1      LOP.AND R117, R112.reuse, 0x1f; // R117 = tid & 31
+--:-:-:-:1      BFE.U32 R9, R112.reuse, 0x205; // presumably bit-field extract of 2 bits at bit 5, i.e. (tid >> 5) & 3 - confirm BFE immediate encoding
+--:-:-:-:1      MOV R13, c[0x0][0x14c]; // kernel parameter 3 (see header table); used below to build the loop bound R105
+--:-:-:-:1      BFE.U32 R4, R112.reuse, 0x301; // presumably (tid >> 1) & 7
+--:-:-:-:1      LOP.AND R115, R112.reuse, 0x80;
+--:-:-:-:1      LOP.AND R107, R112.reuse, 0x70;
+--:-:-:-:1      SHL R16, R117, 0x4;
+--:-:-:-:1      LOP.AND R0, R112.reuse, 0x1;
+--:-:-:-:1      IADD R13, R13, -0x8; // R13 = param3 - 8
+--:-:-:-:1      LOP.AND R80, R112.reuse, -0x20; // R80 = tid with the low 5 bits cleared
+--:-:-:-:1      SHR.U32 R106, R115, 0x4;
+--:-:-:-:1      LOP.AND R116, R112, 0x60;
+--:-:-:-:1      SHR.U32 R107, R107, 0x3;
+--:-:-:-:0 @!P0 MOV R1, c[0x0][0x150]; // R1 = parameter 4 for threads with tid < 128 ...
+--:-:-:-:1      STS.128 [R80+0x2000], RZ; // store zeros to shared memory; read back below to clear R0-R63
+--:-:-:-:1  @P0 MOV R1, c[0x0][0x154]; // ... and parameter 5 for threads with tid >= 128
+--:-:-:-:1      ISCADD R111, R9, R16, 0x9; // R111 = (R9 << 9) + R16: shared-memory store address
+06:-:-:-:1      SEL R12, R114, R113, P0; // waits on barriers 2 and 3 (both block-index reads); R12 = P0 ? blockIdx.y : blockIdx.x
+--:-:-:-:1 @!P0 MOV32I R110, 0x80000001; // R110 is the third operand of every TLD below; presumably selects the texture per thread half - confirm
+--:-:-:-:1  @P0 MOV32I R110, 0x80000000;
+--:-:-:-:1      LOP.OR R106, R106, R4;
+--:-:-:-:1      SHR.U32 R8, R1.reuse, 0x2; // R8 = R1 / 4
+--:-:-:-:1      LOP.OR R107, R107, R0;
+--:-:-:-:1      ISCADD R104, R12, R117, 0x5; // R104 = (R12 << 5) + (tid & 31)
+--:-:-:-:1      IADD R109, R1, R1; // R109 = 2 * R1: per-iteration advance applied to R104 and R108
+--:-:-:-:1  @P0 IADD R111, R111, 0x1000; // threads with tid >= 128 store 0x1000 bytes higher
+--:-:-:-:1      SHL R106, R106, 0x4; // R106: shared-memory read address used by the loop LDS [R106+...]
+--:-:-:-:1      XMAD.MRG R5, R8.reuse, R9.H1.reuse, RZ; // XMAD.MRG / XMAD / XMAD.PSL.CBCC: appears to be a 3-instruction 32-bit multiply-add, R104 += R8 * R9
+--:-:-:-:1      ISCADD R107, R107, 0x1000, 0x4; // R107 = (R107 << 4) + 0x1000: second shared-memory read address
+--:-:-:-:1      XMAD R104, R8.reuse, R9, R104;
+--:-:-:Y:5      XMAD.MRG R20, R13.reuse, R8.H1.reuse, RZ;
+--:-:-:-:2      XMAD.PSL.CBCC R104, R8.H1, R5.H1, R104;
+--:-:1:-:4      TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf; // first texture load into R96-R99; sets barrier 1
+--:-:-:-:1      IADD R108, R104, R1; // R108 = R104 + R1: coordinate of the second load
+--:-:-:-:1      XMAD R105, R13.reuse, R8, R104; // R105 = R104 + R13 * R8 (completed two lines below): loop bound compared against R104
+--:-:2:Y:5      TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf; // second texture load into R100-R103; sets barrier 2
+--:-:-:-:1      XMAD.PSL.CBCC R105, R13.H1, R20.H1, R105;
+--:-:3:-:1      LDS.U.128 R0, [R80+0x2000]; // 16 loads from the zeroed shared location: clear accumulators R0-R63; each sets barrier 3
+--:-:3:-:1      LDS.U.128 R4, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R8, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R12, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R16, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R20, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R24, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R28, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R32, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R36, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R40, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R44, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R48, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R52, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R56, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R60, [R80+0x2000];
+01:-:-:-:1      STS.128 [R111], R96; // waits on barrier 1 (first TLD), then stores R96-R99 to shared memory
+--:-:-:-:0      IADD R104, R104, R109.reuse; // advance the first load coordinate for the next tile
+02:-:-:-:1      STS.128 [R111+0x800], R100; // waits on barrier 2 (second TLD), then stores R100-R103
+--:-:-:-:0      IADD R108, R108, R109; // advance the second load coordinate
+04:-:-:-:5      BAR.SYNC 0x0; // waits on barrier 3 (accumulator clears), then block-wide barrier before shared memory is read
+--:-:-:-:0      LOP.XOR R111, R111, 0x2000; // flip bit 0x2000 of the store address; presumably switches to the other shared-memory buffer
+--:-:-:-:1      LDS.U.128 R64, [R106]; // preload the operands consumed by the first unrolled step of the loop
+--:-:-:-:1      LDS.U.128 R72, [R107];
+--:-:-:-:1      LDS.U.128 R68, [R106+0x100];
+--:-:1:-:1      LDS.U.128 R76, [R107+0x100]; // sets barrier 1; the first FFMA of the loop waits on it
+TARGET1: // Main loop head: 8 unrolled steps of 64 FFMAs each, accumulating into R0-R63
+--:-:-:-:1      ISETP.LE.AND P0, PT, R104, R105, PT; // P0 = (R104 <= R105): another tile remains; guards every @P0 instruction up to the branch back
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1; // step 1: waits on barrier 1 (R64-R79 loaded); multiplies R64-R71 by R72-R79
+--:-:-:-:1      LDS.U.128 R80, [R106+0x200]; // prefetch operands of step 2 into R80-R95 while step 1 computes
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R107+0x200];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R106+0x300];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R107+0x300]; // last prefetch sets barrier 1 for step 2
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:0      FFMA R11, R64.reuse, R74, R11;
+--:-:2:-:1  @P0 TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf; // if another tile remains, start loading it into R96-R99; sets barrier 2
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:0      FFMA R16, R66, R77.reuse, R16;
+--:-:3:-:1  @P0 TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf; // second load of the next tile into R100-R103; sets barrier 3
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27; // end of step 1
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1; // step 2 of 8: waits on barrier 1 (R80-R95 loaded); multiplies R80-R87 by R88-R95 into R0-R63
+--:-:-:-:1      LDS.U.128 R64, [R106+0x400]; // prefetch operands of step 3 into R64-R79
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R107+0x400];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R106+0x500];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R107+0x500]; // last prefetch sets barrier 1 for step 3
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27; // end of step 2
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1; // step 3 of 8: waits on barrier 1 (R64-R79 loaded); multiplies R64-R71 by R72-R79 into R0-R63
+--:-:-:-:1      LDS.U.128 R80, [R106+0x600]; // prefetch operands of step 4 into R80-R95
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R107+0x600];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R106+0x700];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R107+0x700]; // last prefetch sets barrier 1 for step 4
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27; // end of step 3
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1; // step 4 of 8: waits on barrier 1 (R80-R95 loaded); multiplies R80-R87 by R88-R95 into R0-R63
+--:-:-:-:1      LDS.U.128 R64, [R106+0x800]; // prefetch operands of step 5 into R64-R79
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R107+0x800];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R106+0x900];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R107+0x900]; // last prefetch sets barrier 1 for step 5
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27; // end of step 4
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1; // step 5 of 8: waits on barrier 1 (R64-R79 loaded); multiplies R64-R71 by R72-R79 into R0-R63
+--:-:-:-:1      LDS.U.128 R80, [R106+0xa00]; // prefetch operands of step 6 into R80-R95
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R107+0xa00];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R106+0xb00];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R107+0xb00]; // last prefetch sets barrier 1 for step 6
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27; // end of step 5
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1; // step 6 of 8: waits on barrier 1 (R80-R95 loaded); multiplies R80-R87 by R88-R95 into R0-R63
+--:-:-:-:1      LDS.U.128 R64, [R106+0xc00]; // prefetch operands of step 7 into R64-R79
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R107+0xc00];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R106+0xd00];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R107+0xd00]; // last prefetch sets barrier 1 for step 7
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27; // end of step 6
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1; // step 7 of 8: waits on barrier 1 (R64-R79 loaded); also commits the next tile to shared memory and swaps buffers
+--:-:-:-:1      LDS.U.128 R80, [R106+0xe00]; // prefetch operands of step 8 into R80-R95 (last reads of the current tile)
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R107+0xe00];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R106+0xf00];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R107+0xf00]; // sets barrier 1; the BAR.SYNC below waits on it
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:0      FFMA R10, R64.reuse, R75, R10;
+02:-:-:-:1  @P0 STS.128 [R111], R96; // waits on barrier 2 (first TLD issued in step 1), then stores the next tile's R96-R99 to shared memory
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:0      FFMA R18, R64.reuse, R77.reuse, R18;
+04:-:-:-:1  @P0 STS.128 [R111+0x800], R100; // waits on barrier 3 (second TLD issued in step 1), then stores R100-R103
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:0      FFMA R26, R64.reuse, R79, R26;
+01:-:-:-:5      BAR.SYNC 0x0; // waits on barrier 1 (step-8 operand loads), then block-wide barrier after the stores of the next tile
+--:-:-:-:1  @P0 LOP.XOR R106, R106, 0x2000; // if another tile remains, flip bit 0x2000 of both read addresses and the store address;
+--:-:-:-:1  @P0 LOP.XOR R107, R107, 0x2000; // presumably this swaps the roles of two shared-memory buffers - confirm against the layout
+--:-:-:-:1  @P0 LOP.XOR R111, R111, 0x2000;
+--:-:-:-:1      FFMA R27, R64, R78, R27; // final FFMA of step 7, scheduled after the barrier
+--:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1; // step 8 of 8: no wait mask here; the preceding BAR.SYNC already waited on barrier 1 for R80-R95
+--:-:-:-:1  @P0 LDS.U.128 R64, [R106]; // if another tile remains, preload step-1 operands of the next iteration from the flipped address
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1  @P0 LDS.U.128 R72, [R107];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1  @P0 LDS.U.128 R68, [R106+0x100];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1  @P0 LDS.U.128 R76, [R107+0x100]; // sets barrier 1; the first FFMA after TARGET1 waits on it
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27; // end of step 8
+--:-:-:-:1  @P0 IADD R104, R104, R109.reuse; // advance both texture load coordinates by R109 for the following tile
+--:-:-:-:0  @P0 IADD R108, R108, R109;
+--:-:-:Y:5  @P0 BRA TARGET1; // loop back while P0 (R104 <= R105 at loop head) holds; otherwise fall through to the epilogue
+--:-:-:-:1      SHR.U32 R84, R115, 0x2; // Epilogue start: index math for writing results, interleaved with scaling of the accumulators
+--:-:-:-:1      MOV R77, c[0x0][0x158]; // kernel parameter 6; multiplied with the row index below, presumably the output leading dimension - confirm
+--:-:-:-:1      SHR.U32 R80, R116.reuse, 0x1;
+--:-:-:-:1      MOV R72, c[0x0][0x15c]; // kernel parameter 7; every FMUL below scales an accumulator by it (presumably alpha - confirm)
+--:-:-:-:1      SHL R89, R116, 0x4;
+--:-:-:-:1      LOP.AND R106, R106, 0xfff; // keep only the low 12 bits of the read address (drops the 0x2000 bit flipped in the loop)
+--:-:-:-:1      LOP.OR R84, R117, R84;
+--:-:-:-:1      SHL R81, R77.reuse, 0x2; // R81 = R77 * 4
+--:-:-:-:1      LOP.AND R107, R107, 0xfff; // same masking for the second read address (also drops its 0x1000 offset)
+--:-:-:-:1      ISCADD R80, R114, R80, 0x7; // R80 = (blockIdx.y << 7) + R80
+--:-:-:-:1      FMUL R64, R3, R72.reuse; // R64-R71 = first group of eight scaled accumulators handed to TARGET2
+--:-:-:-:1      SHL R74, R77.reuse, 0x4; // R74 = R77 * 16
+--:-:-:-:1      LOP.OR R89, R89, R84;
+--:-:-:-:1      ISCADD R84, R113, R84, 0x7; // R84 = (blockIdx.x << 7) + R84
+--:-:-:-:1      FMUL R65, R7, R72.reuse;
+--:-:-:-:1      SHL R88, R77, 0x5; // R88 = R77 * 32
+--:-:-:-:1      XMAD.MRG R78, R80.reuse, R77.H1.reuse, RZ; // XMAD.MRG / XMAD / XMAD.PSL.CBCC: appears to compute R73 = R80 * R77 + R84
+--:-:-:-:1      ISCADD R90, R107, R106, 0x5;
+--:-:-:-:1      FMUL R66, R1, R72.reuse;
+--:-:-:-:1      SHL R89, R89, 0x2;
+--:-:-:-:1      XMAD R73, R80, R77, R84;
+--:-:-:-:1      ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT; // P5 = (R84 < parameter 1): bounds check for the first column group
+--:-:-:-:1      IADD R84, R84, 0x40;
+--:-:-:-:1      ISCADD R85, R77, -R74, 0x8;
+--:-:-:-:1      FMUL R67, R5, R72.reuse;
+--:-:-:-:1      FMUL R68, R35, R72.reuse;
+--:-:-:-:1      XMAD.PSL.CBCC R73, R80.H1, R78.H1, R73;
+--:-:-:-:1      IADD R80, R80, -0x1;
+--:-:-:-:1      ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT; // P6 = (R84 + 64 < parameter 1): bounds check for the second column group
+--:-:-:-:1      FMUL R69, R39, R72.reuse;
+--:-:-:-:1      FMUL R70, R33, R72.reuse;
+--:-:-:-:1      FMUL R71, R37, R72;
+--:-:-:-:1      ISCADD R76, R73, c[0x0][0x140], 0x2; // R76 = (R73 << 2) + parameter 0: byte address derived from the element index (presumably the output base pointer - confirm)
+--:-:-:-:1      IADD R83, R80.reuse, 0x4;
+--:-:-:-:1      IADD R86, R80.reuse, 0x8;
+--:-:-:-:3      IADD R87, R80, 0xc;
+--:-:-:Y:6      IADD R76, R76, -R81;
+--:-:-:-:1      IADD R75, R76.reuse, R74;
+--:-:-:Y:5      IADD R79, R76, R88.reuse;
+--:-:-:-:0      IADD R82, R75, R88;
+--:-:-:-:5      CAL TARGET2; // call the TARGET2 subroutine (defined outside this excerpt) with R64-R71 prepared; presumably it writes them out - confirm
+02:-:-:-:1      FMUL R64, R2, R72.reuse; // waits on barrier 2 (presumably set inside TARGET2) before overwriting R64-R71 with the next group
+--:-:-:-:1      FMUL R65, R6, R72.reuse;
+--:-:-:-:1      FMUL R66, R0, R72.reuse;
+--:-:-:-:1      FMUL R67, R4, R72.reuse;
+--:-:-:-:1      FMUL R68, R34, R72.reuse;
+--:-:-:-:1      FMUL R69, R38, R72.reuse;
+--:-:-:-:1      FMUL R70, R32, R72.reuse;
+--:-:-:-:0      FMUL R71, R36, R72;
+--:-:-:-:5      CAL TARGET2; // second call with the next group of scaled accumulators
+02:-:-:-:1      FMUL R64, R11, R72.reuse;
+--:-:-:-:1      FMUL R65, R15, R72.reuse;
+--:-:-:-:1      FMUL R66, R9, R72.reuse;
+--:-:-:-:1      FMUL R67, R13, R72.reuse;
+--:-:-:-:1      FMUL R68, R43, R72.reuse;
+--:-:-:-:1      FMUL R69, R47, R72.reuse;
+--:-:-:-:1      FMUL R70, R41, R72.reuse;
+--:-:-:-:0      FMUL R71, R45, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R10, R72.reuse;
+--:-:-:-:1      FMUL R65, R14, R72.reuse;
+--:-:-:-:1      FMUL R66, R8, R72.reuse;
+--:-:-:-:1      FMUL R67, R12, R72.reuse;
+--:-:-:-:1      FMUL R68, R42, R72.reuse;
+--:-:-:-:1      FMUL R69, R46, R72.reuse;
+--:-:-:-:1      FMUL R70, R40, R72.reuse;
+--:-:-:-:0      FMUL R71, R44, R72;
+--:-:-:-:5      CAL TARGET2;
+--:-:-:-:1      IADD R80, R80, 0x3c;
+--:-:-:-:1      IADD R83, R83, 0x3c;
+--:-:-:-:1      IADD R86, R86, 0x3c;
+--:-:-:-:1      IADD R87, R87, 0x3c;
+02:-:-:-:1      IADD R76, R76, R85.reuse;
+--:-:-:-:1      IADD R75, R75, R85.reuse;
+--:-:-:-:1      IADD R79, R79, R85.reuse;
+--:-:-:-:1      IADD R82, R82, R85;
+--:-:-:-:1      FMUL R64, R19, R72.reuse;
+--:-:-:-:1      FMUL R65, R23, R72.reuse;
+--:-:-:-:1      FMUL R66, R17, R72.reuse;
+--:-:-:-:1      FMUL R67, R21, R72.reuse;
+--:-:-:-:1      FMUL R68, R51, R72.reuse;
+--:-:-:-:1      FMUL R69, R55, R72.reuse;
+--:-:-:-:1      FMUL R70, R49, R72.reuse;
+--:-:-:-:0      FMUL R71, R53, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R18, R72.reuse;
+--:-:-:-:1      FMUL R65, R22, R72.reuse;
+--:-:-:-:1      FMUL R66, R16, R72.reuse;
+--:-:-:-:1      FMUL R67, R20, R72.reuse;
+--:-:-:-:1      FMUL R68, R50, R72.reuse;
+--:-:-:-:1      FMUL R69, R54, R72.reuse;
+--:-:-:-:1      FMUL R70, R48, R72.reuse;
+--:-:-:-:0      FMUL R71, R52, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R27, R72.reuse;
+--:-:-:-:1      FMUL R65, R31, R72.reuse;
+--:-:-:-:1      FMUL R66, R25, R72.reuse;
+--:-:-:-:1      FMUL R67, R29, R72.reuse;
+--:-:-:-:1      FMUL R68, R59, R72.reuse;
+--:-:-:-:1      FMUL R69, R63, R72.reuse;
+--:-:-:-:1      FMUL R70, R57, R72.reuse;
+--:-:-:-:0      FMUL R71, R61, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R26, R72.reuse;
+--:-:-:-:1      FMUL R65, R30, R72.reuse;
+--:-:-:-:1      FMUL R66, R24, R72.reuse;
+--:-:-:-:1      FMUL R67, R28, R72.reuse;
+--:-:-:-:1      FMUL R68, R58, R72.reuse;
+--:-:-:-:1      FMUL R69, R62, R72.reuse;
+--:-:-:-:1      FMUL R70, R56, R72.reuse;
+--:-:-:-:0      FMUL R71, R60, R72;
+--:-:-:-:5      CAL TARGET2;
+--:-:-:-:5      EXIT;
+TARGET2:
+--:-:-:-:0      IADD R80, R80, 0x1; # Subroutine: stage R64..R71 through shared memory, then write eight predicated words to global memory. R80 += 1 (bounds-checked below).
+--:-:-:-:1      STS.128 [R90], R64; # shared store of R64..R67 (values the caller placed there before CAL)
+--:-:-:-:0      IADD R83, R83, 0x1; # R83 += 1
+--:-:-:-:1      STS.128 [R90+0x100], R68; # shared store of R68..R71
+--:-:-:-:0      IADD R86, R86, 0x1; # R86 += 1
+--:-:-:-:1      LDS R64, [R89]; # read back eight single words from [R89 + k*0x100], k = 0..7, into R64..R71
+--:-:-:-:0      IADD R87, R87, 0x1; # R87 += 1
+--:-:-:-:1      LDS R65, [R89+0x100];
+--:-:-:-:0      IADD R76, R76, R81.reuse; # advance the four store addresses R76/R75/R79/R82 by R81 before they are used
+--:-:-:-:1      LDS R66, [R89+0x200];
+--:-:-:-:0      IADD R75, R75, R81.reuse;
+--:-:-:-:1      LDS R67, [R89+0x300];
+--:-:-:-:0      IADD R79, R79, R81.reuse;
+--:-:-:-:1      LDS R68, [R89+0x400];
+--:-:-:-:0      IADD R82, R82, R81;
+--:-:-:-:1      LDS R69, [R89+0x500];
+--:-:-:-:1      ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5; # P0 = (R80 < c[0x0][0x148]) AND P5
+--:-:-:-:1      LDS R70, [R89+0x600];
+--:-:-:-:1      ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6; # P1 = (R80 < c[0x0][0x148]) AND P6
+--:-:1:-:1      LDS R71, [R89+0x700]; # last load; NOTE(review): the "1" field presumably sets write barrier 1 - confirm against MaxAs control-code docs
+--:-:-:-:2      ISETP.LT.AND P2, PT, R83.reuse, c[0x0][0x148], P5; # P2 = (R83 < c[0x0][0x148]) AND P5
+--:-:-:Y:7      ISETP.LT.AND P3, PT, R83, c[0x0][0x148], P6; # P3 = (R83 < c[0x0][0x148]) AND P6
+01:-:-:-:1  @P0 STG.CG [R76], R64; # guarded global stores begin; NOTE(review): wait mask 01 presumably waits for the LDS results - confirm
+--:-:-:-:1      ISETP.LT.AND P0, PT, R86.reuse, c[0x0][0x148], P5; # P0 recomputed for R86 after the store that used it
+--:-:-:-:1  @P1 STG.CG [R76+0x100], R65;
+--:-:-:-:1      ISETP.LT.AND P1, PT, R86, c[0x0][0x148], P6; # P1 recomputed for R86
+--:-:-:-:1  @P2 STG.CG [R75], R66;
+--:-:-:-:1      ISETP.LT.AND P2, PT, R87.reuse, c[0x0][0x148], P5; # P2 recomputed for R87
+--:-:-:-:1  @P3 STG.CG [R75+0x100], R67;
+--:-:-:Y:7      ISETP.LT.AND P3, PT, R87, c[0x0][0x148], P6; # P3 recomputed for R87
+--:-:-:-:2  @P0 STG.CG [R79], R68;
+--:-:-:-:2  @P1 STG.CG [R79+0x100], R69;
+--:-:-:-:2  @P2 STG.CG [R82], R70;
+--:2:-:-:1  @P3 STG.CG [R82+0x100], R71; # NOTE(review): the "2" field presumably sets read barrier 2, which callers wait on (mask 02) before overwriting R64 - confirm
+--:-:-:-:5      RET; # return to the instruction after the caller's CAL
diff --git a/Assembler/MaxAs/sgemm/sgemm_final_64.sass b/Assembler/MaxAs/sgemm/sgemm_final_64.sass
new file mode 100644
index 0000000..815ae5d
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm_final_64.sass
@@ -0,0 +1,802 @@
+# Kernel: sgemm_kernel_64
+# Arch: sm_50
+# InsCnt: 779
+# RegCnt: 127
+# SharedSize: 8192
+# BarCnt: 1
+# Params(9):
+#	ord:addr:size:align
+#	0:0x140:4:0
+#	1:0x144:4:0
+#	2:0x148:4:0
+#	3:0x14c:4:0
+#	4:0x150:4:0
+#	5:0x154:4:0
+#	6:0x158:4:0
+#	7:0x15c:4:0
+#	8:0x160:4:0
+#
+# Instructions:
+
+--:-:1:-:1      S2R R119, SR_TID.X;
+--:-:2:-:1      S2R R125, SR_CTAID.X;
+--:-:3:-:1      S2R R122, SR_CTAID.Y;
+01:-:-:-:1      ISETP.GE.AND P0, PT, R119.reuse, 0x20, PT;
+--:-:-:-:1      LOP.AND R9, R119.reuse, 0xf;
+--:-:-:-:1      BFE.U32 R4, R119.reuse, 0x104;
+--:-:-:-:1      MOV R12, c[0x0][0x14c];
+--:-:-:-:1      BFE.U32 R114, R119.reuse, 0x301;
+--:-:-:-:1      LOP.AND R115, R119.reuse, 0x30;
+--:-:-:-:1      LOP.AND R0, R119.reuse, 0x1;
+--:-:-:-:1      SHL R13, R9, 0x4;
+--:-:-:-:1      LOP.AND R80, R119.reuse, -0x20;
+--:-:-:-:1      IADD R12, R12, -0x8;
+--:-:-:-:1      SHL R114, R114, 0x4;
+--:-:-:-:1      LOP.AND R126, R119, 0x1f;
+--:-:-:-:1      SHR.U32 R115, R115, 0x3;
+--:-:-:-:0 @!P0 MOV R2, c[0x0][0x150];
+--:-:-:-:1      STS.128 [R80+0x1000], RZ;
+--:-:-:-:1  @P0 MOV R2, c[0x0][0x154];
+--:-:-:-:1      ISCADD R118, R4, R13, 0x8;
+06:-:-:-:1      SEL R8, R122, R125, P0;
+--:-:-:-:1 @!P0 MOV32I R113, 0x80000001;
+--:-:-:-:1  @P0 MOV32I R113, 0x80000000;
+--:-:-:-:1      LOP.OR R115, R115, R0;
+--:-:-:-:1      SHR.U32 R1, R2.reuse, 0x2;
+--:-:-:-:1      LOP.AND R123, R119, 0x20;
+--:-:-:-:1      ISCADD R112, R8, R9, 0x4;
+--:-:-:-:1      IADD R121, R2, R2;
+--:-:-:-:1  @P0 IADD R118, R118, 0x800;
+--:-:-:-:1      ISCADD R115, R115, 0x800, 0x4;
+--:-:-:-:1      XMAD.MRG R5, R1.reuse, R4.H1.reuse, RZ;
+--:-:-:-:1      XMAD.MRG R16, R12.reuse, R1.H1.reuse, RZ;
+--:-:-:Y:6      XMAD R112, R1.reuse, R4, R112;
+--:-:-:-:2      XMAD.PSL.CBCC R112, R1.H1, R5.H1, R112;
+--:-:1:-:4      TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf;
+--:-:-:-:1      IADD3 R116, R112.reuse, R1.reuse, R1;
+--:-:-:-:1      IADD R120, R112, R2.reuse;
+--:-:2:-:1      TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf;
+--:-:-:-:0      XMAD R117, R12.reuse, R1, R112;
+--:-:3:-:3      TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf;
+--:-:-:-:2      IADD R124, R116, R2;
+--:-:4:-:1      TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf;
+--:-:-:-:1      XMAD.PSL.CBCC R117, R12.H1, R16.H1, R117;
+--:-:5:-:1      LDS.U.128 R0, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R4, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R8, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R12, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R16, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R20, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R24, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R28, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R32, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R36, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R40, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R44, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R48, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R52, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R56, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R60, [R80+0x1000];
+01:-:-:-:1      STS.128 [R118], R96;
+--:-:-:-:0      IADD R112, R112, R121.reuse;
+02:-:-:-:1      STS.128 [R118+0x200], R100;
+--:-:-:-:0      IADD R116, R116, R121.reuse;
+04:-:-:-:1      STS.128 [R118+0x400], R104;
+--:-:-:-:0      IADD R120, R120, R121.reuse;
+08:-:-:-:1      STS.128 [R118+0x600], R108;
+--:-:-:-:0      IADD R124, R124, R121;
+10:-:-:-:5      BAR.SYNC 0x0;
+--:-:-:-:0      LOP.XOR R118, R118, 0x1000;
+--:-:-:-:1      LDS.U.128 R64, [R114];
+--:-:-:-:1      LDS.U.128 R72, [R115];
+--:-:-:-:1      LDS.U.128 R68, [R114+0x80];
+--:-:1:-:1      LDS.U.128 R76, [R115+0x80];
+TARGET1:
+--:-:-:-:1      ISETP.LE.AND P0, PT, R112, R117, PT;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R114+0x100];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R115+0x100];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R114+0x180];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R115+0x180];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:0      FFMA R11, R64.reuse, R74, R11;
+--:-:-:-:1  @P0 TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:0      FFMA R16, R66, R77.reuse, R16;
+--:-:2:-:1  @P0 TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf;
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1      LDS.U.128 R64, [R114+0x200];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R115+0x200];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R114+0x280];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R115+0x280];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:0      FFMA R11, R80.reuse, R90, R11;
+--:-:-:-:1  @P0 TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:0      FFMA R16, R82, R93.reuse, R16;
+--:-:3:-:1  @P0 TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R114+0x300];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R115+0x300];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R114+0x380];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R115+0x380];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1      LDS.U.128 R64, [R114+0x400];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R115+0x400];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R114+0x480];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R115+0x480];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R114+0x500];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R115+0x500];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R114+0x580];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R115+0x580];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1      LDS.U.128 R64, [R114+0x600];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R115+0x600];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R114+0x680];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R115+0x680];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:0      FFMA R10, R80.reuse, R91, R10;
+02:-:-:-:1  @P0 STS.128 [R118], R96;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:0      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1  @P0 STS.128 [R118+0x200], R100;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R114+0x700];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R115+0x700];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R114+0x780];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R115+0x780];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:0      FFMA R10, R64.reuse, R75, R10;
+04:-:-:-:1  @P0 STS.128 [R118+0x400], R104;
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:0      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1  @P0 STS.128 [R118+0x600], R108;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:0      FFMA R26, R64.reuse, R79, R26;
+01:-:-:-:5      BAR.SYNC 0x0;
+--:-:-:-:1  @P0 LOP.XOR R114, R114, 0x1000;
+--:-:-:-:1  @P0 LOP.XOR R115, R115, 0x1000;
+--:-:-:-:1  @P0 LOP.XOR R118, R118, 0x1000;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+--:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1  @P0 LDS.U.128 R64, [R114];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1  @P0 LDS.U.128 R72, [R115];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1  @P0 LDS.U.128 R68, [R114+0x80];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1  @P0 LDS.U.128 R76, [R115+0x80];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+--:-:-:-:1  @P0 IADD R112, R112, R121.reuse;
+--:-:-:-:1  @P0 IADD R116, R116, R121.reuse;
+--:-:-:-:1  @P0 IADD R120, R120, R121.reuse;
+--:-:-:-:0  @P0 IADD R124, R124, R121;
+--:-:-:Y:5  @P0 BRA TARGET1;
+--:-:-:-:1      SHR.U32 R80, R123.reuse, 0x1;
+--:-:-:-:1      MOV R81, c[0x0][0x158];
+--:-:-:-:1      ISCADD R84, R125, R126.reuse, 0x6;
+--:-:-:-:1      MOV R72, c[0x0][0x15c];
+--:-:-:-:1      ISCADD R92, R123, R126, 0x3;
+--:-:-:-:1      LOP.AND R114, R114, 0x7ff;
+--:-:-:-:1      ISCADD R80, R122, R80, 0x6;
+--:-:-:-:1      LOP.AND R115, R115, 0x7ff;
+--:-:-:-:1      SHL R77, R81.reuse, 0x2;
+--:-:-:-:1      ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT;
+--:-:-:-:1      SHL R89, R81.reuse, 0x4;
+--:-:-:-:1      FMUL R64, R3, R72;
+--:-:-:-:1      SHL R91, R81.reuse, 0x5;
+--:-:-:-:1      XMAD.MRG R74, R80.reuse, R81.H1.reuse, RZ;
+--:-:-:-:1      ISCADD R93, R115, R114, 0x4;
+--:-:-:-:1      XMAD R73, R80, R81, R84;
+--:-:-:-:1      SHL R92, R92, 0x2;
+--:-:-:-:1      IADD R84, R84, 0x20;
+--:-:-:-:1      ISCADD R85, R81, -R89, 0x7;
+--:-:-:-:1      FMUL R65, R7, R72.reuse;
+--:-:-:-:1      FMUL R66, R1, R72.reuse;
+--:-:-:-:1      XMAD.PSL.CBCC R73, R80.H1, R74.H1, R73;
+--:-:-:-:1      IADD R80, R80, -0x1;
+--:-:-:-:1      ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT;
+--:-:-:-:1      FMUL R67, R5, R72.reuse;
+--:-:-:-:1      FMUL R68, R35, R72.reuse;
+--:-:-:-:1      FMUL R69, R39, R72.reuse;
+--:-:-:-:1      ISCADD R76, R73, c[0x0][0x140], 0x2;
+--:-:-:-:1      IADD R86, R80.reuse, 0x4;
+--:-:-:-:1      IADD R87, R80.reuse, 0x8;
+--:-:-:-:1      IADD R88, R80, 0xc;
+--:-:-:-:1      FMUL R70, R33, R72.reuse;
+--:-:-:-:1      FMUL R71, R37, R72;
+--:-:-:Y:6      IADD R76, R76, -R77;
+--:-:-:-:1      IADD R75, R76.reuse, R89;
+--:-:-:Y:5      IADD R78, R76, R91.reuse;
+--:-:-:-:0      IADD R79, R75, R91;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R2, R72.reuse;
+--:-:-:-:1      FMUL R65, R6, R72.reuse;
+--:-:-:-:1      FMUL R66, R0, R72.reuse;
+--:-:-:-:1      FMUL R67, R4, R72.reuse;
+--:-:-:-:1      FMUL R68, R34, R72.reuse;
+--:-:-:-:1      FMUL R69, R38, R72.reuse;
+--:-:-:-:1      FMUL R70, R32, R72.reuse;
+--:-:-:-:0      FMUL R71, R36, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R11, R72.reuse;
+--:-:-:-:1      FMUL R65, R15, R72.reuse;
+--:-:-:-:1      FMUL R66, R9, R72.reuse;
+--:-:-:-:1      FMUL R67, R13, R72.reuse;
+--:-:-:-:1      FMUL R68, R43, R72.reuse;
+--:-:-:-:1      FMUL R69, R47, R72.reuse;
+--:-:-:-:1      FMUL R70, R41, R72.reuse;
+--:-:-:-:0      FMUL R71, R45, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R10, R72.reuse;
+--:-:-:-:1      FMUL R65, R14, R72.reuse;
+--:-:-:-:1      FMUL R66, R8, R72.reuse;
+--:-:-:-:1      FMUL R67, R12, R72.reuse;
+--:-:-:-:1      FMUL R68, R42, R72.reuse;
+--:-:-:-:1      FMUL R69, R46, R72.reuse;
+--:-:-:-:1      FMUL R70, R40, R72.reuse;
+--:-:-:-:0      FMUL R71, R44, R72;
+--:-:-:-:5      CAL TARGET2;
+--:-:-:-:1      IADD R80, R80, 0x1c;
+--:-:-:-:1      IADD R86, R86, 0x1c;
+--:-:-:-:1      IADD R87, R87, 0x1c;
+--:-:-:-:1      IADD R88, R88, 0x1c;
+02:-:-:-:1      IADD R76, R76, R85.reuse;
+--:-:-:-:1      IADD R75, R75, R85.reuse;
+--:-:-:-:1      IADD R78, R78, R85.reuse;
+--:-:-:-:1      IADD R79, R79, R85;
+--:-:-:-:1      FMUL R64, R19, R72.reuse;
+--:-:-:-:1      FMUL R65, R23, R72.reuse;
+--:-:-:-:1      FMUL R66, R17, R72.reuse;
+--:-:-:-:1      FMUL R67, R21, R72.reuse;
+--:-:-:-:1      FMUL R68, R51, R72.reuse;
+--:-:-:-:1      FMUL R69, R55, R72.reuse;
+--:-:-:-:1      FMUL R70, R49, R72.reuse;
+--:-:-:-:0      FMUL R71, R53, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R18, R72.reuse;
+--:-:-:-:1      FMUL R65, R22, R72.reuse;
+--:-:-:-:1      FMUL R66, R16, R72.reuse;
+--:-:-:-:1      FMUL R67, R20, R72.reuse;
+--:-:-:-:1      FMUL R68, R50, R72.reuse;
+--:-:-:-:1      FMUL R69, R54, R72.reuse;
+--:-:-:-:1      FMUL R70, R48, R72.reuse;
+--:-:-:-:0      FMUL R71, R52, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R27, R72.reuse;
+--:-:-:-:1      FMUL R65, R31, R72.reuse;
+--:-:-:-:1      FMUL R66, R25, R72.reuse;
+--:-:-:-:1      FMUL R67, R29, R72.reuse;
+--:-:-:-:1      FMUL R68, R59, R72.reuse;
+--:-:-:-:1      FMUL R69, R63, R72.reuse;
+--:-:-:-:1      FMUL R70, R57, R72.reuse;
+--:-:-:-:0      FMUL R71, R61, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R26, R72.reuse;
+--:-:-:-:1      FMUL R65, R30, R72.reuse;
+--:-:-:-:1      FMUL R66, R24, R72.reuse;
+--:-:-:-:1      FMUL R67, R28, R72.reuse;
+--:-:-:-:1      FMUL R68, R58, R72.reuse;
+--:-:-:-:1      FMUL R69, R62, R72.reuse;
+--:-:-:-:1      FMUL R70, R56, R72.reuse;
+--:-:-:-:0      FMUL R71, R60, R72;
+--:-:-:-:5      CAL TARGET2;
+--:-:-:-:5      EXIT;
+TARGET2:
+--:-:-:-:0      IADD R80, R80, 0x1; // bump the four indices R80/R86/R87/R88 that are bounds-tested below
+--:-:-:-:1      STS.128 [R93], R64; // store R64-R67 to shared memory at [R93]
+--:-:-:-:0      IADD R86, R86, 0x1;
+--:-:-:-:1      STS.128 [R93+0x80], R68; // store R68-R71 at [R93+0x80]
+--:-:-:-:0      IADD R87, R87, 0x1;
+--:-:-:-:1      LDS R64, [R92]; // read 8 words back from [R92 + i*0x80] into R64-R71
+--:-:-:-:0      IADD R88, R88, 0x1;
+--:-:-:-:1      LDS R65, [R92+0x80];
+--:-:-:-:0      IADD R76, R76, R77.reuse; // advance the four store addresses R76/R75/R78/R79 by R77
+--:-:-:-:1      LDS R66, [R92+0x100];
+--:-:-:-:0      IADD R75, R75, R77.reuse;
+--:-:-:-:1      LDS R67, [R92+0x180];
+--:-:-:-:0      IADD R78, R78, R77.reuse;
+--:-:-:-:1      LDS R68, [R92+0x200];
+--:-:-:-:0      IADD R79, R79, R77;
+--:-:-:-:1      LDS R69, [R92+0x280];
+--:-:-:-:1      ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5; // P0-P3 = (index < c[0x0][0x148]) AND P5/P6 (P5/P6 set by the caller)
+--:-:-:-:1      LDS R70, [R92+0x300];
+--:-:-:-:1      ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6;
+--:-:1:-:1      LDS R71, [R92+0x380]; // Set Dep 1 on the last shared load
+--:-:-:-:2      ISETP.LT.AND P2, PT, R86.reuse, c[0x0][0x148], P5;
+--:-:-:Y:7      ISETP.LT.AND P3, PT, R86, c[0x0][0x148], P6;
+01:-:-:-:1  @P0 STG.CG [R76], R64; // Wait Dep 1, then predicated global stores
+--:-:-:-:1      ISETP.LT.AND P0, PT, R87.reuse, c[0x0][0x148], P5; // P0/P1 are recomputed for R87 once their stores have issued
+--:-:-:-:1  @P1 STG.CG [R76+0x80], R65;
+--:-:-:-:1      ISETP.LT.AND P1, PT, R87, c[0x0][0x148], P6;
+--:-:-:-:1  @P2 STG.CG [R75], R66;
+--:-:-:-:1      ISETP.LT.AND P2, PT, R88.reuse, c[0x0][0x148], P5; // P2/P3 are recomputed for R88
+--:-:-:-:1  @P3 STG.CG [R75+0x80], R67;
+--:-:-:Y:7      ISETP.LT.AND P3, PT, R88, c[0x0][0x148], P6;
+--:-:-:-:2  @P0 STG.CG [R78], R68;
+--:-:-:-:2  @P1 STG.CG [R78+0x80], R69;
+--:-:-:-:2  @P2 STG.CG [R79], R70;
+--:2:-:-:1  @P3 STG.CG [R79+0x80], R71; // NOTE(review): second flag field presumably sets read barrier 2, which callers wait on ("02") before rewriting R64-R71 - confirm
+--:-:-:-:5      RET; // return to the instruction following the CAL
diff --git a/Assembler/MaxAs/sgemm/sgemm_pre_128.sass b/Assembler/MaxAs/sgemm/sgemm_pre_128.sass
new file mode 100644
index 0000000..cde320e
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm_pre_128.sass
@@ -0,0 +1,924 @@
+# Kernel: sgemm_kernel_128
+#
+# SharedSize: 16384
+# Params(9):
+#   0:0x140:4:4 param_C,
+#   1:0x144:4:0 param_m,
+#   2:0x148:4:0 param_n,
+#   3:0x14c:4:0 param_k,
+#   4:0x150:4:0 param_lda,
+#   5:0x154:4:0 param_ldb,
+#   6:0x158:4:0 param_ldc
+#   7:0x15c:4:0 param_alpha
+#   8:0x160:4:4 param_D // for diagnostic printf output
+#
+# Globals:
+#   c[0x0][0x164]: texA (the value is 1)
+#   c[0x0][0x168]: texB (the value is 0)
+
+<REGISTER_MAPPING>
+
+    // Temporary registers to calculate the state registers. Reuse the C output registers.
+    // These can be dynamically allocated (~) in the available register space to eliminate any register bank conflicts.
+    0-63    ~ blk, ldx, ldx2, ldx4, k, tid1, tid4, tid7, tid31_4, xmad_t0, xmad_end, bxOrig, byOrig, loy
+
+    // Aliases for the C registers we use for initializing C (used as vectors)
+    0-63    : cz<00-63>
+
+    // The offset at which we store our zero value for initializing C. Reuse a register from the second blocking registers
+    80      : zOffset
+
+    // 64 C matrix output registers.
+    // Use special mapping to avoid register bank conflicts between these registers and the blocking registers.
+     3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67>
+     7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67>
+     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67>
+     5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67>
+    35,34,43,42,51,50,59,58 : cx64y<00-03|64-67>
+    39,38,47,46,55,54,63,62 : cx65y<00-03|64-67>
+    33,32,41,40,49,48,57,56 : cx66y<00-03|64-67>
+    37,36,45,44,53,52,61,60 : cx67y<00-03|64-67>
+
+    // Double buffered register blocking used in vector loads.
+    // Any bank conflicts that we can't avoid in these registers we can hide with .reuse flags
+    64-79   : j0Ax<00-03|64-67>, j0By<00-03|64-67>
+    80-95   : j1Ax<00-03|64-67>, j1By<00-03|64-67>
+
+    // Registers to load A or B
+    96-103  : loadX<0-7>
+
+    // Key global state registers for main loop and some we reuse for outputting C.
+    // Note, tweaking the register banks of track<0|4>, tex, writeS, readBs, readAs impacts performance because of
+    // delayed bank conflicts between memory operations and ffmas.
+    // The array index bracket notation can be used to request a bank in a dynamically allocated range.
+    104-127 ~ track<0|4>[0], tex[2], readAs[2], readBs[3], writeS[3], end, ldx8, tid, bx, by, tid31, tid96, tid128 //, clock, smId, nSMs
+
+    // Registers to store the results back to global memory. Reuse any register not needed after the main loop.
+    // Statically allocate cs0-7 because they're vector registers.
+    64-71   : cs<0-7>
+
+    // Dynamically allocated C output registers (~)
+    72-103  ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc60, writeCs, readCs, cx, ci, alpha, xmad_ci //, xmad_D, D, blckDimX, gridDimX
+
+</REGISTER_MAPPING>
+
+// Note the absence of the loading of the stack pointer into R1.
+// No idea why ptxas does that anyway when it's not used for register spilling.
+// Such a waste of a perfectly good register.
+
+// Scheduler doesn't handle the dependency flags yet,
+// so move these first instructions outside the block that's auto-scheduled.
+//--:-:-:-:1      CS2R clock, SR_CLOCKLO;
+//--:-:-:-:1      S2R smId, SR_VIRTID;
+//--:-:-:-:1      S2R nSMs, SR_VIRTCFG;
+--:-:1:-:1      S2R tid, SR_TID.X;   // Set Dep 1
+--:-:2:-:1      S2R bx,  SR_CTAID.X; // Set Dep 2
+--:-:3:-:1      S2R by,  SR_CTAID.Y; // Set Dep 3
+
+// Instructions in a SCHEDULE_BLOCK are automatically reordered and appropriately stalled for simple dependencies.
+// Memory dependencies are left up to the author to deal with manually for now.
+01:-:-:Y:1      ISETP.GE.AND P0, PT, tid, 128, PT; // Wait Dep 1
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      BFE.U32 tid4, tid, 0x205; // 2 bits at position 5
+--:-:-:-:1      MOV k, c[0x0][0x14c]; // param_k
+--:-:-:-:1      BFE.U32 tid7, tid, 0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.AND tid128, tid, 128;
+--:-:-:-:1      LOP.AND readBs, tid, 0x70;
+--:-:-:-:1      SHL tid31_4, tid31, 4;
+--:-:-:-:1      LOP.AND tid1, tid, 1;
+--:-:-:-:1      IADD k, k, -8;
+--:-:-:-:1      LOP.AND zOffset, tid, -32;
+--:-:-:-:1      SHR.U32 readAs, tid128, 4;
+--:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:0 @!P0 MOV ldx4, c[0x0][0x150]; // param_lda
+--:-:-:-:1      STS.128 [zOffset + 4x<16*128>], RZ; // zeros that the LDS.U.128 cz loads read back
+--:-:-:-:1  @P0 MOV ldx4, c[0x0][0x154]; // param_ldb
+--:-:-:-:1      ISCADD writeS, tid4, tid31_4, 9;
+06:-:-:-:1      SEL blk, by, bx, P0;               // Wait Dep 2 & 3
+--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA
+--:-:-:-:1  @P0 MOV32I tex, 0x80000000; // texB
+--:-:-:-:1      LOP.OR readAs, readAs, tid7;
+--:-:-:-:1      SHR.U32 ldx, ldx4, 2;
+--:-:-:-:1      LOP.OR readBs, readBs, tid1;
+--:-:-:-:1      ISCADD track0, blk, tid31, 5;
+--:-:-:-:1      IADD ldx8, ldx4, ldx4;
+--:-:-:-:1  @P0 IADD writeS, writeS, 4x<8*128>;
+--:-:-:-:1      SHL readAs, readAs, 4;
+--:-:-:-:1      XMAD.MRG xmad_t0, ldx, tid4.H1, RZ; // XMAD.LO is a macro that is expanded out into the 3 XMADs
+--:-:-:-:1      ISCADD readBs, readBs, 4x<8*128>, 4;
+--:-:-:-:1      XMAD track0, ldx, tid4, track0;
+--:-:-:Y:5      XMAD.MRG xmad_end, k, ldx.H1, RZ;
+--:-:-:-:2      XMAD.PSL.CBCC track0, ldx.H1, xmad_t0.H1, track0;
+--:-:1:-:4      TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1
+--:-:-:-:1      IADD track4, track0, ldx4;
+--:-:-:-:1      XMAD end, k, ldx, track0;
+--:-:2:Y:5      TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 2
+--:-:-:-:1      XMAD.PSL.CBCC end, k.H1, xmad_end.H1, end;
+
+// Initialize C registers to zero.
+// Using LDS.U.128 is a neat trick to save a few clock cycles
+// (when you have enough warps to hide the latency.)
+--:-:3:-:1      LDS.U.128 cz00, [zOffset + 4x<16*128>]; // Set Dep 3 (same on all 16 loads)
+--:-:3:-:1      LDS.U.128 cz04, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz08, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz12, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz16, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz20, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz24, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz28, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz32, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz36, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz40, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz44, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz48, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz52, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz56, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz60, [zOffset + 4x<16*128>];
+
+// These instructions need to occur after the texture loads, so put them in a new block
+// that starts with a dependency barrier wait.
+01:-:-:-:1      STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 1
+--:-:-:-:0      IADD track0, track0, ldx8;
+02:-:-:-:1      STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 2
+--:-:-:-:0      IADD track4, track4, ldx8;
+04:-:-:-:5      BAR.SYNC 0; // Wait Dep 3
+
+// The next store to shared goes to the high area.
+// Having 2 shared buffers allows us to eliminate a bar.sync in the main loop.
+// This way we don't have to wait for all threads to arrive before writing fresh data to shared.
+// Other threads can continue reading from the last batch while the new data is being written.
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<16*128>;
+
+// Preload the first lines of A and B from shared
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1
+
+
+// The main loop
+// While calculating the first line, load in the next line from shared.
+// Shared memory stores enough to do this 8 times per loop.
+// Also pull in the next block of memory from global and store it to shared.
+
+// Efficiency:
+// ffma: 512
+// lds:  32 dual issued
+// sts:  2  dual issued
+// tex:  2  dual issued
+// add:  2
+// xor:  3
+// setp: 1
+// bar:  1  dual issued
+// bra:  1  dual issued
+// Total: 518 (512/518 = 98.8% FFMA)
+
+// Memory Throughput Upper Bound:
+// 2 * 4 * 4 bytes per thread per 518 clocks
+// 128 threads per SM
+// 16 SM's (GM204)
+// 1640Mhz (boost overclock)
+// .931 GiB/GB  (1000^3 / 1024^3)
+// 193 GiB/sec
+// Available: 224 GiB/sec (or 256 GiB/sec overclocked at 8GHz)
+
+LOOP:
+
+// Loop end condition
+--:-:-:-:1      ISETP.LE.AND P0, PT, track0, end, PT;
+
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<1*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<1*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax64, [readAs + 4x<1*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By64, [readBs + 4x<1*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j0Ax66, j0By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j0Ax66, j0By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j0Ax64, j0By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j0Ax64, j0By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j0Ax67, j0By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j0Ax67, j0By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j0Ax65, j0By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j0Ax65, j0By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j0Ax67, j0By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j0Ax67, j0By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j0Ax65, j0By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j0Ax65, j0By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j0Ax66, j0By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j0Ax66, j0By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j0Ax64, j0By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j0Ax64, j0By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:0      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:2:-:1  @P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 2
+--:-:-:Y:1      FFMA cx02y64, j0Ax02, j0By64, cx02y64;
+--:-:-:-:0      FFMA cx02y65, j0Ax02, j0By65, cx02y65;
+--:-:3:-:1  @P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3
+--:-:-:-:1      FFMA cx00y65, j0Ax00, j0By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j0Ax00, j0By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j0Ax03, j0By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j0Ax03, j0By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j0Ax01, j0By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j0Ax01, j0By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j0Ax66, j0By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j0Ax66, j0By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j0Ax64, j0By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j0Ax64, j0By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j0Ax67, j0By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j0Ax67, j0By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j0Ax65, j0By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j0Ax65, j0By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j0Ax67, j0By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j0Ax67, j0By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j0Ax65, j0By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j0Ax65, j0By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j0Ax66, j0By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j0Ax66, j0By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j0Ax64, j0By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j0Ax64, j0By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j0Ax03, j0By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j0Ax03, j0By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j0Ax01, j0By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j0Ax01, j0By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j0Ax02, j0By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j0Ax02, j0By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j0Ax00, j0By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j0Ax00, j0By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<2*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<2*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax64, [readAs + 4x<2*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By64, [readBs + 4x<2*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j1Ax66, j1By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j1Ax66, j1By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j1Ax64, j1By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j1Ax64, j1By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j1Ax67, j1By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j1Ax67, j1By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j1Ax65, j1By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j1Ax65, j1By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j1Ax67, j1By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j1Ax67, j1By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j1Ax65, j1By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j1Ax65, j1By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j1Ax66, j1By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j1Ax66, j1By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j1Ax64, j1By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j1Ax64, j1By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j1Ax02, j1By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j1Ax02, j1By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j1Ax00, j1By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j1Ax00, j1By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j1Ax03, j1By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j1Ax03, j1By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j1Ax01, j1By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j1Ax01, j1By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j1Ax66, j1By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j1Ax66, j1By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j1Ax64, j1By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j1Ax64, j1By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j1Ax67, j1By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j1Ax67, j1By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j1Ax65, j1By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j1Ax65, j1By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j1Ax67, j1By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j1Ax67, j1By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j1Ax65, j1By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j1Ax65, j1By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j1Ax66, j1By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j1Ax66, j1By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j1Ax64, j1By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j1Ax64, j1By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j1Ax03, j1By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j1Ax03, j1By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j1Ax01, j1By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j1Ax01, j1By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j1Ax02, j1By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j1Ax02, j1By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j1Ax00, j1By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j1Ax00, j1By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<3*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<3*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax64, [readAs + 4x<3*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By64, [readBs + 4x<3*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j0Ax66, j0By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j0Ax66, j0By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j0Ax64, j0By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j0Ax64, j0By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j0Ax67, j0By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j0Ax67, j0By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j0Ax65, j0By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j0Ax65, j0By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j0Ax67, j0By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j0Ax67, j0By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j0Ax65, j0By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j0Ax65, j0By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j0Ax66, j0By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j0Ax66, j0By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j0Ax64, j0By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j0Ax64, j0By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j0Ax02, j0By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j0Ax02, j0By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j0Ax00, j0By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j0Ax00, j0By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j0Ax03, j0By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j0Ax03, j0By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j0Ax01, j0By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j0Ax01, j0By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j0Ax66, j0By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j0Ax66, j0By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j0Ax64, j0By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j0Ax64, j0By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j0Ax67, j0By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j0Ax67, j0By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j0Ax65, j0By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j0Ax65, j0By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j0Ax67, j0By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j0Ax67, j0By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j0Ax65, j0By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j0Ax65, j0By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j0Ax66, j0By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j0Ax66, j0By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j0Ax64, j0By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j0Ax64, j0By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j0Ax03, j0By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j0Ax03, j0By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j0Ax01, j0By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j0Ax01, j0By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j0Ax02, j0By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j0Ax02, j0By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j0Ax00, j0By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j0Ax00, j0By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<4*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<4*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax64, [readAs + 4x<4*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By64, [readBs + 4x<4*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j1Ax66, j1By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j1Ax66, j1By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j1Ax64, j1By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j1Ax64, j1By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j1Ax67, j1By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j1Ax67, j1By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j1Ax65, j1By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j1Ax65, j1By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j1Ax67, j1By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j1Ax67, j1By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j1Ax65, j1By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j1Ax65, j1By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j1Ax66, j1By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j1Ax66, j1By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j1Ax64, j1By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j1Ax64, j1By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j1Ax02, j1By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j1Ax02, j1By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j1Ax00, j1By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j1Ax00, j1By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j1Ax03, j1By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j1Ax03, j1By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j1Ax01, j1By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j1Ax01, j1By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j1Ax66, j1By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j1Ax66, j1By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j1Ax64, j1By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j1Ax64, j1By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j1Ax67, j1By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j1Ax67, j1By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j1Ax65, j1By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j1Ax65, j1By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j1Ax67, j1By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j1Ax67, j1By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j1Ax65, j1By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j1Ax65, j1By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j1Ax66, j1By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j1Ax66, j1By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j1Ax64, j1By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j1Ax64, j1By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j1Ax03, j1By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j1Ax03, j1By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j1Ax01, j1By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j1Ax01, j1By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j1Ax02, j1By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j1Ax02, j1By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j1Ax00, j1By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j1Ax00, j1By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<5*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<5*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax64, [readAs + 4x<5*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By64, [readBs + 4x<5*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j0Ax66, j0By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j0Ax66, j0By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j0Ax64, j0By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j0Ax64, j0By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j0Ax67, j0By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j0Ax67, j0By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j0Ax65, j0By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j0Ax65, j0By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j0Ax67, j0By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j0Ax67, j0By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j0Ax65, j0By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j0Ax65, j0By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j0Ax66, j0By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j0Ax66, j0By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j0Ax64, j0By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j0Ax64, j0By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j0Ax02, j0By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j0Ax02, j0By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j0Ax00, j0By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j0Ax00, j0By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j0Ax03, j0By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j0Ax03, j0By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j0Ax01, j0By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j0Ax01, j0By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j0Ax66, j0By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j0Ax66, j0By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j0Ax64, j0By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j0Ax64, j0By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j0Ax67, j0By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j0Ax67, j0By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j0Ax65, j0By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j0Ax65, j0By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j0Ax67, j0By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j0Ax67, j0By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j0Ax65, j0By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j0Ax65, j0By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j0Ax66, j0By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j0Ax66, j0By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j0Ax64, j0By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j0Ax64, j0By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j0Ax03, j0By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j0Ax03, j0By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j0Ax01, j0By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j0Ax01, j0By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j0Ax02, j0By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j0Ax02, j0By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j0Ax00, j0By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j0Ax00, j0By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<6*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<6*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax64, [readAs + 4x<6*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By64, [readBs + 4x<6*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j1Ax66, j1By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j1Ax66, j1By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j1Ax64, j1By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j1Ax64, j1By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j1Ax67, j1By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j1Ax67, j1By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j1Ax65, j1By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j1Ax65, j1By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j1Ax67, j1By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j1Ax67, j1By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j1Ax65, j1By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j1Ax65, j1By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j1Ax66, j1By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j1Ax66, j1By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j1Ax64, j1By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j1Ax64, j1By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j1Ax02, j1By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j1Ax02, j1By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j1Ax00, j1By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j1Ax00, j1By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j1Ax03, j1By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j1Ax03, j1By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j1Ax01, j1By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j1Ax01, j1By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j1Ax66, j1By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j1Ax66, j1By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j1Ax64, j1By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j1Ax64, j1By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j1Ax67, j1By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j1Ax67, j1By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j1Ax65, j1By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j1Ax65, j1By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j1Ax67, j1By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j1Ax67, j1By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j1Ax65, j1By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j1Ax65, j1By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j1Ax66, j1By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j1Ax66, j1By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j1Ax64, j1By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j1Ax64, j1By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j1Ax03, j1By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j1Ax03, j1By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j1Ax01, j1By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j1Ax01, j1By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j1Ax02, j1By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j1Ax02, j1By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j1Ax00, j1By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j1Ax00, j1By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<7*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<7*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax64, [readAs + 4x<7*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By64, [readBs + 4x<7*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j0Ax66, j0By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j0Ax66, j0By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j0Ax64, j0By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j0Ax64, j0By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j0Ax67, j0By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j0Ax67, j0By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j0Ax65, j0By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j0Ax65, j0By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j0Ax67, j0By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j0Ax67, j0By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j0Ax65, j0By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j0Ax65, j0By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j0Ax66, j0By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j0Ax66, j0By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j0Ax64, j0By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j0Ax64, j0By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:0      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+02:-:-:-:1  @P0 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 2
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j0Ax02, j0By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j0Ax02, j0By65, cx02y65;
+--:-:-:-:0      FFMA cx00y65, j0Ax00, j0By65, cx00y65;
+04:-:-:-:1  @P0 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 3
+--:-:-:-:1      FFMA cx00y64, j0Ax00, j0By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j0Ax03, j0By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j0Ax03, j0By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j0Ax01, j0By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j0Ax01, j0By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j0Ax66, j0By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j0Ax66, j0By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j0Ax64, j0By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j0Ax64, j0By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j0Ax67, j0By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j0Ax67, j0By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j0Ax65, j0By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j0Ax65, j0By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j0Ax67, j0By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j0Ax67, j0By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j0Ax65, j0By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j0Ax65, j0By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j0Ax66, j0By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j0Ax66, j0By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j0Ax64, j0By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j0Ax64, j0By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j0Ax03, j0By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j0Ax03, j0By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j0Ax01, j0By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j0Ax01, j0By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j0Ax02, j0By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j0Ax02, j0By67, cx02y67;
+--:-:-:-:0      FFMA cx00y67, j0Ax00, j0By67, cx00y67;
+01:-:-:-:5      BAR.SYNC 0;                            // Wait Dep 1
+--:-:-:-:1  @P0 LOP.XOR readAs, readAs, 4x<16*128>;
+--:-:-:-:1  @P0 LOP.XOR readBs, readBs, 4x<16*128>;
+--:-:-:-:1  @P0 LOP.XOR writeS, writeS, 4x<16*128>;
+--:-:-:-:1      FFMA cx00y66, j0Ax00, j0By66, cx00y66;
+--:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00;
+--:-:-:-:1  @P0 LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1  @P0 LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1  @P0 LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1  @P0 LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j1Ax66, j1By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j1Ax66, j1By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j1Ax64, j1By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j1Ax64, j1By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j1Ax67, j1By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j1Ax67, j1By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j1Ax65, j1By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j1Ax65, j1By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j1Ax67, j1By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j1Ax67, j1By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j1Ax65, j1By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j1Ax65, j1By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j1Ax66, j1By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j1Ax66, j1By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j1Ax64, j1By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j1Ax64, j1By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j1Ax02, j1By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j1Ax02, j1By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j1Ax00, j1By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j1Ax00, j1By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j1Ax03, j1By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j1Ax03, j1By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j1Ax01, j1By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j1Ax01, j1By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j1Ax66, j1By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j1Ax66, j1By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j1Ax64, j1By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j1Ax64, j1By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j1Ax67, j1By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j1Ax67, j1By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j1Ax65, j1By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j1Ax65, j1By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j1Ax67, j1By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j1Ax67, j1By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j1Ax65, j1By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j1Ax65, j1By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j1Ax66, j1By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j1Ax66, j1By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j1Ax64, j1By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j1Ax64, j1By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j1Ax03, j1By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j1Ax03, j1By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j1Ax01, j1By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j1Ax01, j1By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j1Ax02, j1By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j1Ax02, j1By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j1Ax00, j1By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j1Ax00, j1By66, cx00y66;
+--:-:-:-:1  @P0 IADD track0, track0, ldx8;
+--:-:-:-:0  @P0 IADD track4, track4, ldx8;
+--:-:-:Y:5  @P0 BRA LOOP;
+
+// Main loop is done, time to write C to global memory.
+--:-:-:-:1      SHR.U32 cx, tid128, 2;
+--:-:-:-:1      MOV ldc, c[0x0][0x158];
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+--:-:-:-:1      MOV alpha, c[0x0][0x15c];
+--:-:-:-:1      SHL readCs, tid96, 4;
+--:-:-:-:1      LOP.AND readAs, readAs, 0xfff;
+--:-:-:-:1      LOP.OR cx, tid31, cx;
+--:-:-:-:1      SHL ldc1, ldc, 2; // ldc1 = ldc * 4: byte step STORE_C adds to each Cy pointer per call
+--:-:-:-:1      LOP.AND readBs, readBs, 0xfff;
+--:-:-:-:1      ISCADD cy00, by, cy00, 7; // cy00 = by * 128 + cy00
+--:-:-:-:1      FMUL cs0, cx00y00, alpha; // cs0-7 = alpha * cx*y00, consumed by the first CAL STORE_C
+--:-:-:-:1      SHL ldc4, ldc, 4; // ldc4 = ldc * 16 = 4 * ldc1
+--:-:-:-:1      LOP.OR readCs, readCs, cx;
+--:-:-:-:1      ISCADD cx, bx, cx, 7; // cx = bx * 128 + cx
+--:-:-:-:1      FMUL cs1, cx01y00, alpha;
+--:-:-:-:1      SHL ldc8, ldc, 5; // ldc8 = ldc * 32 = 8 * ldc1
+--:-:-:-:1      XMAD.MRG xmad_ci, cy00, ldc.H1, RZ;
+--:-:-:-:1      ISCADD writeCs, readBs, readAs, 5; // writeCs = readBs * 32 + readAs (both masked to 12 bits above)
+--:-:-:-:1      FMUL cs2, cx02y00, alpha;
+--:-:-:-:1      SHL readCs, readCs, 2; // Scale readCs by 4
+--:-:-:-:1      XMAD ci, cy00, ldc, cx;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx +  0 < m
+--:-:-:-:1      IADD cx, cx, 64;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8; // ldc60 = ldc * 256 - ldc4 = 60 * ldc1
+--:-:-:-:1      FMUL cs3, cx03y00, alpha;
+--:-:-:-:1      FMUL cs4, cx64y00, alpha;
+--:-:-:-:1      XMAD.PSL.CBCC ci, cy00.H1, xmad_ci.H1, ci;
+--:-:-:-:1      IADD cy00, cy00, -1; // Start one below: STORE_C increments cy00 before testing it
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 64 < m
+--:-:-:-:1      FMUL cs5, cx65y00, alpha;
+--:-:-:-:1      FMUL cs6, cx66y00, alpha;
+--:-:-:-:1      FMUL cs7, cx67y00, alpha;
+--:-:-:-:1      ISCADD Cy00, ci, c[0x0][0x140], 2; // Cy00 = c[0x0][0x140] + ci * 4
+--:-:-:-:1      IADD cy04, cy00, 4;
+--:-:-:-:1      IADD cy08, cy00, 8;
+--:-:-:-:3      IADD cy12, cy00, 12;
+--:-:-:Y:6      IADD Cy00, Cy00, -ldc1; // Start one step back: STORE_C adds ldc1 before storing
+--:-:-:-:1      IADD Cy04, Cy00, ldc4;
+--:-:-:Y:5      IADD Cy08, Cy00, ldc8;
+--:-:-:-:0      IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering)
+
+// There's nothing yet in place to handle dependencies with subroutines.
+// So don't schedule this block.
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y01, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y01, alpha;
+--:-:-:-:1      FMUL cs2, cx02y01, alpha;
+--:-:-:-:1      FMUL cs3, cx03y01, alpha;
+--:-:-:-:1      FMUL cs4, cx64y01, alpha;
+--:-:-:-:1      FMUL cs5, cx65y01, alpha;
+--:-:-:-:1      FMUL cs6, cx66y01, alpha;
+--:-:-:-:0      FMUL cs7, cx67y01, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y02, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y02, alpha;
+--:-:-:-:1      FMUL cs2, cx02y02, alpha;
+--:-:-:-:1      FMUL cs3, cx03y02, alpha;
+--:-:-:-:1      FMUL cs4, cx64y02, alpha;
+--:-:-:-:1      FMUL cs5, cx65y02, alpha;
+--:-:-:-:1      FMUL cs6, cx66y02, alpha;
+--:-:-:-:0      FMUL cs7, cx67y02, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y03, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y03, alpha;
+--:-:-:-:1      FMUL cs2, cx02y03, alpha;
+--:-:-:-:1      FMUL cs3, cx03y03, alpha;
+--:-:-:-:1      FMUL cs4, cx64y03, alpha;
+--:-:-:-:1      FMUL cs5, cx65y03, alpha;
+--:-:-:-:1      FMUL cs6, cx66y03, alpha;
+--:-:-:-:0      FMUL cs7, cx67y03, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+--:-:-:-:1      IADD cy00, cy00, 60;
+--:-:-:-:1      IADD cy04, cy04, 60;
+--:-:-:-:1      IADD cy08, cy08, 60;
+--:-:-:-:1      IADD cy12, cy12, 60;
+
+02:-:-:-:1      IADD Cy00, Cy00, ldc60; // Wait Dep 2
+--:-:-:-:1      IADD Cy04, Cy04, ldc60;
+--:-:-:-:1      IADD Cy08, Cy08, ldc60;
+--:-:-:-:1      IADD Cy12, Cy12, ldc60;
+
+--:-:-:-:1      FMUL cs0, cx00y64, alpha;
+--:-:-:-:1      FMUL cs1, cx01y64, alpha;
+--:-:-:-:1      FMUL cs2, cx02y64, alpha;
+--:-:-:-:1      FMUL cs3, cx03y64, alpha;
+--:-:-:-:1      FMUL cs4, cx64y64, alpha;
+--:-:-:-:1      FMUL cs5, cx65y64, alpha;
+--:-:-:-:1      FMUL cs6, cx66y64, alpha;
+--:-:-:-:0      FMUL cs7, cx67y64, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y65, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y65, alpha;
+--:-:-:-:1      FMUL cs2, cx02y65, alpha;
+--:-:-:-:1      FMUL cs3, cx03y65, alpha;
+--:-:-:-:1      FMUL cs4, cx64y65, alpha;
+--:-:-:-:1      FMUL cs5, cx65y65, alpha;
+--:-:-:-:1      FMUL cs6, cx66y65, alpha;
+--:-:-:-:0      FMUL cs7, cx67y65, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y66, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y66, alpha;
+--:-:-:-:1      FMUL cs2, cx02y66, alpha;
+--:-:-:-:1      FMUL cs3, cx03y66, alpha;
+--:-:-:-:1      FMUL cs4, cx64y66, alpha;
+--:-:-:-:1      FMUL cs5, cx65y66, alpha;
+--:-:-:-:1      FMUL cs6, cx66y66, alpha;
+--:-:-:-:0      FMUL cs7, cx67y66, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y67, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y67, alpha;
+--:-:-:-:1      FMUL cs2, cx02y67, alpha;
+--:-:-:-:1      FMUL cs3, cx03y67, alpha;
+--:-:-:-:1      FMUL cs4, cx64y67, alpha;
+--:-:-:-:1      FMUL cs5, cx65y67, alpha;
+--:-:-:-:1      FMUL cs6, cx66y67, alpha;
+--:-:-:-:0      FMUL cs7, cx67y67, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+
+// And we're done.  The remainder is the STORE_C subroutine that's defined at the end of the kernel.
+--:-:-:-:5      EXIT;
+
+// This routine does warp synchronous shuffling of our output data so as to be able
+// to have coalesced writes to global memory.  This is actually faster because the shared
+// memory latencies can be hidden by other warps and we're only adding a few extra clocks
+// to this thread.  Global memory here is the bottleneck and being able to halve the needed
+// bandwidth at the expense of a few clocks is a modest win.  This also keeps power lower
+// and our chip running faster.
+
+// Note, the SHFL instruction doesn't help us here because we're swapping different registers
+// from different threads.
+STORE_C:
+
+--:-:-:-:0      IADD cy00, cy00, 1; // cy00/cy04/cy08/cy12 each advance by one per call
+--:-:-:-:1      STS.128 [writeCs+4x<00>], cs0; // Stage 128 bits starting at cs0 at writeCs
+--:-:-:-:0      IADD cy04, cy04, 1;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], cs4; // Stage 128 bits starting at cs4 at writeCs + 4*64 bytes
+--:-:-:-:0      IADD cy08, cy08, 1;
+--:-:-:-:1      LDS cs0, [readCs + 4x<0*128 + 00>]; // cs0-7 are reloaded through readCs, replacing the staged values
+--:-:-:-:0      IADD cy12, cy12, 1;
+--:-:-:-:1      LDS cs1, [readCs + 4x<0*128 + 64>];
+--:-:-:-:0      IADD Cy00, Cy00, ldc1; // Cy00/Cy04/Cy08/Cy12 each advance by ldc1 per call
+--:-:-:-:1      LDS cs2, [readCs + 4x<1*128 + 00>];
+--:-:-:-:0      IADD Cy04, Cy04, ldc1;
+--:-:-:-:1      LDS cs3, [readCs + 4x<1*128 + 64>];
+--:-:-:-:0      IADD Cy08, Cy08, ldc1;
+--:-:-:-:1      LDS cs4, [readCs + 4x<2*128 + 00>];
+--:-:-:-:0      IADD Cy12, Cy12, ldc1;
+--:-:-:-:1      LDS cs5, [readCs + 4x<2*128 + 64>];
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx +  0 < m
+--:-:-:-:1      LDS cs6, [readCs + 4x<3*128 + 00>];
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 64 < m
+--:-:1:-:1      LDS cs7, [readCs + 4x<3*128 + 64>]; // Set Dep 1
+--:-:-:-:2      ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx +  0 < m
+--:-:-:Y:7      ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 64 < m
+01:-:-:-:1  @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx +  0 < m
+--:-:-:-:1  @P1 STG.CG [Cy00 + 4x<64>], cs1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 64 < m
+--:-:-:-:1  @P2 STG.CG [Cy04 + 4x<00>], cs2;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx +  0 < m
+--:-:-:-:1  @P3 STG.CG [Cy04 + 4x<64>], cs3;
+--:-:-:Y:7      ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 64 < m
+--:-:-:-:2  @P0 STG.CG [Cy08 + 4x<00>], cs4;
+--:-:-:-:2  @P1 STG.CG [Cy08 + 4x<64>], cs5;
+--:-:-:-:2  @P2 STG.CG [Cy12 + 4x<00>], cs6;
+--:2:-:-:1  @P3 STG.CG [Cy12 + 4x<64>], cs7; // Set Dep 2
+
+--:-:-:-:5      RET; // Callers wait on Dep 2 before overwriting cs0-7 or the Cy pointers
+
diff --git a/Assembler/MaxAs/sgemm/sgemm_pre_64.sass b/Assembler/MaxAs/sgemm/sgemm_pre_64.sass
new file mode 100644
index 0000000..aa2719e
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm_pre_64.sass
@@ -0,0 +1,867 @@
+# Kernel: sgemm_kernel_64
+#
+# SharedSize: 8192
+# Params(9):
+#   0:0x140:4:4 param_C,
+#   1:0x144:4:0 param_m,
+#   2:0x148:4:0 param_n,
+#   3:0x14c:4:0 param_k,
+#   4:0x150:4:0 param_lda,
+#   5:0x154:4:0 param_ldb,
+#   6:0x158:4:0 param_ldc
+#   7:0x15c:4:0 param_alpha
+#   8:0x160:4:4 param_D // for diagnostic printf output
+#
+# Globals:
+#   c[0x0][0x164]: texA (the value is 1)
+#   c[0x0][0x168]: texB (the value is 0)
+
+<REGISTER_MAPPING>
+
+    0-63    ~ blk, ldx, ldx4, k, tid1, tid2, tid15, tid15_4, xmad_t0, xmad_end
+
+    80      : zOffset
+    0-63    : cz<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx00y<00-03|32-35>
+     7, 6,15,14,23,22,31,30 : cx01y<00-03|32-35>
+     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|32-35>
+     5, 4,13,12,21,20,29,28 : cx03y<00-03|32-35>
+    35,34,43,42,51,50,59,58 : cx32y<00-03|32-35>
+    39,38,47,46,55,54,63,62 : cx33y<00-03|32-35>
+    33,32,41,40,49,48,57,56 : cx34y<00-03|32-35>
+    37,36,45,44,53,52,61,60 : cx35y<00-03|32-35>
+
+    64-79   : j0Ax<00-03|32-35>, j0By<00-03|32-35>
+    80-95   : j1Ax<00-03|32-35>, j1By<00-03|32-35>
+
+    64-71   : cs<0-7>
+
+    96-111  : loadX0<0-3>, loadX2<0-3>, loadX4<0-3>, loadX6<0-3>
+
+    112-127 ~ track<0|2|4|6>[0], tex[1], readAs[2], readBs[3], writeS[2], end, ldx8, tid, bx, by, tid31, tid32
+
+    72-111  ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc28, writeCs, readCs, cx, ci, xmad_ci, alpha, xmadD, D, blckDimX, gridDimX
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid, SR_TID.X;   // Set Dep 1
+--:-:2:-:1      S2R bx,  SR_CTAID.X; // Set Dep 2
+--:-:3:-:1      S2R by,  SR_CTAID.Y; // Set Dep 3
+// Prologue: derive per-thread indices from tid. P0 (tid >= 32) takes the ldb/texB path, !P0 the lda/texA path.
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT; // Wait Dep 1
+--:-:-:-:1      LOP.AND tid15, tid, 15;
+--:-:-:-:1      BFE.U32 tid2, tid, 0x104; // 1 bit at position 4
+--:-:-:-:1      MOV k, c[0x0][0x14c]; // param_k
+--:-:-:-:1      BFE.U32 readAs, tid, 0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.AND readBs, tid, 0x30;
+--:-:-:-:1      LOP.AND tid1, tid, 1;
+--:-:-:-:1      SHL tid15_4, tid15, 4;
+--:-:-:-:1      LOP.AND zOffset, tid, -32; // tid with its low 5 bits cleared
+--:-:-:-:1      IADD k, k, -8;
+--:-:-:-:1      SHL readAs, readAs, 4;
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:0 @!P0 MOV ldx4, c[0x0][0x150]; // param_lda
+--:-:-:-:1      STS.128 [zOffset + 4x<16*64>], RZ; // read back by the cz loads below (presumably to zero the accumulators)
+--:-:-:-:1  @P0 MOV ldx4, c[0x0][0x154]; // param_ldb
+--:-:-:-:1      ISCADD writeS, tid2, tid15_4, 8;
+06:-:-:-:1      SEL blk, by, bx, P0;              // Wait Dep 2 & 3
+--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA
+--:-:-:-:1  @P0 MOV32I tex, 0x80000000; // texB
+--:-:-:-:1      LOP.OR readBs, readBs, tid1;
+--:-:-:-:1      SHR.U32 ldx, ldx4, 2; // ldx = ldx4 / 4
+--:-:-:-:1      LOP.AND tid32, tid, 32;
+--:-:-:-:1      ISCADD track0, blk, tid15, 4;
+--:-:-:-:1      IADD ldx8, ldx4, ldx4; // ldx8 = 2 * ldx4
+--:-:-:-:1  @P0 IADD writeS, writeS, 4x<8*64>;
+--:-:-:-:1      ISCADD readBs, readBs, 4x<8*64>, 4;
+--:-:-:-:1      XMAD.MRG xmad_t0, ldx, tid2.H1, RZ;
+--:-:-:-:1      XMAD.MRG xmad_end, k, ldx.H1, RZ;
+--:-:-:Y:6      XMAD track0, ldx, tid2, track0;
+--:-:-:-:2      XMAD.PSL.CBCC track0, ldx.H1, xmad_t0.H1, track0; // NOTE(review): XMAD triple presumably yields track0 += ldx * tid2 - confirm
+--:-:1:-:4      TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1
+--:-:-:-:1      IADD3 track2, track0, ldx, ldx; // track2 = track0 + 2 * ldx
+--:-:-:-:1      IADD track4, track0, ldx4;
+--:-:2:-:1      TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2
+--:-:-:-:0      XMAD end, k, ldx, track0;
+--:-:3:-:3      TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3
+--:-:-:-:2      IADD track6, track2, ldx4;
+--:-:4:-:1      TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 4
+--:-:-:-:1      XMAD.PSL.CBCC end, k.H1, xmad_end.H1, end; // NOTE(review): presumably end = k * ldx + track0 (k already reduced by 8) - confirm
+// cz00-cz63 are all loaded from the address that RZ was stored to above; each load sets Dep 5.
+--:-:5:-:1      LDS.U.128 cz00, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz04, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz08, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz12, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz16, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz20, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz24, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz28, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz32, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz36, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz40, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz44, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz48, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz52, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz56, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz60, [zOffset + 4x<16*64>];
+// Store each texture result at writeS once its barrier clears, then advance the matching track register by ldx8.
+01:-:-:-:1      STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 1
+--:-:-:-:0      IADD track0, track0, ldx8;
+02:-:-:-:1      STS.128 [writeS + 4x<2*64>], loadX2; // Wait Dep 2
+--:-:-:-:0      IADD track2, track2, ldx8;
+04:-:-:-:1      STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3
+--:-:-:-:0      IADD track4, track4, ldx8;
+08:-:-:-:1      STS.128 [writeS + 4x<6*64>], loadX6; // Wait Dep 4
+--:-:-:-:0      IADD track6, track6, ldx8;
+10:-:-:-:5      BAR.SYNC 0; // Wait Dep 5
+
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<16*64>; // toggle the 4x<16*64> offset used by the next stores
+// Preload the j0 operands consumed at the top of LOOP; the last load sets Dep 1.
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1
+
+// Efficiency:
+// ffma: 512
+// lds:  32 dual issued
+// sts:  4  dual issued
+// tex:  4  dual issued
+// add:  4
+// xor:  3
+// setp: 1
+// bar:  1  dual issued
+// bra:  1  dual issued
+// Total: 520 (512/520 = 98.5% FFMA)
+
+LOOP:
+
+// Loop end condition
+--:-:-:-:1      ISETP.LE.AND P0, PT, track0, end, PT;
+
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<1*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<1*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax32, [readAs + 4x<1*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By32, [readBs + 4x<1*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j0Ax34, j0By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j0Ax34, j0By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j0Ax32, j0By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j0Ax32, j0By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j0Ax35, j0By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j0Ax35, j0By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j0Ax33, j0By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j0Ax33, j0By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j0Ax35, j0By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j0Ax35, j0By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j0Ax33, j0By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j0Ax33, j0By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j0Ax34, j0By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j0Ax34, j0By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j0Ax32, j0By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j0Ax32, j0By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:0      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:-:1  @P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf;
+--:-:-:Y:1      FFMA cx02y32, j0Ax02, j0By32, cx02y32;
+--:-:-:-:0      FFMA cx02y33, j0Ax02, j0By33, cx02y33;
+--:-:2:-:1  @P0 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2
+--:-:-:-:1      FFMA cx00y33, j0Ax00, j0By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j0Ax00, j0By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j0Ax03, j0By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j0Ax03, j0By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j0Ax01, j0By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j0Ax01, j0By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j0Ax34, j0By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j0Ax34, j0By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j0Ax32, j0By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j0Ax32, j0By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j0Ax35, j0By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j0Ax35, j0By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j0Ax33, j0By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j0Ax33, j0By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j0Ax35, j0By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j0Ax35, j0By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j0Ax33, j0By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j0Ax33, j0By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j0Ax34, j0By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j0Ax34, j0By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j0Ax32, j0By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j0Ax32, j0By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j0Ax03, j0By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j0Ax03, j0By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j0Ax01, j0By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j0Ax01, j0By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j0Ax02, j0By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j0Ax02, j0By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j0Ax00, j0By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j0Ax00, j0By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<2*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<2*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<2*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<2*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j1Ax34, j1By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j1Ax34, j1By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j1Ax32, j1By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j1Ax32, j1By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j1Ax35, j1By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j1Ax35, j1By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j1Ax33, j1By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j1Ax33, j1By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j1Ax35, j1By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j1Ax35, j1By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j1Ax33, j1By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j1Ax33, j1By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j1Ax34, j1By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j1Ax34, j1By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j1Ax32, j1By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j1Ax32, j1By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:0      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:-:1  @P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf;
+--:-:-:Y:1      FFMA cx02y32, j1Ax02, j1By32, cx02y32;
+--:-:-:-:0      FFMA cx02y33, j1Ax02, j1By33, cx02y33;
+--:-:3:-:1  @P0 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 3
+--:-:-:-:1      FFMA cx00y33, j1Ax00, j1By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j1Ax00, j1By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j1Ax03, j1By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j1Ax03, j1By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j1Ax01, j1By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j1Ax01, j1By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j1Ax34, j1By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j1Ax34, j1By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j1Ax32, j1By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j1Ax32, j1By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j1Ax35, j1By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j1Ax35, j1By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j1Ax33, j1By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j1Ax33, j1By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j1Ax35, j1By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j1Ax35, j1By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j1Ax33, j1By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j1Ax33, j1By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j1Ax34, j1By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j1Ax34, j1By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j1Ax32, j1By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j1Ax32, j1By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j1Ax03, j1By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j1Ax03, j1By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j1Ax01, j1By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j1Ax01, j1By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j1Ax02, j1By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j1Ax02, j1By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j1Ax00, j1By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j1Ax00, j1By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<3*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<3*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax32, [readAs + 4x<3*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By32, [readBs + 4x<3*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j0Ax34, j0By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j0Ax34, j0By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j0Ax32, j0By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j0Ax32, j0By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j0Ax35, j0By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j0Ax35, j0By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j0Ax33, j0By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j0Ax33, j0By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j0Ax35, j0By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j0Ax35, j0By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j0Ax33, j0By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j0Ax33, j0By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j0Ax34, j0By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j0Ax34, j0By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j0Ax32, j0By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j0Ax32, j0By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j0Ax02, j0By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j0Ax02, j0By33, cx02y33;
+--:-:-:-:1      FFMA cx00y33, j0Ax00, j0By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j0Ax00, j0By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j0Ax03, j0By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j0Ax03, j0By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j0Ax01, j0By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j0Ax01, j0By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j0Ax34, j0By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j0Ax34, j0By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j0Ax32, j0By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j0Ax32, j0By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j0Ax35, j0By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j0Ax35, j0By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j0Ax33, j0By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j0Ax33, j0By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j0Ax35, j0By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j0Ax35, j0By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j0Ax33, j0By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j0Ax33, j0By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j0Ax34, j0By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j0Ax34, j0By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j0Ax32, j0By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j0Ax32, j0By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j0Ax03, j0By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j0Ax03, j0By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j0Ax01, j0By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j0Ax01, j0By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j0Ax02, j0By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j0Ax02, j0By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j0Ax00, j0By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j0Ax00, j0By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<4*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<4*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<4*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<4*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j1Ax34, j1By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j1Ax34, j1By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j1Ax32, j1By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j1Ax32, j1By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j1Ax35, j1By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j1Ax35, j1By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j1Ax33, j1By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j1Ax33, j1By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j1Ax35, j1By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j1Ax35, j1By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j1Ax33, j1By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j1Ax33, j1By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j1Ax34, j1By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j1Ax34, j1By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j1Ax32, j1By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j1Ax32, j1By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j1Ax02, j1By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j1Ax02, j1By33, cx02y33;
+--:-:-:-:1      FFMA cx00y33, j1Ax00, j1By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j1Ax00, j1By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j1Ax03, j1By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j1Ax03, j1By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j1Ax01, j1By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j1Ax01, j1By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j1Ax34, j1By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j1Ax34, j1By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j1Ax32, j1By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j1Ax32, j1By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j1Ax35, j1By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j1Ax35, j1By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j1Ax33, j1By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j1Ax33, j1By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j1Ax35, j1By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j1Ax35, j1By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j1Ax33, j1By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j1Ax33, j1By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j1Ax34, j1By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j1Ax34, j1By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j1Ax32, j1By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j1Ax32, j1By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j1Ax03, j1By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j1Ax03, j1By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j1Ax01, j1By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j1Ax01, j1By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j1Ax02, j1By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j1Ax02, j1By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j1Ax00, j1By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j1Ax00, j1By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<5*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<5*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax32, [readAs + 4x<5*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By32, [readBs + 4x<5*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j0Ax34, j0By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j0Ax34, j0By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j0Ax32, j0By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j0Ax32, j0By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j0Ax35, j0By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j0Ax35, j0By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j0Ax33, j0By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j0Ax33, j0By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j0Ax35, j0By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j0Ax35, j0By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j0Ax33, j0By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j0Ax33, j0By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j0Ax34, j0By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j0Ax34, j0By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j0Ax32, j0By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j0Ax32, j0By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j0Ax02, j0By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j0Ax02, j0By33, cx02y33;
+--:-:-:-:1      FFMA cx00y33, j0Ax00, j0By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j0Ax00, j0By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j0Ax03, j0By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j0Ax03, j0By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j0Ax01, j0By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j0Ax01, j0By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j0Ax34, j0By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j0Ax34, j0By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j0Ax32, j0By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j0Ax32, j0By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j0Ax35, j0By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j0Ax35, j0By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j0Ax33, j0By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j0Ax33, j0By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j0Ax35, j0By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j0Ax35, j0By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j0Ax33, j0By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j0Ax33, j0By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j0Ax34, j0By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j0Ax34, j0By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j0Ax32, j0By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j0Ax32, j0By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j0Ax03, j0By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j0Ax03, j0By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j0Ax01, j0By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j0Ax01, j0By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j0Ax02, j0By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j0Ax02, j0By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j0Ax00, j0By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j0Ax00, j0By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<6*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<6*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<6*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<6*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j1Ax34, j1By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j1Ax34, j1By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j1Ax32, j1By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j1Ax32, j1By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j1Ax35, j1By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j1Ax35, j1By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j1Ax33, j1By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j1Ax33, j1By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j1Ax35, j1By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j1Ax35, j1By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j1Ax33, j1By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j1Ax33, j1By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j1Ax34, j1By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j1Ax34, j1By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j1Ax32, j1By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j1Ax32, j1By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:0      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+02:-:-:-:1  @P0 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 2
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j1Ax02, j1By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j1Ax02, j1By33, cx02y33;
+--:-:-:-:0      FFMA cx00y33, j1Ax00, j1By33, cx00y33;
+--:-:-:-:1  @P0 STS.128 [writeS + 4x<2*64>], loadX2;
+--:-:-:-:1      FFMA cx00y32, j1Ax00, j1By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j1Ax03, j1By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j1Ax03, j1By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j1Ax01, j1By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j1Ax01, j1By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j1Ax34, j1By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j1Ax34, j1By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j1Ax32, j1By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j1Ax32, j1By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j1Ax35, j1By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j1Ax35, j1By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j1Ax33, j1By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j1Ax33, j1By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j1Ax35, j1By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j1Ax35, j1By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j1Ax33, j1By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j1Ax33, j1By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j1Ax34, j1By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j1Ax34, j1By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j1Ax32, j1By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j1Ax32, j1By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j1Ax03, j1By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j1Ax03, j1By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j1Ax01, j1By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j1Ax01, j1By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j1Ax02, j1By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j1Ax02, j1By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j1Ax00, j1By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j1Ax00, j1By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<7*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<7*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax32, [readAs + 4x<7*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By32, [readBs + 4x<7*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j0Ax34, j0By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j0Ax34, j0By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j0Ax32, j0By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j0Ax32, j0By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j0Ax35, j0By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j0Ax35, j0By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j0Ax33, j0By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j0Ax33, j0By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j0Ax35, j0By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j0Ax35, j0By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j0Ax33, j0By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j0Ax33, j0By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j0Ax34, j0By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j0Ax34, j0By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j0Ax32, j0By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j0Ax32, j0By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:0      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+04:-:-:-:1  @P0 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j0Ax02, j0By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j0Ax02, j0By33, cx02y33;
+--:-:-:-:0      FFMA cx00y33, j0Ax00, j0By33, cx00y33;
+--:-:-:-:1  @P0 STS.128 [writeS + 4x<6*64>], loadX6;
+--:-:-:-:1      FFMA cx00y32, j0Ax00, j0By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j0Ax03, j0By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j0Ax03, j0By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j0Ax01, j0By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j0Ax01, j0By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j0Ax34, j0By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j0Ax34, j0By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j0Ax32, j0By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j0Ax32, j0By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j0Ax35, j0By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j0Ax35, j0By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j0Ax33, j0By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j0Ax33, j0By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j0Ax35, j0By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j0Ax35, j0By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j0Ax33, j0By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j0Ax33, j0By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j0Ax34, j0By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j0Ax34, j0By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j0Ax32, j0By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j0Ax32, j0By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j0Ax03, j0By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j0Ax03, j0By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j0Ax01, j0By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j0Ax01, j0By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j0Ax02, j0By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j0Ax02, j0By35, cx02y35;
+--:-:-:-:0      FFMA cx00y35, j0Ax00, j0By35, cx00y35;
+01:-:-:-:5      BAR.SYNC 0;                            // Wait Dep 1
+--:-:-:-:1  @P0 LOP.XOR readAs, readAs, 4x<16*64>;
+--:-:-:-:1  @P0 LOP.XOR readBs, readBs, 4x<16*64>;
+--:-:-:-:1  @P0 LOP.XOR writeS, writeS, 4x<16*64>;
+--:-:-:-:1      FFMA cx00y34, j0Ax00, j0By34, cx00y34;
+--:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00;
+--:-:-:-:1  @P0 LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1  @P0 LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1  @P0 LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1  @P0 LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j1Ax34, j1By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j1Ax34, j1By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j1Ax32, j1By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j1Ax32, j1By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j1Ax35, j1By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j1Ax35, j1By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j1Ax33, j1By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j1Ax33, j1By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j1Ax35, j1By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j1Ax35, j1By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j1Ax33, j1By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j1Ax33, j1By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j1Ax34, j1By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j1Ax34, j1By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j1Ax32, j1By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j1Ax32, j1By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j1Ax02, j1By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j1Ax02, j1By33, cx02y33;
+--:-:-:-:1      FFMA cx00y33, j1Ax00, j1By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j1Ax00, j1By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j1Ax03, j1By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j1Ax03, j1By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j1Ax01, j1By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j1Ax01, j1By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j1Ax34, j1By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j1Ax34, j1By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j1Ax32, j1By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j1Ax32, j1By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j1Ax35, j1By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j1Ax35, j1By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j1Ax33, j1By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j1Ax33, j1By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j1Ax35, j1By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j1Ax35, j1By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j1Ax33, j1By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j1Ax33, j1By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j1Ax34, j1By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j1Ax34, j1By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j1Ax32, j1By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j1Ax32, j1By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j1Ax03, j1By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j1Ax03, j1By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j1Ax01, j1By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j1Ax01, j1By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j1Ax02, j1By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j1Ax02, j1By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j1Ax00, j1By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j1Ax00, j1By34, cx00y34;
+--:-:-:-:1  @P0 IADD track0, track0, ldx8;
+--:-:-:-:1  @P0 IADD track2, track2, ldx8;
+--:-:-:-:1  @P0 IADD track4, track4, ldx8;
+--:-:-:-:0  @P0 IADD track6, track6, ldx8;
+--:-:-:Y:5  @P0 BRA LOOP;
+
+--:-:-:-:1      SHR.U32 cy00, tid32, 1; // Epilogue after LOOP: derive output indices/addresses, scale y00 by alpha. cy00 = tid32 >> 1
+--:-:-:-:1      MOV ldc, c[0x0][0x158]; // ldc from constant bank 0 (presumably a kernel parameter - confirm)
+--:-:-:-:1      ISCADD cx, bx, tid31, 6; // cx = (bx << 6) + tid31
+--:-:-:-:1      MOV alpha, c[0x0][0x15c]; // alpha from constant bank 0 (presumably a kernel parameter - confirm)
+--:-:-:-:1      ISCADD readCs, tid32, tid31, 3; // readCs = (tid32 << 3) + tid31
+--:-:-:-:1      LOP.AND readAs, readAs, 0x7ff; // keep only the low 11 bits of readAs
+--:-:-:-:1      ISCADD cy00, by, cy00, 6; // cy00 = (by << 6) + cy00
+--:-:-:-:1      LOP.AND readBs, readBs, 0x7ff; // keep only the low 11 bits of readBs
+--:-:-:-:1      SHL ldc1, ldc, 2; // ldc1 = ldc * 4
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx +  0 < m
+--:-:-:-:1      SHL ldc4, ldc, 4; // ldc4 = ldc * 16 = 4 * ldc1
+--:-:-:-:1      FMUL cs0, cx00y00, alpha; // cs0..cs7 = alpha * cx??y00 accumulators, interleaved with the setup below
+--:-:-:-:1      SHL ldc8, ldc, 5; // ldc8 = ldc * 32 = 8 * ldc1
+--:-:-:-:1      XMAD.MRG xmad_ci, cy00, ldc.H1, RZ; // first of three XMADs; looks like the usual 32-bit multiply-add idiom (ci = cy00 * ldc + cx) - confirm
+--:-:-:-:1      ISCADD writeCs, readBs, readAs, 4; // writeCs = (readBs << 4) + readAs (both already masked above)
+--:-:-:-:1      XMAD ci, cy00, ldc, cx; // uses cx before the +32 below
+--:-:-:-:1      SHL readCs, readCs, 2; // readCs *= 4
+--:-:-:-:1      IADD cx, cx, 32; // cx now refers to the second column tested by P6
+--:-:-:-:1      ISCADD ldc28, ldc, -ldc4, 7; // ldc28 = (ldc << 7) - ldc4 = 28 * ldc1
+--:-:-:-:1      FMUL cs1, cx01y00, alpha;
+--:-:-:-:1      FMUL cs2, cx02y00, alpha;
+--:-:-:-:1      XMAD.PSL.CBCC ci, cy00.H1, xmad_ci.H1, ci; // last of the three XMADs producing ci
+--:-:-:-:1      IADD cy00, cy00, -1; // pre-decrement: STORE_C adds 1 to cy00 before testing it
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 32 < m
+--:-:-:-:1      FMUL cs3, cx03y00, alpha;
+--:-:-:-:1      FMUL cs4, cx32y00, alpha;
+--:-:-:-:1      FMUL cs5, cx33y00, alpha;
+--:-:-:-:1      ISCADD Cy00, ci, c[0x0][0x140], 2; // Cy00 = (ci << 2) + c[0x0][0x140] (presumably the C base address - confirm)
+--:-:-:-:1      IADD cy04, cy00, 4; // cy04/cy08/cy12 inherit the -1 bias from cy00
+--:-:-:-:1      IADD cy08, cy00, 8;
+--:-:-:-:1      IADD cy12, cy00, 12;
+--:-:-:-:1      FMUL cs6, cx34y00, alpha;
+--:-:-:-:1      FMUL cs7, cx35y00, alpha;
+--:-:-:Y:6      IADD Cy00, Cy00, -ldc1; // pre-subtract one ldc1 step: STORE_C adds ldc1 to Cy00 before storing
+--:-:-:-:1      IADD Cy04, Cy00, ldc4; // Cy04 = Cy00 + 4 * ldc1
+--:-:-:Y:5      IADD Cy08, Cy00, ldc8; // Cy08 = Cy00 + 8 * ldc1
+--:-:-:-:0      IADD Cy12, Cy04, ldc8; // Cy12 = Cy00 + 12 * ldc1. Dual Issue (last instruction after reordering)
+
+--:-:-:-:5      CAL STORE_C; // emit the alpha-scaled cx??y00 values
+
+02:-:-:-:1      FMUL cs0, cx00y01, alpha; // Wait Dep 2 (set by the last STG in STORE_C, which still reads cs*)
+--:-:-:-:1      FMUL cs1, cx01y01, alpha;
+--:-:-:-:1      FMUL cs2, cx02y01, alpha;
+--:-:-:-:1      FMUL cs3, cx03y01, alpha;
+--:-:-:-:1      FMUL cs4, cx32y01, alpha;
+--:-:-:-:1      FMUL cs5, cx33y01, alpha;
+--:-:-:-:1      FMUL cs6, cx34y01, alpha;
+--:-:-:-:0      FMUL cs7, cx35y01, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C; // emit the alpha-scaled cx??y01 values
+
+02:-:-:-:1      FMUL cs0, cx00y02, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y02, alpha;
+--:-:-:-:1      FMUL cs2, cx02y02, alpha;
+--:-:-:-:1      FMUL cs3, cx03y02, alpha;
+--:-:-:-:1      FMUL cs4, cx32y02, alpha;
+--:-:-:-:1      FMUL cs5, cx33y02, alpha;
+--:-:-:-:1      FMUL cs6, cx34y02, alpha;
+--:-:-:-:0      FMUL cs7, cx35y02, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C; // emit the alpha-scaled cx??y02 values
+
+02:-:-:-:1      FMUL cs0, cx00y03, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y03, alpha;
+--:-:-:-:1      FMUL cs2, cx02y03, alpha;
+--:-:-:-:1      FMUL cs3, cx03y03, alpha;
+--:-:-:-:1      FMUL cs4, cx32y03, alpha;
+--:-:-:-:1      FMUL cs5, cx33y03, alpha;
+--:-:-:-:1      FMUL cs6, cx34y03, alpha;
+--:-:-:-:0      FMUL cs7, cx35y03, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C; // emit the alpha-scaled cx??y03 values
+
+--:-:-:-:1      IADD cy00, cy00, 28; // STORE_C adds 1 per call: 4 calls + 28 = 32 in total before the y32 group
+--:-:-:-:1      IADD cy04, cy04, 28;
+--:-:-:-:1      IADD cy08, cy08, 28;
+--:-:-:-:1      IADD cy12, cy12, 28;
+
+02:-:-:-:1      IADD Cy00, Cy00, ldc28; // Wait Dep 2; ldc28 = 28 * ldc1, so with the four ldc1 steps so far the total is 32 * ldc1
+--:-:-:-:1      IADD Cy04, Cy04, ldc28;
+--:-:-:-:1      IADD Cy08, Cy08, ldc28;
+--:-:-:-:1      IADD Cy12, Cy12, ldc28;
+
+--:-:-:-:1      FMUL cs0, cx00y32, alpha; // no wait flag here: the IADD to Cy00 above already waited on Dep 2
+--:-:-:-:1      FMUL cs1, cx01y32, alpha;
+--:-:-:-:1      FMUL cs2, cx02y32, alpha;
+--:-:-:-:1      FMUL cs3, cx03y32, alpha;
+--:-:-:-:1      FMUL cs4, cx32y32, alpha;
+--:-:-:-:1      FMUL cs5, cx33y32, alpha;
+--:-:-:-:1      FMUL cs6, cx34y32, alpha;
+--:-:-:-:0      FMUL cs7, cx35y32, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C; // emit the alpha-scaled cx??y32 values
+
+02:-:-:-:1      FMUL cs0, cx00y33, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y33, alpha;
+--:-:-:-:1      FMUL cs2, cx02y33, alpha;
+--:-:-:-:1      FMUL cs3, cx03y33, alpha;
+--:-:-:-:1      FMUL cs4, cx32y33, alpha;
+--:-:-:-:1      FMUL cs5, cx33y33, alpha;
+--:-:-:-:1      FMUL cs6, cx34y33, alpha;
+--:-:-:-:0      FMUL cs7, cx35y33, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C; // emit the alpha-scaled cx??y33 values
+
+02:-:-:-:1      FMUL cs0, cx00y34, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y34, alpha;
+--:-:-:-:1      FMUL cs2, cx02y34, alpha;
+--:-:-:-:1      FMUL cs3, cx03y34, alpha;
+--:-:-:-:1      FMUL cs4, cx32y34, alpha;
+--:-:-:-:1      FMUL cs5, cx33y34, alpha;
+--:-:-:-:1      FMUL cs6, cx34y34, alpha;
+--:-:-:-:0      FMUL cs7, cx35y34, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C; // emit the alpha-scaled cx??y34 values
+
+02:-:-:-:1      FMUL cs0, cx00y35, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y35, alpha;
+--:-:-:-:1      FMUL cs2, cx02y35, alpha;
+--:-:-:-:1      FMUL cs3, cx03y35, alpha;
+--:-:-:-:1      FMUL cs4, cx32y35, alpha;
+--:-:-:-:1      FMUL cs5, cx33y35, alpha;
+--:-:-:-:1      FMUL cs6, cx34y35, alpha;
+--:-:-:-:0      FMUL cs7, cx35y35, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C; // emit the alpha-scaled cx??y35 values (last of the eight groups)
+
+
+--:-:-:-:5      EXIT; // all eight groups have been handed to STORE_C
+
+STORE_C:
+// STORE_C: advance cy*/Cy*, pass cs0..cs7 through shared memory (STS then LDS), then store them to global memory under P5/P6 and cy < n predicates.
+--:-:-:-:0      IADD cy00, cy00, 1; // cy00/cy04/cy08/cy12 each advance by 1 per call (callers pre-bias them by -1)
+--:-:-:-:1      STS.128 [writeCs+4x<00>], cs0; // 128-bit store starting at cs0 (presumably cs0..cs3 - confirm register mapping)
+--:-:-:-:0      IADD cy04, cy04, 1;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], cs4; // 128-bit store starting at cs4 (presumably cs4..cs7 - confirm register mapping)
+--:-:-:-:0      IADD cy08, cy08, 1;
+--:-:-:-:1      LDS cs0, [readCs + 4x<0*64 + 00>]; // NOTE(review): read back via readCs, not writeCs - looks like a cross-thread reshuffle; confirm
+--:-:-:-:0      IADD cy12, cy12, 1;
+--:-:-:-:1      LDS cs1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:0      IADD Cy00, Cy00, ldc1; // Cy00/Cy04/Cy08/Cy12 each advance by ldc1 per call
+--:-:-:-:1      LDS cs2, [readCs + 4x<1*64 + 00>];
+--:-:-:-:0      IADD Cy04, Cy04, ldc1;
+--:-:-:-:1      LDS cs3, [readCs + 4x<1*64 + 32>];
+--:-:-:-:0      IADD Cy08, Cy08, ldc1;
+--:-:-:-:1      LDS cs4, [readCs + 4x<2*64 + 00>];
+--:-:-:-:0      IADD Cy12, Cy12, ldc1;
+--:-:-:-:1      LDS cs5, [readCs + 4x<2*64 + 32>];
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx +  0 < m
+--:-:-:-:1      LDS cs6, [readCs + 4x<3*64 + 00>];
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 32 < m
+--:-:1:-:1      LDS cs7, [readCs + 4x<3*64 + 32>]; // Set Dep 1
+--:-:-:-:2      ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx +  0 < m
+--:-:-:Y:7      ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 32 < m
+01:-:-:-:1  @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx +  0 < m
+--:-:-:-:1  @P1 STG.CG [Cy00 + 4x<32>], cs1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 32 < m
+--:-:-:-:1  @P2 STG.CG [Cy04 + 4x<00>], cs2;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx +  0 < m
+--:-:-:-:1  @P3 STG.CG [Cy04 + 4x<32>], cs3;
+--:-:-:Y:7      ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 32 < m
+--:-:-:-:2  @P0 STG.CG [Cy08 + 4x<00>], cs4; // P0..P3 were recomputed above for cy08/cy12 after their first use
+--:-:-:-:2  @P1 STG.CG [Cy08 + 4x<32>], cs5;
+--:-:-:-:2  @P2 STG.CG [Cy12 + 4x<00>], cs6;
+--:2:-:-:1  @P3 STG.CG [Cy12 + 4x<32>], cs7; // Set Dep 2 (callers wait on it before overwriting cs0 or Cy00)
+
+--:-:-:-:5      RET; // back to the instruction following the CAL
+
diff --git a/Assembler/MaxAs/sgemm/sgemm_sm52_64.cubin b/Assembler/MaxAs/sgemm/sgemm_sm52_64.cubin
new file mode 100644
index 0000000..0c7825f
Binary files /dev/null and b/Assembler/MaxAs/sgemm/sgemm_sm52_64.cubin differ
diff --git a/Assembler/MaxAs/sgemm/sgemm_sm52_64_dump.sass b/Assembler/MaxAs/sgemm/sgemm_sm52_64_dump.sass
new file mode 100644
index 0000000..552d95b
--- /dev/null
+++ b/Assembler/MaxAs/sgemm/sgemm_sm52_64_dump.sass
@@ -0,0 +1,1100 @@
+
+	code for sm_52
+		Function : sgemm_kernel_128
+	.headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
+                                                                                        /* 0x001ffc00e22007f6 */ /* NOTE(review): despite the name, no FFMA appears in this function; it reads like a small stub (texture fetch -> shared -> global store), not a GEMM loop -- confirm against the kernel source */
+        /*0008*/                   MOV R1, c[0x0][0x20];                                /* 0x4c98078000870001 */
+        /*0010*/                   S2R R8, SR_TID.X;                                    /* 0xf0c8000002170008 */ /* R8 = thread index in x; used as the index everywhere below */
+        /*0018*/                   SSY 0x90;                                            /* 0xe290000007000000 */ /* 0x90 is the TLD below, where both SYNC paths presumably rejoin -- confirm */
+                                                                                        /* 0x001fc400ffa00fed */
+        /*0028*/                   ISETP.GT.AND P0, PT, R8, 0x7f, PT;                   /* 0x3669038007f70807 */ /* P0 = (R8 > 0x7f) */
+        /*0030*/              @!P0 BRA 0x60;                                            /* 0xe24000000288000f */ /* R8 <= 0x7f: jump to 0x60, the hex-only line ahead of 0068 */
+        /*0038*/                   MOV R0, c[0x0][0x170];                               /* 0x4c98078005c70000 */ /* fall-through path (R8 > 0x7f): R0 = c[0x0][0x170] */
+                                                                                        /* 0x001ff400fe0007f5 */
+        /*0048*/                   MOV32I R3, 0x20000000;                               /* 0x010200000007f003 */
+        /*0050*/         {         LOP32I.OR R2, R0, 0x80000000;                        /* 0x0428000000070002 */ /* R2 = R0 | 0x80000000 */
+        /*0058*/                   SYNC;        }                                       /* 0xf0f800000007000f */
+                                                                                        /* 0x001fc000fea007f1 */
+        /*0068*/                   MOV R0, c[0x0][0x174];                               /* 0x4c98078005d70000 */ /* branch path (R8 <= 0x7f): same sequence as above but with c[0x0][0x174] */
+        /*0070*/                   MOV32I R3, 0x20000000;                               /* 0x010200000007f003 */
+        /*0078*/         {         LOP32I.OR R2, R0, 0x80000000;                        /* 0x0428000000070002 */ /* R2 = R0 | 0x80000000 */
+        /*0088*/                   SYNC;        }                                       /* 0x001fd0800e2007fd */ /* the hex printed here has the 0x001f... form of the hex-only lines; the value on the next line equals the SYNC encoding shown at 0058 */
+                                                                                        /* 0xf0f800000007000f */
+        /*0090*/                   TLD.B.LZ.NODEP.P R4, R8, R2, 0x0, 1D, 0xf;           /* 0xdd3a000780270804 */ /* 1D texture load indexed by R8, result starting at R4; R2 (set on either path above) presumably selects the texture -- confirm */
+        /*0098*/                   SHL R0, R8, 0x4;                                     /* 0x3848000000470800 */ /* R0 = R8 << 4, i.e. R8 * 16 bytes */
+                                                                                        /* 0x081fc403ffe041f2 */
+        /*00a8*/                   STS.128 [R0], R4;                                    /* 0xef5e000000070004 */ /* store the 128 bits starting at R4 to shared address R8 * 16 */
+        /*00b0*/                   BAR.SYNC 0x0;                                        /* 0xf0a81b8000070000 */ /* barrier between the shared store above and the shared load below */
+        /*00b8*/                   IADD32I R0, -R8.reuse, 0xff;                         /* 0x1d0000000ff70800 */ /* R0 = 0xff - R8 */
+                                                                                        /* 0x001fc000fe8207f5 */
+        /*00c8*/                   SHL R2, R8.reuse, 0x2;                               /* 0x3848000000270802 */ /* R2 = R8 << 2, i.e. R8 * 4 bytes */
+        /*00d0*/                   SHL R0, R0, 0x4;                                     /* 0x3848000000470000 */ /* R0 = (0xff - R8) << 4: shared offset of the slot written for index 0xff - R8 */
+        /*00d8*/         {         IADD R4.CC, R2, c[0x0][0x140];                       /* 0x4c10800005070204 */ /* low word of the output address: c[0x0][0x140] + R8 * 4, carry kept in CC */
+        /*00e8*/                   LDS.U.32 R0, [R0];        }                          /* 0x001fc400fec00711 */ /* R0 = first 32 bits of shared slot (0xff - R8); the hex on this line and the next is laid out as at 0088 */
+                                                                                        /* 0xef4c100000070000 */
+        /*00f0*/                   SHR R2, R8, 0x1e;                                    /* 0x3829000001e70802 */ /* R2 = R8 >> 30: the bits of R8 * 4 that fall above bit 31 */
+        /*00f8*/                   IADD.X R3, R2, c[0x0][0x144];                        /* 0x4c10080005170203 */ /* high word of the output address: R2 + c[0x0][0x144] + carry from the IADD at 00d8 */
+                                                                                        /* 0x001ffc011e2007ff */
+        /*0108*/                   MOV R2, R4;                                          /* 0x5c98078000470002 */ /* move the low address word next to the high word in R3 */
+        /*0110*/                   STG.E [R2], R0;                                      /* 0xeedc200000070200 */ /* global store of R0; the address is presumably the 64-bit pair R2:R3 built above -- confirm */
+        /*0118*/                   EXIT;                                                /* 0xe30000000007000f */
+                                                                                        /* 0x001f8000fc0007ff */
+        /*0128*/                   BRA 0x120;                                           /* 0xe2400fffff07000f */ /* targets 0x120, the hex-only line just above, so it loops on itself; it follows EXIT and is presumably padding with the NOPs -- confirm */
+        /*0130*/                   NOP;                                                 /* 0x50b0000000070f00 */
+        /*0138*/                   NOP;                                                 /* 0x50b0000000070f00 */
+		.................................
+
+
+		Function : sgemm_kernel_64
+	.headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
+                                                                                                /* 0x001d4400e6200711 */
+        /*0008*/                   S2R R119, SR_TID.X;                                          /* 0xf0c8000002170077 */
+        /*0010*/                   S2R R125, SR_CTAID.X;                                        /* 0xf0c800000257007d */
+        /*0018*/                   S2R R122, SR_CTAID.Y;                                        /* 0xf0c800000267007a */
+                                                                                                /* 0x081fc440fe220ff1 */
+        /*0028*/                   ISETP.GE.AND P0, PT, R119.reuse, 0x20, PT;                   /* 0x366d038002077707 */
+        /*0030*/                   LOP.AND R9, R119.reuse, 0xf;                                 /* 0x3847000000f77709 */
+        /*0038*/                   BFE.U32 R4, R119.reuse, 0x104;                               /* 0x3800000010477704 */
+                                                                                                /* 0x081fc440fe2007f1 */
+        /*0048*/                   MOV R12, c[0x0][0x14c];                                      /* 0x4c9807800537000c */
+        /*0050*/                   BFE.U32 R114, R119.reuse, 0x301;                             /* 0x3800000030177772 */
+        /*0058*/                   LOP.AND R115, R119.reuse, 0x30;                              /* 0x3847000003077773 */
+                                                                                                /* 0x081fc400fe2207f1 */
+        /*0068*/                   LOP.AND R0, R119.reuse, 0x1;                                 /* 0x3847000000177700 */
+        /*0070*/                   SHL R13, R9, 0x4;                                            /* 0x384800000047090d */
+        /*0078*/                   LOP.AND R80, R119.reuse, -0x20;                              /* 0x3947007ffe077750 */
+                                                                                                /* 0x081fc400fe2007f1 */
+        /*0088*/                   IADD R12, R12, -0x8;                                         /* 0x3910007fff870c0c */
+        /*0090*/                   SHL R114, R114, 0x4;                                         /* 0x3848000000477272 */
+        /*0098*/                   LOP.AND R126, R119.reuse, 0x1f;                              /* 0x3847000001f7777e */
+                                                                                                /* 0x001fc400fe2007f0 */
+        /*00a8*/         {         SHR.U32 R115, R115, 0x3;                                     /* 0x3828000000377373 */
+        /*00b0*/                   STS.128 [R80+0x1000], RZ;        }                           /* 0xef5e0001000750ff */
+        /*00b8*/              @!P0 MOV R2, c[0x0][0x150];                                       /* 0x4c98078005480002 */
+                                                                                                /* 0x00dfc400fe2007f1 */
+        /*00c8*/                   ISCADD R118, R4, R13, 0x8;                                   /* 0x5c18040000d70476 */
+        /*00d0*/               @P0 MOV R2, c[0x0][0x154];                                       /* 0x4c98078005500002 */
+        /*00d8*/                   SEL R8, R122, R125, P0;                                      /* 0x5ca0000007d77a08 */
+                                                                                                /* 0x001fc400fe2007f1 */
+        /*00e8*/              @!P0 MOV32I R113, 0x80000001;                                     /* 0x010800000018f071 */
+        /*00f0*/               @P0 MOV32I R113, 0x80000000;                                     /* 0x010800000000f071 */
+        /*00f8*/                   LOP.OR R115, R115, R0;                                       /* 0x5c47020000077373 */
+                                                                                                /* 0x001fc440fe2007f1 */
+        /*0108*/                   LOP.AND R123, R119, 0x20;                                    /* 0x384700000207777b */
+        /*0110*/                   SHR.U32 R1, R2.reuse, 0x2;                                   /* 0x3828000000270201 */
+        /*0118*/                   IADD R121, R2, R2;                                           /* 0x5c10000000270279 */
+                                                                                                /* 0x001fc800fe2007f1 */
+        /*0128*/                   ISCADD R112, R8, R9, 0x4;                                    /* 0x5c18020000970870 */
+        /*0130*/               @P0 IADD R118, R118, 0x800;                                      /* 0x3810000080007676 */
+        /*0138*/                   ISCADD R115, R115, 0x800, 0x4;                               /* 0x3818020080077373 */
+                                                                                                /* 0x081f98c0fe2607f1 */
+        /*0148*/                   XMAD.MRG R5, R1.reuse, R4.H1.reuse, RZ;                      /* 0x5b007fa800470105 */
+        /*0150*/                   XMAD.MRG R16, R12.reuse, R1.H1.reuse, RZ;                    /* 0x5b007fa800170c10 */
+        /*0158*/                   XMAD R112, R1.reuse, R4, R112;                               /* 0x5b00380000470170 */
+                                                                                                /* 0x181fc480e28007f2 */
+        /*0168*/                   XMAD.PSL.CBCC R112, R1.H1, R5.H1, R112;                      /* 0x5b30381800570170 */
+        /*0170*/                   TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf;                    /* 0xdd38000787177060 */
+        /*0178*/                   IADD3 R116, R112.reuse, R1.reuse, R1;                        /* 0x5cc0008000177074 */
+                                                                                                /* 0x081fc080e62407f1 */
+        /*0188*/                   IADD R120, R112, R2.reuse;                                   /* 0x5c10000000277078 */
+        /*0190*/                   TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf;                   /* 0xdd38000787177464 */
+        /*0198*/         {         XMAD R117, R12.reuse, R1, R112;                              /* 0x5b00380000170c75 */
+        /*01a8*/                   TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf;        }          /* 0x101dc400fe440753 */
+                                                                                                /* 0xdd38000787177868 */
+        /*01b0*/                   IADD R124, R116, R2;                                         /* 0x5c1000000027747c */
+        /*01b8*/                   TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf;                   /* 0xdd38000787177c6c */
+                                                                                                /* 0x001e4400f22007f1 */
+        /*01c8*/                   XMAD.PSL.CBCC R117, R12.H1, R16.H1, R117;                    /* 0x5b303a9801070c75 */
+        /*01d0*/                   LDS.U.128 R0, [R80+0x1000];                                  /* 0xef4e100100075000 */
+        /*01d8*/                   LDS.U.128 R4, [R80+0x1000];                                  /* 0xef4e100100075004 */
+                                                                                                /* 0x001e4400f2200791 */
+        /*01e8*/                   LDS.U.128 R8, [R80+0x1000];                                  /* 0xef4e100100075008 */
+        /*01f0*/                   LDS.U.128 R12, [R80+0x1000];                                 /* 0xef4e10010007500c */
+        /*01f8*/                   LDS.U.128 R16, [R80+0x1000];                                 /* 0xef4e100100075010 */
+                                                                                                /* 0x001e4400f2200791 */
+        /*0208*/                   LDS.U.128 R20, [R80+0x1000];                                 /* 0xef4e100100075014 */
+        /*0210*/                   LDS.U.128 R24, [R80+0x1000];                                 /* 0xef4e100100075018 */
+        /*0218*/                   LDS.U.128 R28, [R80+0x1000];                                 /* 0xef4e10010007501c */
+                                                                                                /* 0x001e4400f2200791 */
+        /*0228*/                   LDS.U.128 R32, [R80+0x1000];                                 /* 0xef4e100100075020 */
+        /*0230*/                   LDS.U.128 R36, [R80+0x1000];                                 /* 0xef4e100100075024 */
+        /*0238*/                   LDS.U.128 R40, [R80+0x1000];                                 /* 0xef4e100100075028 */
+                                                                                                /* 0x001e4400f2200791 */
+        /*0248*/                   LDS.U.128 R44, [R80+0x1000];                                 /* 0xef4e10010007502c */
+        /*0250*/                   LDS.U.128 R48, [R80+0x1000];                                 /* 0xef4e100100075030 */
+        /*0258*/                   LDS.U.128 R52, [R80+0x1000];                                 /* 0xef4e100100075034 */
+                                                                                                /* 0x003fc400f2200791 */
+        /*0268*/                   LDS.U.128 R56, [R80+0x1000];                                 /* 0xef4e100100075038 */
+        /*0270*/                   LDS.U.128 R60, [R80+0x1000];                                 /* 0xef4e10010007503c */
+        /*0278*/                   STS.128 [R118], R96;                                         /* 0xef5e000000077660 */
+                                                                                                /* 0x101fc002fe2407f0 */
+        /*0288*/         {         IADD R112, R112, R121.reuse;                                 /* 0x5c10000007977070 */
+        /*0290*/                   STS.128 [R118+0x200], R100;        }                         /* 0xef5e000020077664 */
+        /*0298*/         {         IADD R116, R116, R121.reuse;                                 /* 0x5c10000007977474 */
+        /*02a8*/                   STS.128 [R118+0x400], R104;        }                         /* 0x011fc480fe0027f1 */
+                                                                                                /* 0xef5e000040077668 */
+        /*02b0*/         {         IADD R120, R120, R121.reuse;                                 /* 0x5c10000007977878 */
+        /*02b8*/                   STS.128 [R118+0x600], R108;        }                         /* 0xef5e00006007766c */
+                                                                                                /* 0x001fc010fea007f0 */
+        /*02c8*/         {         IADD R124, R124, R121;                                       /* 0x5c10000007977c7c */
+        /*02d0*/                   BAR.SYNC 0x0;        }                                       /* 0xf0a81b8000070000 */
+        /*02d8*/         {         LOP.XOR R118, R118, 0x1000;                                  /* 0x3847040100077676 */
+        /*02e8*/                   LDS.U.128 R64, [R114];        }                              /* 0x001fc400fe2007f1 */
+                                                                                                /* 0xef4e100000077240 */
+        /*02f0*/                   LDS.U.128 R72, [R115];                                       /* 0xef4e100000077348 */
+        /*02f8*/                   LDS.U.128 R68, [R114+0x80];                                  /* 0xef4e100008077244 */
+                                                                                                /* 0x183fc000fe200711 */
+        /*0308*/                   LDS.U.128 R76, [R115+0x80];                                  /* 0xef4e10000807734c */
+        /*0310*/                   ISETP.LE.AND P0, PT, R112, R117, PT;                         /* 0x5b67038007577007 */
+        /*0318*/         {         FFMA R1, R66.reuse, R72.reuse, R1;                           /* 0x5980008004874201 */
+        /*0328*/                   LDS.U.128 R80, [R114+0x100];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100010077250 */
+        /*0330*/                   FFMA R0, R66, R73.reuse, R0;                                 /* 0x5980000004974200 */
+        /*0338*/         {         FFMA R2, R64.reuse, R73.reuse, R2;                           /* 0x5980010004974002 */
+        /*0348*/                   LDS.U.128 R88, [R115+0x100];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100010077358 */
+        /*0350*/                   FFMA R3, R64, R72.reuse, R3;                                 /* 0x5980018004874003 */
+        /*0358*/         {         FFMA R5, R67.reuse, R72.reuse, R5;                           /* 0x5980028004874305 */
+        /*0368*/                   LDS.U.128 R84, [R114+0x180];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100018077254 */
+        /*0370*/                   FFMA R4, R67, R73.reuse, R4;                                 /* 0x5980020004974304 */
+        /*0378*/         {         FFMA R6, R65.reuse, R73.reuse, R6;                           /* 0x5980030004974106 */
+        /*0388*/                   LDS.U.128 R92, [R115+0x180];        }                        /* 0x181fc480fe200711 */
+                                                                                                /* 0xef4e10001807735c */
+        /*0390*/                   FFMA R7, R65, R72.reuse, R7;                                 /* 0x5980038004874107 */
+        /*0398*/                   FFMA R33, R70.reuse, R72.reuse, R33;                         /* 0x5980108004874621 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*03a8*/                   FFMA R32, R70, R73.reuse, R32;                               /* 0x5980100004974620 */
+        /*03b0*/                   FFMA R34, R68.reuse, R73.reuse, R34;                         /* 0x5980110004974422 */
+        /*03b8*/                   FFMA R35, R68, R72.reuse, R35;                               /* 0x5980118004874423 */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*03c8*/                   FFMA R37, R71.reuse, R72.reuse, R37;                         /* 0x5980128004874725 */
+        /*03d0*/                   FFMA R36, R71.reuse, R73.reuse, R36;                         /* 0x5980120004974724 */
+        /*03d8*/                   FFMA R38, R69.reuse, R73, R38;                               /* 0x5980130004974526 */
+                                                                                                /* 0x101fc4c0fe2207f1 */
+        /*03e8*/                   FFMA R39, R69.reuse, R72, R39;                               /* 0x5980138004874527 */
+        /*03f0*/                   FFMA R45, R71.reuse, R74.reuse, R45;                         /* 0x5980168004a7472d */
+        /*03f8*/                   FFMA R44, R71, R75.reuse, R44;                               /* 0x5980160004b7472c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0408*/                   FFMA R46, R69.reuse, R75.reuse, R46;                         /* 0x5980170004b7452e */
+        /*0410*/                   FFMA R47, R69, R74.reuse, R47;                               /* 0x5980178004a7452f */
+        /*0418*/                   FFMA R41, R70.reuse, R74.reuse, R41;                         /* 0x5980148004a74629 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0428*/                   FFMA R40, R70, R75.reuse, R40;                               /* 0x5980140004b74628 */
+        /*0430*/                   FFMA R42, R68.reuse, R75.reuse, R42;                         /* 0x5980150004b7442a */
+        /*0438*/                   FFMA R43, R68, R74.reuse, R43;                               /* 0x5980158004a7442b */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0448*/                   FFMA R13, R67.reuse, R74.reuse, R13;                         /* 0x5980068004a7430d */
+        /*0450*/                   FFMA R12, R67, R75.reuse, R12;                               /* 0x5980060004b7430c */
+        /*0458*/                   FFMA R14, R65.reuse, R75.reuse, R14;                         /* 0x5980070004b7410e */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*0468*/                   FFMA R15, R65, R74.reuse, R15;                               /* 0x5980078004a7410f */
+        /*0470*/                   FFMA R9, R66.reuse, R74.reuse, R9;                           /* 0x5980048004a74209 */
+        /*0478*/                   FFMA R8, R66.reuse, R75.reuse, R8;                           /* 0x5980040004b74208 */
+                                                                                                /* 0x101fc440fe0207f1 */
+        /*0488*/                   FFMA R10, R64.reuse, R75, R10;                               /* 0x5980050004b7400a */
+        /*0490*/         {         FFMA R11, R64.reuse, R74, R11;                               /* 0x5980058004a7400b */
+        /*0498*/               @P0 TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf;        }           /* 0xdd38000787107060 */
+                                                                                                /* 0x101cc480fe0607e1 */
+        /*04a8*/                   FFMA R17, R66.reuse, R76.reuse, R17;                         /* 0x5980088004c74211 */
+        /*04b0*/         {         FFMA R16, R66, R77.reuse, R16;                               /* 0x5980080004d74210 */
+        /*04b8*/               @P0 TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf;        }          /* 0xdd38000787107464 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*04c8*/                   FFMA R18, R64.reuse, R77.reuse, R18;                         /* 0x5980090004d74012 */
+        /*04d0*/                   FFMA R19, R64, R76.reuse, R19;                               /* 0x5980098004c74013 */
+        /*04d8*/                   FFMA R21, R67.reuse, R76.reuse, R21;                         /* 0x59800a8004c74315 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*04e8*/                   FFMA R20, R67, R77.reuse, R20;                               /* 0x59800a0004d74314 */
+        /*04f0*/                   FFMA R22, R65.reuse, R77.reuse, R22;                         /* 0x59800b0004d74116 */
+        /*04f8*/                   FFMA R23, R65, R76.reuse, R23;                               /* 0x59800b8004c74117 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0508*/                   FFMA R49, R70.reuse, R76.reuse, R49;                         /* 0x5980188004c74631 */
+        /*0510*/                   FFMA R48, R70, R77.reuse, R48;                               /* 0x5980180004d74630 */
+        /*0518*/                   FFMA R50, R68.reuse, R77.reuse, R50;                         /* 0x5980190004d74432 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*0528*/                   FFMA R51, R68, R76.reuse, R51;                               /* 0x5980198004c74433 */
+        /*0530*/                   FFMA R53, R71.reuse, R76.reuse, R53;                         /* 0x59801a8004c74735 */
+        /*0538*/                   FFMA R52, R71.reuse, R77.reuse, R52;                         /* 0x59801a0004d74734 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*0548*/                   FFMA R54, R69.reuse, R77, R54;                               /* 0x59801b0004d74536 */
+        /*0550*/                   FFMA R55, R69.reuse, R76, R55;                               /* 0x59801b8004c74537 */
+        /*0558*/                   FFMA R61, R71.reuse, R78.reuse, R61;                         /* 0x59801e8004e7473d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0568*/                   FFMA R60, R71, R79.reuse, R60;                               /* 0x59801e0004f7473c */
+        /*0570*/                   FFMA R62, R69.reuse, R79.reuse, R62;                         /* 0x59801f0004f7453e */
+        /*0578*/                   FFMA R63, R69, R78.reuse, R63;                               /* 0x59801f8004e7453f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0588*/                   FFMA R57, R70.reuse, R78.reuse, R57;                         /* 0x59801c8004e74639 */
+        /*0590*/                   FFMA R56, R70, R79.reuse, R56;                               /* 0x59801c0004f74638 */
+        /*0598*/                   FFMA R58, R68.reuse, R79.reuse, R58;                         /* 0x59801d0004f7443a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*05a8*/                   FFMA R59, R68, R78.reuse, R59;                               /* 0x59801d8004e7443b */
+        /*05b0*/                   FFMA R29, R67.reuse, R78.reuse, R29;                         /* 0x59800e8004e7431d */
+        /*05b8*/                   FFMA R28, R67, R79.reuse, R28;                               /* 0x59800e0004f7431c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*05c8*/                   FFMA R30, R65.reuse, R79.reuse, R30;                         /* 0x59800f0004f7411e */
+        /*05d0*/                   FFMA R31, R65, R78.reuse, R31;                               /* 0x59800f8004e7411f */
+        /*05d8*/                   FFMA R25, R66.reuse, R78.reuse, R25;                         /* 0x59800c8004e74219 */
+                                                                                                /* 0x001fc440fe2407f1 */
+        /*05e8*/                   FFMA R24, R66, R79.reuse, R24;                               /* 0x59800c0004f74218 */
+        /*05f0*/                   FFMA R26, R64.reuse, R79, R26;                               /* 0x59800d0004f7401a */
+        /*05f8*/                   FFMA R27, R64, R78, R27;                                     /* 0x59800d8004e7401b */
+                                                                                                /* 0x101fc400fe260ff0 */
+        /*0608*/         {         FFMA R1, R82.reuse, R88.reuse, R1;                           /* 0x5980008005875201 */
+        /*0610*/                   LDS.U.128 R64, [R114+0x200];        }                        /* 0xef4e100020077240 */
+        /*0618*/                   FFMA R0, R82, R89.reuse, R0;                                 /* 0x5980000005975200 */
+                                                                                                /* 0x101fc400fe2607f0 */
+        /*0628*/         {         FFMA R2, R80.reuse, R89.reuse, R2;                           /* 0x5980010005975002 */
+        /*0630*/                   LDS.U.128 R72, [R115+0x200];        }                        /* 0xef4e100020077348 */
+        /*0638*/                   FFMA R3, R80, R88.reuse, R3;                                 /* 0x5980018005875003 */
+                                                                                                /* 0x101fc400fe2607f0 */
+        /*0648*/         {         FFMA R5, R83.reuse, R88.reuse, R5;                           /* 0x5980028005875305 */
+        /*0650*/                   LDS.U.128 R68, [R114+0x280];        }                        /* 0xef4e100028077244 */
+        /*0658*/                   FFMA R4, R83, R89.reuse, R4;                                 /* 0x5980020005975304 */
+                                                                                                /* 0x101fc400e22607f0 */
+        /*0668*/         {         FFMA R6, R81.reuse, R89.reuse, R6;                           /* 0x5980030005975106 */
+        /*0670*/                   LDS.U.128 R76, [R115+0x280];        }                        /* 0xef4e10002807734c */
+        /*0678*/                   FFMA R7, R81, R88.reuse, R7;                                 /* 0x5980038005875107 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0688*/                   FFMA R33, R86.reuse, R88.reuse, R33;                         /* 0x5980108005875621 */
+        /*0690*/                   FFMA R32, R86, R89.reuse, R32;                               /* 0x5980100005975620 */
+        /*0698*/                   FFMA R34, R84.reuse, R89.reuse, R34;                         /* 0x5980110005975422 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*06a8*/                   FFMA R35, R84, R88.reuse, R35;                               /* 0x5980118005875423 */
+        /*06b0*/                   FFMA R37, R87.reuse, R88.reuse, R37;                         /* 0x5980128005875725 */
+        /*06b8*/                   FFMA R36, R87.reuse, R89.reuse, R36;                         /* 0x5980120005975724 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*06c8*/                   FFMA R38, R85.reuse, R89, R38;                               /* 0x5980130005975526 */
+        /*06d0*/                   FFMA R39, R85.reuse, R88, R39;                               /* 0x5980138005875527 */
+        /*06d8*/                   FFMA R45, R87.reuse, R90.reuse, R45;                         /* 0x5980168005a7572d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*06e8*/                   FFMA R44, R87, R91.reuse, R44;                               /* 0x5980160005b7572c */
+        /*06f0*/                   FFMA R46, R85.reuse, R91.reuse, R46;                         /* 0x5980170005b7552e */
+        /*06f8*/                   FFMA R47, R85, R90.reuse, R47;                               /* 0x5980178005a7552f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0708*/                   FFMA R41, R86.reuse, R90.reuse, R41;                         /* 0x5980148005a75629 */
+        /*0710*/                   FFMA R40, R86, R91.reuse, R40;                               /* 0x5980140005b75628 */
+        /*0718*/                   FFMA R42, R84.reuse, R91.reuse, R42;                         /* 0x5980150005b7542a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0728*/                   FFMA R43, R84, R90.reuse, R43;                               /* 0x5980158005a7542b */
+        /*0730*/                   FFMA R13, R83.reuse, R90.reuse, R13;                         /* 0x5980068005a7530d */
+        /*0738*/                   FFMA R12, R83, R91.reuse, R12;                               /* 0x5980060005b7530c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0748*/                   FFMA R14, R81.reuse, R91.reuse, R14;                         /* 0x5980070005b7510e */
+        /*0750*/                   FFMA R15, R81, R90.reuse, R15;                               /* 0x5980078005a7510f */
+        /*0758*/                   FFMA R9, R82.reuse, R90.reuse, R9;                           /* 0x5980048005a75209 */
+                                                                                                /* 0x081fc040fe2607f1 */
+        /*0768*/                   FFMA R8, R82.reuse, R91.reuse, R8;                           /* 0x5980040005b75208 */
+        /*0770*/                   FFMA R10, R80.reuse, R91, R10;                               /* 0x5980050005b7500a */
+        /*0778*/         {         FFMA R11, R80.reuse, R90, R11;                               /* 0x5980058005a7500b */
+        /*0788*/               @P0 TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf;        }          /* 0x101fc0c0fc2407f1 */
+                                                                                                /* 0xdd38000787107868 */
+        /*0790*/                   FFMA R17, R82.reuse, R92.reuse, R17;                         /* 0x5980088005c75211 */
+        /*0798*/         {         FFMA R16, R82, R93.reuse, R16;                               /* 0x5980080005d75210 */
+        /*07a8*/               @P0 TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf;        }          /* 0x101fc4c0fe240751 */
+                                                                                                /* 0xdd38000787107c6c */
+        /*07b0*/                   FFMA R18, R80.reuse, R93.reuse, R18;                         /* 0x5980090005d75012 */
+        /*07b8*/                   FFMA R19, R80, R92.reuse, R19;                               /* 0x5980098005c75013 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*07c8*/                   FFMA R21, R83.reuse, R92.reuse, R21;                         /* 0x59800a8005c75315 */
+        /*07d0*/                   FFMA R20, R83, R93.reuse, R20;                               /* 0x59800a0005d75314 */
+        /*07d8*/                   FFMA R22, R81.reuse, R93.reuse, R22;                         /* 0x59800b0005d75116 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*07e8*/                   FFMA R23, R81, R92.reuse, R23;                               /* 0x59800b8005c75117 */
+        /*07f0*/                   FFMA R49, R86.reuse, R92.reuse, R49;                         /* 0x5980188005c75631 */
+        /*07f8*/                   FFMA R48, R86, R93.reuse, R48;                               /* 0x5980180005d75630 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0808*/                   FFMA R50, R84.reuse, R93.reuse, R50;                         /* 0x5980190005d75432 */
+        /*0810*/                   FFMA R51, R84, R92.reuse, R51;                               /* 0x5980198005c75433 */
+        /*0818*/                   FFMA R53, R87.reuse, R92.reuse, R53;                         /* 0x59801a8005c75735 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*0828*/                   FFMA R52, R87.reuse, R93.reuse, R52;                         /* 0x59801a0005d75734 */
+        /*0830*/                   FFMA R54, R85.reuse, R93, R54;                               /* 0x59801b0005d75536 */
+        /*0838*/                   FFMA R55, R85.reuse, R92, R55;                               /* 0x59801b8005c75537 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0848*/                   FFMA R61, R87.reuse, R94.reuse, R61;                         /* 0x59801e8005e7573d */
+        /*0850*/                   FFMA R60, R87, R95.reuse, R60;                               /* 0x59801e0005f7573c */
+        /*0858*/                   FFMA R62, R85.reuse, R95.reuse, R62;                         /* 0x59801f0005f7553e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0868*/                   FFMA R63, R85, R94.reuse, R63;                               /* 0x59801f8005e7553f */
+        /*0870*/                   FFMA R57, R86.reuse, R94.reuse, R57;                         /* 0x59801c8005e75639 */
+        /*0878*/                   FFMA R56, R86, R95.reuse, R56;                               /* 0x59801c0005f75638 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0888*/                   FFMA R58, R84.reuse, R95.reuse, R58;                         /* 0x59801d0005f7543a */
+        /*0890*/                   FFMA R59, R84, R94.reuse, R59;                               /* 0x59801d8005e7543b */
+        /*0898*/                   FFMA R29, R83.reuse, R94.reuse, R29;                         /* 0x59800e8005e7531d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*08a8*/                   FFMA R28, R83, R95.reuse, R28;                               /* 0x59800e0005f7531c */
+        /*08b0*/                   FFMA R30, R81.reuse, R95.reuse, R30;                         /* 0x59800f0005f7511e */
+        /*08b8*/                   FFMA R31, R81, R94.reuse, R31;                               /* 0x59800f8005e7511f */
+                                                                                                /* 0x081fc480fe2607f1 */
+        /*08c8*/                   FFMA R25, R82.reuse, R94.reuse, R25;                         /* 0x59800c8005e75219 */
+        /*08d0*/                   FFMA R24, R82, R95.reuse, R24;                               /* 0x59800c0005f75218 */
+        /*08d8*/                   FFMA R26, R80.reuse, R95, R26;                               /* 0x59800d0005f7501a */
+                                                                                                /* 0x001fc4c1fe0007f1 */
+        /*08e8*/                   FFMA R27, R80, R94, R27;                                     /* 0x59800d8005e7501b */
+        /*08f0*/         {         FFMA R1, R66.reuse, R72.reuse, R1;                           /* 0x5980008004874201 */
+        /*08f8*/                   LDS.U.128 R80, [R114+0x300];        }                        /* 0xef4e100030077250 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*0908*/                   FFMA R0, R66, R73.reuse, R0;                                 /* 0x5980000004974200 */
+        /*0910*/         {         FFMA R2, R64.reuse, R73.reuse, R2;                           /* 0x5980010004974002 */
+        /*0918*/                   LDS.U.128 R88, [R115+0x300];        }                        /* 0xef4e100030077358 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*0928*/                   FFMA R3, R64, R72.reuse, R3;                                 /* 0x5980018004874003 */
+        /*0930*/         {         FFMA R5, R67.reuse, R72.reuse, R5;                           /* 0x5980028004874305 */
+        /*0938*/                   LDS.U.128 R84, [R114+0x380];        }                        /* 0xef4e100038077254 */
+                                                                                                /* 0x001c44c0fe0407f1 */
+        /*0948*/                   FFMA R4, R67, R73.reuse, R4;                                 /* 0x5980020004974304 */
+        /*0950*/         {         FFMA R6, R65.reuse, R73.reuse, R6;                           /* 0x5980030004974106 */
+        /*0958*/                   LDS.U.128 R92, [R115+0x380];        }                        /* 0xef4e10003807735c */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0968*/                   FFMA R7, R65, R72.reuse, R7;                                 /* 0x5980038004874107 */
+        /*0970*/                   FFMA R33, R70.reuse, R72.reuse, R33;                         /* 0x5980108004874621 */
+        /*0978*/                   FFMA R32, R70, R73.reuse, R32;                               /* 0x5980100004974620 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0988*/                   FFMA R34, R68.reuse, R73.reuse, R34;                         /* 0x5980110004974422 */
+        /*0990*/                   FFMA R35, R68, R72.reuse, R35;                               /* 0x5980118004874423 */
+        /*0998*/                   FFMA R37, R71.reuse, R72.reuse, R37;                         /* 0x5980128004874725 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*09a8*/                   FFMA R36, R71.reuse, R73.reuse, R36;                         /* 0x5980120004974724 */
+        /*09b0*/                   FFMA R38, R69.reuse, R73, R38;                               /* 0x5980130004974526 */
+        /*09b8*/                   FFMA R39, R69.reuse, R72, R39;                               /* 0x5980138004874527 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*09c8*/                   FFMA R45, R71.reuse, R74.reuse, R45;                         /* 0x5980168004a7472d */
+        /*09d0*/                   FFMA R44, R71, R75.reuse, R44;                               /* 0x5980160004b7472c */
+        /*09d8*/                   FFMA R46, R69.reuse, R75.reuse, R46;                         /* 0x5980170004b7452e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*09e8*/                   FFMA R47, R69, R74.reuse, R47;                               /* 0x5980178004a7452f */
+        /*09f0*/                   FFMA R41, R70.reuse, R74.reuse, R41;                         /* 0x5980148004a74629 */
+        /*09f8*/                   FFMA R40, R70, R75.reuse, R40;                               /* 0x5980140004b74628 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0a08*/                   FFMA R42, R68.reuse, R75.reuse, R42;                         /* 0x5980150004b7442a */
+        /*0a10*/                   FFMA R43, R68, R74.reuse, R43;                               /* 0x5980158004a7442b */
+        /*0a18*/                   FFMA R13, R67.reuse, R74.reuse, R13;                         /* 0x5980068004a7430d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0a28*/                   FFMA R12, R67, R75.reuse, R12;                               /* 0x5980060004b7430c */
+        /*0a30*/                   FFMA R14, R65.reuse, R75.reuse, R14;                         /* 0x5980070004b7410e */
+        /*0a38*/                   FFMA R15, R65, R74.reuse, R15;                               /* 0x5980078004a7410f */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*0a48*/                   FFMA R9, R66.reuse, R74.reuse, R9;                           /* 0x5980048004a74209 */
+        /*0a50*/                   FFMA R8, R66.reuse, R75.reuse, R8;                           /* 0x5980040004b74208 */
+        /*0a58*/                   FFMA R10, R64.reuse, R75, R10;                               /* 0x5980050004b7400a */
+                                                                                                /* 0x101fc4c0fc2207f1 */
+        /*0a68*/                   FFMA R11, R64.reuse, R74, R11;                               /* 0x5980058004a7400b */
+        /*0a70*/                   FFMA R17, R66.reuse, R76.reuse, R17;                         /* 0x5980088004c74211 */
+        /*0a78*/                   FFMA R16, R66, R77.reuse, R16;                               /* 0x5980080004d74210 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0a88*/                   FFMA R18, R64.reuse, R77.reuse, R18;                         /* 0x5980090004d74012 */
+        /*0a90*/                   FFMA R19, R64, R76.reuse, R19;                               /* 0x5980098004c74013 */
+        /*0a98*/                   FFMA R21, R67.reuse, R76.reuse, R21;                         /* 0x59800a8004c74315 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0aa8*/                   FFMA R20, R67, R77.reuse, R20;                               /* 0x59800a0004d74314 */
+        /*0ab0*/                   FFMA R22, R65.reuse, R77.reuse, R22;                         /* 0x59800b0004d74116 */
+        /*0ab8*/                   FFMA R23, R65, R76.reuse, R23;                               /* 0x59800b8004c74117 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0ac8*/                   FFMA R49, R70.reuse, R76.reuse, R49;                         /* 0x5980188004c74631 */
+        /*0ad0*/                   FFMA R48, R70, R77.reuse, R48;                               /* 0x5980180004d74630 */
+        /*0ad8*/                   FFMA R50, R68.reuse, R77.reuse, R50;                         /* 0x5980190004d74432 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*0ae8*/                   FFMA R51, R68, R76.reuse, R51;                               /* 0x5980198004c74433 */
+        /*0af0*/                   FFMA R53, R71.reuse, R76.reuse, R53;                         /* 0x59801a8004c74735 */
+        /*0af8*/                   FFMA R52, R71.reuse, R77.reuse, R52;                         /* 0x59801a0004d74734 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*0b08*/                   FFMA R54, R69.reuse, R77, R54;                               /* 0x59801b0004d74536 */
+        /*0b10*/                   FFMA R55, R69.reuse, R76, R55;                               /* 0x59801b8004c74537 */
+        /*0b18*/                   FFMA R61, R71.reuse, R78.reuse, R61;                         /* 0x59801e8004e7473d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0b28*/                   FFMA R60, R71, R79.reuse, R60;                               /* 0x59801e0004f7473c */
+        /*0b30*/                   FFMA R62, R69.reuse, R79.reuse, R62;                         /* 0x59801f0004f7453e */
+        /*0b38*/                   FFMA R63, R69, R78.reuse, R63;                               /* 0x59801f8004e7453f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0b48*/                   FFMA R57, R70.reuse, R78.reuse, R57;                         /* 0x59801c8004e74639 */
+        /*0b50*/                   FFMA R56, R70, R79.reuse, R56;                               /* 0x59801c0004f74638 */
+        /*0b58*/                   FFMA R58, R68.reuse, R79.reuse, R58;                         /* 0x59801d0004f7443a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0b68*/                   FFMA R59, R68, R78.reuse, R59;                               /* 0x59801d8004e7443b */
+        /*0b70*/                   FFMA R29, R67.reuse, R78.reuse, R29;                         /* 0x59800e8004e7431d */
+        /*0b78*/                   FFMA R28, R67, R79.reuse, R28;                               /* 0x59800e0004f7431c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0b88*/                   FFMA R30, R65.reuse, R79.reuse, R30;                         /* 0x59800f0004f7411e */
+        /*0b90*/                   FFMA R31, R65, R78.reuse, R31;                               /* 0x59800f8004e7411f */
+        /*0b98*/                   FFMA R25, R66.reuse, R78.reuse, R25;                         /* 0x59800c8004e74219 */
+                                                                                                /* 0x001fc440fe2407f1 */
+        /*0ba8*/                   FFMA R24, R66, R79.reuse, R24;                               /* 0x59800c0004f74218 */
+        /*0bb0*/                   FFMA R26, R64.reuse, R79, R26;                               /* 0x59800d0004f7401a */
+        /*0bb8*/                   FFMA R27, R64, R78, R27;                                     /* 0x59800d8004e7401b */
+                                                                                                /* 0x101fc400fe260ff0 */
+        /*0bc8*/         {         FFMA R1, R82.reuse, R88.reuse, R1;                           /* 0x5980008005875201 */
+        /*0bd0*/                   LDS.U.128 R64, [R114+0x400];        }                        /* 0xef4e100040077240 */
+        /*0bd8*/                   FFMA R0, R82, R89.reuse, R0;                                 /* 0x5980000005975200 */
+                                                                                                /* 0x101fc400fe2607f0 */
+        /*0be8*/         {         FFMA R2, R80.reuse, R89.reuse, R2;                           /* 0x5980010005975002 */
+        /*0bf0*/                   LDS.U.128 R72, [R115+0x400];        }                        /* 0xef4e100040077348 */
+        /*0bf8*/                   FFMA R3, R80, R88.reuse, R3;                                 /* 0x5980018005875003 */
+                                                                                                /* 0x101fc400fe2607f0 */
+        /*0c08*/         {         FFMA R5, R83.reuse, R88.reuse, R5;                           /* 0x5980028005875305 */
+        /*0c10*/                   LDS.U.128 R68, [R114+0x480];        }                        /* 0xef4e100048077244 */
+        /*0c18*/                   FFMA R4, R83, R89.reuse, R4;                                 /* 0x5980020005975304 */
+                                                                                                /* 0x101fc400e22607f0 */
+        /*0c28*/         {         FFMA R6, R81.reuse, R89.reuse, R6;                           /* 0x5980030005975106 */
+        /*0c30*/                   LDS.U.128 R76, [R115+0x480];        }                        /* 0xef4e10004807734c */
+        /*0c38*/                   FFMA R7, R81, R88.reuse, R7;                                 /* 0x5980038005875107 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0c48*/                   FFMA R33, R86.reuse, R88.reuse, R33;                         /* 0x5980108005875621 */
+        /*0c50*/                   FFMA R32, R86, R89.reuse, R32;                               /* 0x5980100005975620 */
+        /*0c58*/                   FFMA R34, R84.reuse, R89.reuse, R34;                         /* 0x5980110005975422 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*0c68*/                   FFMA R35, R84, R88.reuse, R35;                               /* 0x5980118005875423 */
+        /*0c70*/                   FFMA R37, R87.reuse, R88.reuse, R37;                         /* 0x5980128005875725 */
+        /*0c78*/                   FFMA R36, R87.reuse, R89.reuse, R36;                         /* 0x5980120005975724 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*0c88*/                   FFMA R38, R85.reuse, R89, R38;                               /* 0x5980130005975526 */
+        /*0c90*/                   FFMA R39, R85.reuse, R88, R39;                               /* 0x5980138005875527 */
+        /*0c98*/                   FFMA R45, R87.reuse, R90.reuse, R45;                         /* 0x5980168005a7572d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0ca8*/                   FFMA R44, R87, R91.reuse, R44;                               /* 0x5980160005b7572c */
+        /*0cb0*/                   FFMA R46, R85.reuse, R91.reuse, R46;                         /* 0x5980170005b7552e */
+        /*0cb8*/                   FFMA R47, R85, R90.reuse, R47;                               /* 0x5980178005a7552f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0cc8*/                   FFMA R41, R86.reuse, R90.reuse, R41;                         /* 0x5980148005a75629 */
+        /*0cd0*/                   FFMA R40, R86, R91.reuse, R40;                               /* 0x5980140005b75628 */
+        /*0cd8*/                   FFMA R42, R84.reuse, R91.reuse, R42;                         /* 0x5980150005b7542a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0ce8*/                   FFMA R43, R84, R90.reuse, R43;                               /* 0x5980158005a7542b */
+        /*0cf0*/                   FFMA R13, R83.reuse, R90.reuse, R13;                         /* 0x5980068005a7530d */
+        /*0cf8*/                   FFMA R12, R83, R91.reuse, R12;                               /* 0x5980060005b7530c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0d08*/                   FFMA R14, R81.reuse, R91.reuse, R14;                         /* 0x5980070005b7510e */
+        /*0d10*/                   FFMA R15, R81, R90.reuse, R15;                               /* 0x5980078005a7510f */
+        /*0d18*/                   FFMA R9, R82.reuse, R90.reuse, R9;                           /* 0x5980048005a75209 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*0d28*/                   FFMA R8, R82.reuse, R91.reuse, R8;                           /* 0x5980040005b75208 */
+        /*0d30*/                   FFMA R10, R80.reuse, R91, R10;                               /* 0x5980050005b7500a */
+        /*0d38*/                   FFMA R11, R80.reuse, R90, R11;                               /* 0x5980058005a7500b */
+                                                                                                /* 0x181fc480fe2607e1 */
+        /*0d48*/                   FFMA R17, R82.reuse, R92.reuse, R17;                         /* 0x5980088005c75211 */
+        /*0d50*/                   FFMA R16, R82, R93.reuse, R16;                               /* 0x5980080005d75210 */
+        /*0d58*/                   FFMA R18, R80.reuse, R93.reuse, R18;                         /* 0x5980090005d75012 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0d68*/                   FFMA R19, R80, R92.reuse, R19;                               /* 0x5980098005c75013 */
+        /*0d70*/                   FFMA R21, R83.reuse, R92.reuse, R21;                         /* 0x59800a8005c75315 */
+        /*0d78*/                   FFMA R20, R83, R93.reuse, R20;                               /* 0x59800a0005d75314 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0d88*/                   FFMA R22, R81.reuse, R93.reuse, R22;                         /* 0x59800b0005d75116 */
+        /*0d90*/                   FFMA R23, R81, R92.reuse, R23;                               /* 0x59800b8005c75117 */
+        /*0d98*/                   FFMA R49, R86.reuse, R92.reuse, R49;                         /* 0x5980188005c75631 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0da8*/                   FFMA R48, R86, R93.reuse, R48;                               /* 0x5980180005d75630 */
+        /*0db0*/                   FFMA R50, R84.reuse, R93.reuse, R50;                         /* 0x5980190005d75432 */
+        /*0db8*/                   FFMA R51, R84, R92.reuse, R51;                               /* 0x5980198005c75433 */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*0dc8*/                   FFMA R53, R87.reuse, R92.reuse, R53;                         /* 0x59801a8005c75735 */
+        /*0dd0*/                   FFMA R52, R87.reuse, R93.reuse, R52;                         /* 0x59801a0005d75734 */
+        /*0dd8*/                   FFMA R54, R85.reuse, R93, R54;                               /* 0x59801b0005d75536 */
+                                                                                                /* 0x101fc4c0fe2207f1 */
+        /*0de8*/                   FFMA R55, R85.reuse, R92, R55;                               /* 0x59801b8005c75537 */
+        /*0df0*/                   FFMA R61, R87.reuse, R94.reuse, R61;                         /* 0x59801e8005e7573d */
+        /*0df8*/                   FFMA R60, R87, R95.reuse, R60;                               /* 0x59801e0005f7573c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0e08*/                   FFMA R62, R85.reuse, R95.reuse, R62;                         /* 0x59801f0005f7553e */
+        /*0e10*/                   FFMA R63, R85, R94.reuse, R63;                               /* 0x59801f8005e7553f */
+        /*0e18*/                   FFMA R57, R86.reuse, R94.reuse, R57;                         /* 0x59801c8005e75639 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0e28*/                   FFMA R56, R86, R95.reuse, R56;                               /* 0x59801c0005f75638 */
+        /*0e30*/                   FFMA R58, R84.reuse, R95.reuse, R58;                         /* 0x59801d0005f7543a */
+        /*0e38*/                   FFMA R59, R84, R94.reuse, R59;                               /* 0x59801d8005e7543b */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0e48*/                   FFMA R29, R83.reuse, R94.reuse, R29;                         /* 0x59800e8005e7531d */
+        /*0e50*/                   FFMA R28, R83, R95.reuse, R28;                               /* 0x59800e0005f7531c */
+        /*0e58*/                   FFMA R30, R81.reuse, R95.reuse, R30;                         /* 0x59800f0005f7511e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0e68*/                   FFMA R31, R81, R94.reuse, R31;                               /* 0x59800f8005e7511f */
+        /*0e70*/                   FFMA R25, R82.reuse, R94.reuse, R25;                         /* 0x59800c8005e75219 */
+        /*0e78*/                   FFMA R24, R82, R95.reuse, R24;                               /* 0x59800c0005f75218 */
+                                                                                                /* 0x183fc000fe2207f1 */
+        /*0e88*/                   FFMA R26, R80.reuse, R95, R26;                               /* 0x59800d0005f7501a */
+        /*0e90*/                   FFMA R27, R80, R94, R27;                                     /* 0x59800d8005e7501b */
+        /*0e98*/         {         FFMA R1, R66.reuse, R72.reuse, R1;                           /* 0x5980008004874201 */
+        /*0ea8*/                   LDS.U.128 R80, [R114+0x500];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100050077250 */
+        /*0eb0*/                   FFMA R0, R66, R73.reuse, R0;                                 /* 0x5980000004974200 */
+        /*0eb8*/         {         FFMA R2, R64.reuse, R73.reuse, R2;                           /* 0x5980010004974002 */
+        /*0ec8*/                   LDS.U.128 R88, [R115+0x500];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100050077358 */
+        /*0ed0*/                   FFMA R3, R64, R72.reuse, R3;                                 /* 0x5980018004874003 */
+        /*0ed8*/         {         FFMA R5, R67.reuse, R72.reuse, R5;                           /* 0x5980028004874305 */
+        /*0ee8*/                   LDS.U.128 R84, [R114+0x580];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100058077254 */
+        /*0ef0*/                   FFMA R4, R67, R73.reuse, R4;                                 /* 0x5980020004974304 */
+        /*0ef8*/         {         FFMA R6, R65.reuse, R73.reuse, R6;                           /* 0x5980030004974106 */
+        /*0f08*/                   LDS.U.128 R92, [R115+0x580];        }                        /* 0x181fc480fe200711 */
+                                                                                                /* 0xef4e10005807735c */
+        /*0f10*/                   FFMA R7, R65, R72.reuse, R7;                                 /* 0x5980038004874107 */
+        /*0f18*/                   FFMA R33, R70.reuse, R72.reuse, R33;                         /* 0x5980108004874621 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0f28*/                   FFMA R32, R70, R73.reuse, R32;                               /* 0x5980100004974620 */
+        /*0f30*/                   FFMA R34, R68.reuse, R73.reuse, R34;                         /* 0x5980110004974422 */
+        /*0f38*/                   FFMA R35, R68, R72.reuse, R35;                               /* 0x5980118004874423 */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*0f48*/                   FFMA R37, R71.reuse, R72.reuse, R37;                         /* 0x5980128004874725 */
+        /*0f50*/                   FFMA R36, R71.reuse, R73.reuse, R36;                         /* 0x5980120004974724 */
+        /*0f58*/                   FFMA R38, R69.reuse, R73, R38;                               /* 0x5980130004974526 */
+                                                                                                /* 0x101fc4c0fe2207f1 */
+        /*0f68*/                   FFMA R39, R69.reuse, R72, R39;                               /* 0x5980138004874527 */
+        /*0f70*/                   FFMA R45, R71.reuse, R74.reuse, R45;                         /* 0x5980168004a7472d */
+        /*0f78*/                   FFMA R44, R71, R75.reuse, R44;                               /* 0x5980160004b7472c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0f88*/                   FFMA R46, R69.reuse, R75.reuse, R46;                         /* 0x5980170004b7452e */
+        /*0f90*/                   FFMA R47, R69, R74.reuse, R47;                               /* 0x5980178004a7452f */
+        /*0f98*/                   FFMA R41, R70.reuse, R74.reuse, R41;                         /* 0x5980148004a74629 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0fa8*/                   FFMA R40, R70, R75.reuse, R40;                               /* 0x5980140004b74628 */
+        /*0fb0*/                   FFMA R42, R68.reuse, R75.reuse, R42;                         /* 0x5980150004b7442a */
+        /*0fb8*/                   FFMA R43, R68, R74.reuse, R43;                               /* 0x5980158004a7442b */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0fc8*/                   FFMA R13, R67.reuse, R74.reuse, R13;                         /* 0x5980068004a7430d */
+        /*0fd0*/                   FFMA R12, R67, R75.reuse, R12;                               /* 0x5980060004b7430c */
+        /*0fd8*/                   FFMA R14, R65.reuse, R75.reuse, R14;                         /* 0x5980070004b7410e */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*0fe8*/                   FFMA R15, R65, R74.reuse, R15;                               /* 0x5980078004a7410f */
+        /*0ff0*/                   FFMA R9, R66.reuse, R74.reuse, R9;                           /* 0x5980048004a74209 */
+        /*0ff8*/                   FFMA R8, R66.reuse, R75.reuse, R8;                           /* 0x5980040004b74208 */
+                                                                                                /* 0x181f8440fe2207f1 */
+        /*1008*/                   FFMA R10, R64.reuse, R75, R10;                               /* 0x5980050004b7400a */
+        /*1010*/                   FFMA R11, R64.reuse, R74, R11;                               /* 0x5980058004a7400b */
+        /*1018*/                   FFMA R17, R66.reuse, R76.reuse, R17;                         /* 0x5980088004c74211 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1028*/                   FFMA R16, R66, R77.reuse, R16;                               /* 0x5980080004d74210 */
+        /*1030*/                   FFMA R18, R64.reuse, R77.reuse, R18;                         /* 0x5980090004d74012 */
+        /*1038*/                   FFMA R19, R64, R76.reuse, R19;                               /* 0x5980098004c74013 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1048*/                   FFMA R21, R67.reuse, R76.reuse, R21;                         /* 0x59800a8004c74315 */
+        /*1050*/                   FFMA R20, R67, R77.reuse, R20;                               /* 0x59800a0004d74314 */
+        /*1058*/                   FFMA R22, R65.reuse, R77.reuse, R22;                         /* 0x59800b0004d74116 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1068*/                   FFMA R23, R65, R76.reuse, R23;                               /* 0x59800b8004c74117 */
+        /*1070*/                   FFMA R49, R70.reuse, R76.reuse, R49;                         /* 0x5980188004c74631 */
+        /*1078*/                   FFMA R48, R70, R77.reuse, R48;                               /* 0x5980180004d74630 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1088*/                   FFMA R50, R68.reuse, R77.reuse, R50;                         /* 0x5980190004d74432 */
+        /*1090*/                   FFMA R51, R68, R76.reuse, R51;                               /* 0x5980198004c74433 */
+        /*1098*/                   FFMA R53, R71.reuse, R76.reuse, R53;                         /* 0x59801a8004c74735 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*10a8*/                   FFMA R52, R71.reuse, R77.reuse, R52;                         /* 0x59801a0004d74734 */
+        /*10b0*/                   FFMA R54, R69.reuse, R77, R54;                               /* 0x59801b0004d74536 */
+        /*10b8*/                   FFMA R55, R69.reuse, R76, R55;                               /* 0x59801b8004c74537 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*10c8*/                   FFMA R61, R71.reuse, R78.reuse, R61;                         /* 0x59801e8004e7473d */
+        /*10d0*/                   FFMA R60, R71, R79.reuse, R60;                               /* 0x59801e0004f7473c */
+        /*10d8*/                   FFMA R62, R69.reuse, R79.reuse, R62;                         /* 0x59801f0004f7453e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*10e8*/                   FFMA R63, R69, R78.reuse, R63;                               /* 0x59801f8004e7453f */
+        /*10f0*/                   FFMA R57, R70.reuse, R78.reuse, R57;                         /* 0x59801c8004e74639 */
+        /*10f8*/                   FFMA R56, R70, R79.reuse, R56;                               /* 0x59801c0004f74638 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1108*/                   FFMA R58, R68.reuse, R79.reuse, R58;                         /* 0x59801d0004f7443a */
+        /*1110*/                   FFMA R59, R68, R78.reuse, R59;                               /* 0x59801d8004e7443b */
+        /*1118*/                   FFMA R29, R67.reuse, R78.reuse, R29;                         /* 0x59800e8004e7431d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1128*/                   FFMA R28, R67, R79.reuse, R28;                               /* 0x59800e0004f7431c */
+        /*1130*/                   FFMA R30, R65.reuse, R79.reuse, R30;                         /* 0x59800f0004f7411e */
+        /*1138*/                   FFMA R31, R65, R78.reuse, R31;                               /* 0x59800f8004e7411f */
+                                                                                                /* 0x081fc480fe2607f1 */
+        /*1148*/                   FFMA R25, R66.reuse, R78.reuse, R25;                         /* 0x59800c8004e74219 */
+        /*1150*/                   FFMA R24, R66, R79.reuse, R24;                               /* 0x59800c0004f74218 */
+        /*1158*/                   FFMA R26, R64.reuse, R79, R26;                               /* 0x59800d0004f7401a */
+                                                                                                /* 0x001fc4c1fe0007f1 */
+        /*1168*/                   FFMA R27, R64, R78, R27;                                     /* 0x59800d8004e7401b */
+        /*1170*/         {         FFMA R1, R82.reuse, R88.reuse, R1;                           /* 0x5980008005875201 */
+        /*1178*/                   LDS.U.128 R64, [R114+0x600];        }                        /* 0xef4e100060077240 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*1188*/                   FFMA R0, R82, R89.reuse, R0;                                 /* 0x5980000005975200 */
+        /*1190*/         {         FFMA R2, R80.reuse, R89.reuse, R2;                           /* 0x5980010005975002 */
+        /*1198*/                   LDS.U.128 R72, [R115+0x600];        }                        /* 0xef4e100060077348 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*11a8*/                   FFMA R3, R80, R88.reuse, R3;                                 /* 0x5980018005875003 */
+        /*11b0*/         {         FFMA R5, R83.reuse, R88.reuse, R5;                           /* 0x5980028005875305 */
+        /*11b8*/                   LDS.U.128 R68, [R114+0x680];        }                        /* 0xef4e100068077244 */
+                                                                                                /* 0x001c44c0fe0407f1 */
+        /*11c8*/                   FFMA R4, R83, R89.reuse, R4;                                 /* 0x5980020005975304 */
+        /*11d0*/         {         FFMA R6, R81.reuse, R89.reuse, R6;                           /* 0x5980030005975106 */
+        /*11d8*/                   LDS.U.128 R76, [R115+0x680];        }                        /* 0xef4e10006807734c */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*11e8*/                   FFMA R7, R81, R88.reuse, R7;                                 /* 0x5980038005875107 */
+        /*11f0*/                   FFMA R33, R86.reuse, R88.reuse, R33;                         /* 0x5980108005875621 */
+        /*11f8*/                   FFMA R32, R86, R89.reuse, R32;                               /* 0x5980100005975620 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1208*/                   FFMA R34, R84.reuse, R89.reuse, R34;                         /* 0x5980110005975422 */
+        /*1210*/                   FFMA R35, R84, R88.reuse, R35;                               /* 0x5980118005875423 */
+        /*1218*/                   FFMA R37, R87.reuse, R88.reuse, R37;                         /* 0x5980128005875725 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*1228*/                   FFMA R36, R87.reuse, R89.reuse, R36;                         /* 0x5980120005975724 */
+        /*1230*/                   FFMA R38, R85.reuse, R89, R38;                               /* 0x5980130005975526 */
+        /*1238*/                   FFMA R39, R85.reuse, R88, R39;                               /* 0x5980138005875527 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1248*/                   FFMA R45, R87.reuse, R90.reuse, R45;                         /* 0x5980168005a7572d */
+        /*1250*/                   FFMA R44, R87, R91.reuse, R44;                               /* 0x5980160005b7572c */
+        /*1258*/                   FFMA R46, R85.reuse, R91.reuse, R46;                         /* 0x5980170005b7552e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1268*/                   FFMA R47, R85, R90.reuse, R47;                               /* 0x5980178005a7552f */
+        /*1270*/                   FFMA R41, R86.reuse, R90.reuse, R41;                         /* 0x5980148005a75629 */
+        /*1278*/                   FFMA R40, R86, R91.reuse, R40;                               /* 0x5980140005b75628 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1288*/                   FFMA R42, R84.reuse, R91.reuse, R42;                         /* 0x5980150005b7542a */
+        /*1290*/                   FFMA R43, R84, R90.reuse, R43;                               /* 0x5980158005a7542b */
+        /*1298*/                   FFMA R13, R83.reuse, R90.reuse, R13;                         /* 0x5980068005a7530d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*12a8*/                   FFMA R12, R83, R91.reuse, R12;                               /* 0x5980060005b7530c */
+        /*12b0*/                   FFMA R14, R81.reuse, R91.reuse, R14;                         /* 0x5980070005b7510e */
+        /*12b8*/                   FFMA R15, R81, R90.reuse, R15;                               /* 0x5980078005a7510f */
+                                                                                                /* 0x081fc0c0fe2607f1 */
+        /*12c8*/                   FFMA R9, R82.reuse, R90.reuse, R9;                           /* 0x5980048005a75209 */
+        /*12d0*/                   FFMA R8, R82.reuse, R91.reuse, R8;                           /* 0x5980040005b75208 */
+        /*12d8*/         {         FFMA R10, R80.reuse, R91, R10;                               /* 0x5980050005b7500a */
+        /*12e8*/               @P0 STS.128 [R118], R96;        }                                /* 0x181f8440fe2017f1 */
+                                                                                                /* 0xef5e000000007660 */
+        /*12f0*/                   FFMA R11, R80.reuse, R90, R11;                               /* 0x5980058005a7500b */
+        /*12f8*/                   FFMA R17, R82.reuse, R92.reuse, R17;                         /* 0x5980088005c75211 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*1308*/                   FFMA R16, R82, R93.reuse, R16;                               /* 0x5980080005d75210 */
+        /*1310*/         {         FFMA R18, R80.reuse, R93.reuse, R18;                         /* 0x5980090005d75012 */
+        /*1318*/               @P0 STS.128 [R118+0x200], R100;        }                         /* 0xef5e000020007664 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1328*/                   FFMA R19, R80, R92.reuse, R19;                               /* 0x5980098005c75013 */
+        /*1330*/                   FFMA R21, R83.reuse, R92.reuse, R21;                         /* 0x59800a8005c75315 */
+        /*1338*/                   FFMA R20, R83, R93.reuse, R20;                               /* 0x59800a0005d75314 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1348*/                   FFMA R22, R81.reuse, R93.reuse, R22;                         /* 0x59800b0005d75116 */
+        /*1350*/                   FFMA R23, R81, R92.reuse, R23;                               /* 0x59800b8005c75117 */
+        /*1358*/                   FFMA R49, R86.reuse, R92.reuse, R49;                         /* 0x5980188005c75631 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1368*/                   FFMA R48, R86, R93.reuse, R48;                               /* 0x5980180005d75630 */
+        /*1370*/                   FFMA R50, R84.reuse, R93.reuse, R50;                         /* 0x5980190005d75432 */
+        /*1378*/                   FFMA R51, R84, R92.reuse, R51;                               /* 0x5980198005c75433 */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*1388*/                   FFMA R53, R87.reuse, R92.reuse, R53;                         /* 0x59801a8005c75735 */
+        /*1390*/                   FFMA R52, R87.reuse, R93.reuse, R52;                         /* 0x59801a0005d75734 */
+        /*1398*/                   FFMA R54, R85.reuse, R93, R54;                               /* 0x59801b0005d75536 */
+                                                                                                /* 0x101fc4c0fe2207f1 */
+        /*13a8*/                   FFMA R55, R85.reuse, R92, R55;                               /* 0x59801b8005c75537 */
+        /*13b0*/                   FFMA R61, R87.reuse, R94.reuse, R61;                         /* 0x59801e8005e7573d */
+        /*13b8*/                   FFMA R60, R87, R95.reuse, R60;                               /* 0x59801e0005f7573c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*13c8*/                   FFMA R62, R85.reuse, R95.reuse, R62;                         /* 0x59801f0005f7553e */
+        /*13d0*/                   FFMA R63, R85, R94.reuse, R63;                               /* 0x59801f8005e7553f */
+        /*13d8*/                   FFMA R57, R86.reuse, R94.reuse, R57;                         /* 0x59801c8005e75639 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*13e8*/                   FFMA R56, R86, R95.reuse, R56;                               /* 0x59801c0005f75638 */
+        /*13f0*/                   FFMA R58, R84.reuse, R95.reuse, R58;                         /* 0x59801d0005f7543a */
+        /*13f8*/                   FFMA R59, R84, R94.reuse, R59;                               /* 0x59801d8005e7543b */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1408*/                   FFMA R29, R83.reuse, R94.reuse, R29;                         /* 0x59800e8005e7531d */
+        /*1410*/                   FFMA R28, R83, R95.reuse, R28;                               /* 0x59800e0005f7531c */
+        /*1418*/                   FFMA R30, R81.reuse, R95.reuse, R30;                         /* 0x59800f0005f7511e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1428*/                   FFMA R31, R81, R94.reuse, R31;                               /* 0x59800f8005e7511f */
+        /*1430*/                   FFMA R25, R82.reuse, R94.reuse, R25;                         /* 0x59800c8005e75219 */
+        /*1438*/                   FFMA R24, R82, R95.reuse, R24;                               /* 0x59800c0005f75218 */
+                                                                                                /* 0x183fc000fe2207f1 */
+        /*1448*/                   FFMA R26, R80.reuse, R95, R26;                               /* 0x59800d0005f7501a */
+        /*1450*/                   FFMA R27, R80, R94, R27;                                     /* 0x59800d8005e7501b */
+        /*1458*/         {         FFMA R1, R66.reuse, R72.reuse, R1;                           /* 0x5980008004874201 */
+        /*1468*/                   LDS.U.128 R80, [R114+0x700];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100070077250 */
+        /*1470*/                   FFMA R0, R66, R73.reuse, R0;                                 /* 0x5980000004974200 */
+        /*1478*/         {         FFMA R2, R64.reuse, R73.reuse, R2;                           /* 0x5980010004974002 */
+        /*1488*/                   LDS.U.128 R88, [R115+0x700];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100070077358 */
+        /*1490*/                   FFMA R3, R64, R72.reuse, R3;                                 /* 0x5980018004874003 */
+        /*1498*/         {         FFMA R5, R67.reuse, R72.reuse, R5;                           /* 0x5980028004874305 */
+        /*14a8*/                   LDS.U.128 R84, [R114+0x780];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100078077254 */
+        /*14b0*/                   FFMA R4, R67, R73.reuse, R4;                                 /* 0x5980020004974304 */
+        /*14b8*/         {         FFMA R6, R65.reuse, R73.reuse, R6;                           /* 0x5980030004974106 */
+        /*14c8*/                   LDS.U.128 R92, [R115+0x780];        }                        /* 0x181fc480fe200711 */
+                                                                                                /* 0xef4e10007807735c */
+        /*14d0*/                   FFMA R7, R65, R72.reuse, R7;                                 /* 0x5980038004874107 */
+        /*14d8*/                   FFMA R33, R70.reuse, R72.reuse, R33;                         /* 0x5980108004874621 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*14e8*/                   FFMA R32, R70, R73.reuse, R32;                               /* 0x5980100004974620 */
+        /*14f0*/                   FFMA R34, R68.reuse, R73.reuse, R34;                         /* 0x5980110004974422 */
+        /*14f8*/                   FFMA R35, R68, R72.reuse, R35;                               /* 0x5980118004874423 */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*1508*/                   FFMA R37, R71.reuse, R72.reuse, R37;                         /* 0x5980128004874725 */
+        /*1510*/                   FFMA R36, R71.reuse, R73.reuse, R36;                         /* 0x5980120004974724 */
+        /*1518*/                   FFMA R38, R69.reuse, R73, R38;                               /* 0x5980130004974526 */
+                                                                                                /* 0x101fc4c0fe2207f1 */
+        /*1528*/                   FFMA R39, R69.reuse, R72, R39;                               /* 0x5980138004874527 */
+        /*1530*/                   FFMA R45, R71.reuse, R74.reuse, R45;                         /* 0x5980168004a7472d */
+        /*1538*/                   FFMA R44, R71, R75.reuse, R44;                               /* 0x5980160004b7472c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1548*/                   FFMA R46, R69.reuse, R75.reuse, R46;                         /* 0x5980170004b7452e */
+        /*1550*/                   FFMA R47, R69, R74.reuse, R47;                               /* 0x5980178004a7452f */
+        /*1558*/                   FFMA R41, R70.reuse, R74.reuse, R41;                         /* 0x5980148004a74629 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1568*/                   FFMA R40, R70, R75.reuse, R40;                               /* 0x5980140004b74628 */
+        /*1570*/                   FFMA R42, R68.reuse, R75.reuse, R42;                         /* 0x5980150004b7442a */
+        /*1578*/                   FFMA R43, R68, R74.reuse, R43;                               /* 0x5980158004a7442b */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1588*/                   FFMA R13, R67.reuse, R74.reuse, R13;                         /* 0x5980068004a7430d */
+        /*1590*/                   FFMA R12, R67, R75.reuse, R12;                               /* 0x5980060004b7430c */
+        /*1598*/                   FFMA R14, R65.reuse, R75.reuse, R14;                         /* 0x5980070004b7410e */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*15a8*/                   FFMA R15, R65, R74.reuse, R15;                               /* 0x5980078004a7410f */
+        /*15b0*/                   FFMA R9, R66.reuse, R74.reuse, R9;                           /* 0x5980048004a74209 */
+        /*15b8*/                   FFMA R8, R66.reuse, R75.reuse, R8;                           /* 0x5980040004b74208 */
+                                                                                                /* 0x081fc404fe2207f0 */
+        /*15c8*/         {         FFMA R10, R64.reuse, R75, R10;                               /* 0x5980050004b7400a */
+        /*15d0*/               @P0 STS.128 [R118+0x400], R104;        }                         /* 0xef5e000040007668 */
+        /*15d8*/                   FFMA R11, R64.reuse, R74, R11;                               /* 0x5980058004a7400b */
+                                                                                                /* 0x181fc080fe2607e1 */
+        /*15e8*/                   FFMA R17, R66.reuse, R76.reuse, R17;                         /* 0x5980088004c74211 */
+        /*15f0*/                   FFMA R16, R66, R77.reuse, R16;                               /* 0x5980080004d74210 */
+        /*15f8*/         {         FFMA R18, R64.reuse, R77.reuse, R18;                         /* 0x5980090004d74012 */
+        /*1608*/               @P0 STS.128 [R118+0x600], R108;        }                         /* 0x181fc480fe2007f1 */
+                                                                                                /* 0xef5e00006000766c */
+        /*1610*/                   FFMA R19, R64, R76.reuse, R19;                               /* 0x5980098004c74013 */
+        /*1618*/                   FFMA R21, R67.reuse, R76.reuse, R21;                         /* 0x59800a8004c74315 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1628*/                   FFMA R20, R67, R77.reuse, R20;                               /* 0x59800a0004d74314 */
+        /*1630*/                   FFMA R22, R65.reuse, R77.reuse, R22;                         /* 0x59800b0004d74116 */
+        /*1638*/                   FFMA R23, R65, R76.reuse, R23;                               /* 0x59800b8004c74117 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1648*/                   FFMA R49, R70.reuse, R76.reuse, R49;                         /* 0x5980188004c74631 */
+        /*1650*/                   FFMA R48, R70, R77.reuse, R48;                               /* 0x5980180004d74630 */
+        /*1658*/                   FFMA R50, R68.reuse, R77.reuse, R50;                         /* 0x5980190004d74432 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*1668*/                   FFMA R51, R68, R76.reuse, R51;                               /* 0x5980198004c74433 */
+        /*1670*/                   FFMA R53, R71.reuse, R76.reuse, R53;                         /* 0x59801a8004c74735 */
+        /*1678*/                   FFMA R52, R71.reuse, R77.reuse, R52;                         /* 0x59801a0004d74734 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*1688*/                   FFMA R54, R69.reuse, R77, R54;                               /* 0x59801b0004d74536 */
+        /*1690*/                   FFMA R55, R69.reuse, R76, R55;                               /* 0x59801b8004c74537 */
+        /*1698*/                   FFMA R61, R71.reuse, R78.reuse, R61;                         /* 0x59801e8004e7473d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*16a8*/                   FFMA R60, R71, R79.reuse, R60;                               /* 0x59801e0004f7473c */
+        /*16b0*/                   FFMA R62, R69.reuse, R79.reuse, R62;                         /* 0x59801f0004f7453e */
+        /*16b8*/                   FFMA R63, R69, R78.reuse, R63;                               /* 0x59801f8004e7453f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*16c8*/                   FFMA R57, R70.reuse, R78.reuse, R57;                         /* 0x59801c8004e74639 */
+        /*16d0*/                   FFMA R56, R70, R79.reuse, R56;                               /* 0x59801c0004f74638 */
+        /*16d8*/                   FFMA R58, R68.reuse, R79.reuse, R58;                         /* 0x59801d0004f7443a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*16e8*/                   FFMA R59, R68, R78.reuse, R59;                               /* 0x59801d8004e7443b */
+        /*16f0*/                   FFMA R29, R67.reuse, R78.reuse, R29;                         /* 0x59800e8004e7431d */
+        /*16f8*/                   FFMA R28, R67, R79.reuse, R28;                               /* 0x59800e0004f7431c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1708*/                   FFMA R30, R65.reuse, R79.reuse, R30;                         /* 0x59800f0004f7411e */
+        /*1710*/                   FFMA R31, R65, R78.reuse, R31;                               /* 0x59800f8004e7411f */
+        /*1718*/                   FFMA R25, R66.reuse, R78.reuse, R25;                         /* 0x59800c8004e74219 */
+                                                                                                /* 0x003fd440fe0407f1 */
+        /*1728*/                   FFMA R24, R66, R79.reuse, R24;                               /* 0x59800c0004f74218 */
+        /*1730*/         {         FFMA R26, R64.reuse, R79, R26;                               /* 0x59800d0004f7401a */
+        /*1738*/                   BAR.SYNC 0x0;        }                                       /* 0xf0a81b8000070000 */
+                                                                                                /* 0x001fc400fe2007f1 */
+        /*1748*/               @P0 LOP.XOR R114, R114, 0x1000;                                  /* 0x3847040100007272 */
+        /*1750*/               @P0 LOP.XOR R115, R115, 0x1000;                                  /* 0x3847040100007373 */
+        /*1758*/               @P0 LOP.XOR R118, R118, 0x1000;                                  /* 0x3847040100007676 */
+                                                                                                /* 0x001fc4c0fe0007f1 */
+        /*1768*/                   FFMA R27, R64, R78, R27;                                     /* 0x59800d8004e7401b */
+        /*1770*/         {         FFMA R1, R82.reuse, R88.reuse, R1;                           /* 0x5980008005875201 */
+        /*1778*/               @P0 LDS.U.128 R64, [R114];        }                              /* 0xef4e100000007240 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*1788*/                   FFMA R0, R82, R89.reuse, R0;                                 /* 0x5980000005975200 */
+        /*1790*/         {         FFMA R2, R80.reuse, R89.reuse, R2;                           /* 0x5980010005975002 */
+        /*1798*/               @P0 LDS.U.128 R72, [R115];        }                              /* 0xef4e100000007348 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*17a8*/                   FFMA R3, R80, R88.reuse, R3;                                 /* 0x5980018005875003 */
+        /*17b0*/         {         FFMA R5, R83.reuse, R88.reuse, R5;                           /* 0x5980028005875305 */
+        /*17b8*/               @P0 LDS.U.128 R68, [R114+0x80];        }                         /* 0xef4e100008007244 */
+                                                                                                /* 0x001c44c0fe0407f1 */
+        /*17c8*/                   FFMA R4, R83, R89.reuse, R4;                                 /* 0x5980020005975304 */
+        /*17d0*/         {         FFMA R6, R81.reuse, R89.reuse, R6;                           /* 0x5980030005975106 */
+        /*17d8*/               @P0 LDS.U.128 R76, [R115+0x80];        }                         /* 0xef4e10000800734c */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*17e8*/                   FFMA R7, R81, R88.reuse, R7;                                 /* 0x5980038005875107 */
+        /*17f0*/                   FFMA R33, R86.reuse, R88.reuse, R33;                         /* 0x5980108005875621 */
+        /*17f8*/                   FFMA R32, R86, R89.reuse, R32;                               /* 0x5980100005975620 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1808*/                   FFMA R34, R84.reuse, R89.reuse, R34;                         /* 0x5980110005975422 */
+        /*1810*/                   FFMA R35, R84, R88.reuse, R35;                               /* 0x5980118005875423 */
+        /*1818*/                   FFMA R37, R87.reuse, R88.reuse, R37;                         /* 0x5980128005875725 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*1828*/                   FFMA R36, R87.reuse, R89.reuse, R36;                         /* 0x5980120005975724 */
+        /*1830*/                   FFMA R38, R85.reuse, R89, R38;                               /* 0x5980130005975526 */
+        /*1838*/                   FFMA R39, R85.reuse, R88, R39;                               /* 0x5980138005875527 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1848*/                   FFMA R45, R87.reuse, R90.reuse, R45;                         /* 0x5980168005a7572d */
+        /*1850*/                   FFMA R44, R87, R91.reuse, R44;                               /* 0x5980160005b7572c */
+        /*1858*/                   FFMA R46, R85.reuse, R91.reuse, R46;                         /* 0x5980170005b7552e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1868*/                   FFMA R47, R85, R90.reuse, R47;                               /* 0x5980178005a7552f */
+        /*1870*/                   FFMA R41, R86.reuse, R90.reuse, R41;                         /* 0x5980148005a75629 */
+        /*1878*/                   FFMA R40, R86, R91.reuse, R40;                               /* 0x5980140005b75628 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1888*/                   FFMA R42, R84.reuse, R91.reuse, R42;                         /* 0x5980150005b7542a */
+        /*1890*/                   FFMA R43, R84, R90.reuse, R43;                               /* 0x5980158005a7542b */
+        /*1898*/                   FFMA R13, R83.reuse, R90.reuse, R13;                         /* 0x5980068005a7530d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*18a8*/                   FFMA R12, R83, R91.reuse, R12;                               /* 0x5980060005b7530c */
+        /*18b0*/                   FFMA R14, R81.reuse, R91.reuse, R14;                         /* 0x5980070005b7510e */
+        /*18b8*/                   FFMA R15, R81, R90.reuse, R15;                               /* 0x5980078005a7510f */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*18c8*/                   FFMA R9, R82.reuse, R90.reuse, R9;                           /* 0x5980048005a75209 */
+        /*18d0*/                   FFMA R8, R82.reuse, R91.reuse, R8;                           /* 0x5980040005b75208 */
+        /*18d8*/                   FFMA R10, R80.reuse, R91, R10;                               /* 0x5980050005b7500a */
+                                                                                                /* 0x101fc4c0fc2207f1 */
+        /*18e8*/                   FFMA R11, R80.reuse, R90, R11;                               /* 0x5980058005a7500b */
+        /*18f0*/                   FFMA R17, R82.reuse, R92.reuse, R17;                         /* 0x5980088005c75211 */
+        /*18f8*/                   FFMA R16, R82, R93.reuse, R16;                               /* 0x5980080005d75210 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1908*/                   FFMA R18, R80.reuse, R93.reuse, R18;                         /* 0x5980090005d75012 */
+        /*1910*/                   FFMA R19, R80, R92.reuse, R19;                               /* 0x5980098005c75013 */
+        /*1918*/                   FFMA R21, R83.reuse, R92.reuse, R21;                         /* 0x59800a8005c75315 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1928*/                   FFMA R20, R83, R93.reuse, R20;                               /* 0x59800a0005d75314 */
+        /*1930*/                   FFMA R22, R81.reuse, R93.reuse, R22;                         /* 0x59800b0005d75116 */
+        /*1938*/                   FFMA R23, R81, R92.reuse, R23;                               /* 0x59800b8005c75117 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1948*/                   FFMA R49, R86.reuse, R92.reuse, R49;                         /* 0x5980188005c75631 */
+        /*1950*/                   FFMA R48, R86, R93.reuse, R48;                               /* 0x5980180005d75630 */
+        /*1958*/                   FFMA R50, R84.reuse, R93.reuse, R50;                         /* 0x5980190005d75432 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*1968*/                   FFMA R51, R84, R92.reuse, R51;                               /* 0x5980198005c75433 */
+        /*1970*/                   FFMA R53, R87.reuse, R92.reuse, R53;                         /* 0x59801a8005c75735 */
+        /*1978*/                   FFMA R52, R87.reuse, R93.reuse, R52;                         /* 0x59801a0005d75734 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*1988*/                   FFMA R54, R85.reuse, R93, R54;                               /* 0x59801b0005d75536 */
+        /*1990*/                   FFMA R55, R85.reuse, R92, R55;                               /* 0x59801b8005c75537 */
+        /*1998*/                   FFMA R61, R87.reuse, R94.reuse, R61;                         /* 0x59801e8005e7573d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*19a8*/                   FFMA R60, R87, R95.reuse, R60;                               /* 0x59801e0005f7573c */
+        /*19b0*/                   FFMA R62, R85.reuse, R95.reuse, R62;                         /* 0x59801f0005f7553e */
+        /*19b8*/                   FFMA R63, R85, R94.reuse, R63;                               /* 0x59801f8005e7553f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*19c8*/                   FFMA R57, R86.reuse, R94.reuse, R57;                         /* 0x59801c8005e75639 */
+        /*19d0*/                   FFMA R56, R86, R95.reuse, R56;                               /* 0x59801c0005f75638 */
+        /*19d8*/                   FFMA R58, R84.reuse, R95.reuse, R58;                         /* 0x59801d0005f7543a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*19e8*/                   FFMA R59, R84, R94.reuse, R59;                               /* 0x59801d8005e7543b */
+        /*19f0*/                   FFMA R29, R83.reuse, R94.reuse, R29;                         /* 0x59800e8005e7531d */
+        /*19f8*/                   FFMA R28, R83, R95.reuse, R28;                               /* 0x59800e0005f7531c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1a08*/                   FFMA R30, R81.reuse, R95.reuse, R30;                         /* 0x59800f0005f7511e */
+        /*1a10*/                   FFMA R31, R81, R94.reuse, R31;                               /* 0x59800f8005e7511f */
+        /*1a18*/                   FFMA R25, R82.reuse, R94.reuse, R25;                         /* 0x59800c8005e75219 */
+                                                                                                /* 0x001fc440fe2407f1 */
+        /*1a28*/                   FFMA R24, R82, R95.reuse, R24;                               /* 0x59800c0005f75218 */
+        /*1a30*/                   FFMA R26, R80.reuse, R95, R26;                               /* 0x59800d0005f7501a */
+        /*1a38*/                   FFMA R27, R80, R94, R27;                                     /* 0x59800d8005e7501b */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1a48*/               @P0 IADD R112, R112, R121.reuse;                                 /* 0x5c10000007907070 */
+        /*1a50*/               @P0 IADD R116, R116, R121.reuse;                                 /* 0x5c10000007907474 */
+        /*1a58*/               @P0 IADD R120, R120, R121.reuse;                                 /* 0x5c10000007907878 */
+                                                                                                /* 0x081fc400fca007f0 */
+        /*1a68*/         {     @P0 IADD R124, R124, R121;                                       /* 0x5c10000007907c7c */
+        /*1a70*/               @P0 BRA 0x310;        }                                          /* 0xe2400ffe8980000f */
+        /*1a78*/                   SHR.U32 R80, R123.reuse, 0x1;                                /* 0x3828000000177b50 */
+                                                                                                /* 0x001fc480fe2007f1 */
+        /*1a88*/                   MOV R81, c[0x0][0x158];                                      /* 0x4c98078005670051 */
+        /*1a90*/                   ISCADD R84, R125, R126.reuse, 0x6;                           /* 0x5c18030007e77d54 */
+        /*1a98*/                   MOV R72, c[0x0][0x15c];                                      /* 0x4c98078005770048 */
+                                                                                                /* 0x001fc400fe2007f1 */
+        /*1aa8*/                   ISCADD R92, R123, R126, 0x3;                                 /* 0x5c18018007e77b5c */
+        /*1ab0*/                   LOP.AND R114, R114, 0x7ff;                                   /* 0x384700007ff77272 */
+        /*1ab8*/                   ISCADD R80, R122, R80, 0x6;                                  /* 0x5c18030005077a50 */
+                                                                                                /* 0x001fc440fe2007f1 */
+        /*1ac8*/                   LOP.AND R115, R115, 0x7ff;                                   /* 0x384700007ff77373 */
+        /*1ad0*/                   SHL R77, R81.reuse, 0x2;                                     /* 0x384800000027514d */
+        /*1ad8*/                   ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT;                 /* 0x4b6303800517542f */
+                                                                                                /* 0x081fc400fe2207f1 */
+        /*1ae8*/                   SHL R89, R81.reuse, 0x4;                                     /* 0x3848000000475159 */
+        /*1af0*/                   FMUL R64, R3, R72;                                           /* 0x5c68000004870340 */
+        /*1af8*/                   SHL R91, R81.reuse, 0x5;                                     /* 0x384800000057515b */
+                                                                                                /* 0x001fc400fe2607f1 */
+        /*1b08*/                   XMAD.MRG R74, R80.reuse, R81.H1.reuse, RZ;                   /* 0x5b007fa80517504a */
+        /*1b10*/                   ISCADD R93, R115, R114, 0x4;                                 /* 0x5c1802000727735d */
+        /*1b18*/                   XMAD R73, R80, R81, R84;                                     /* 0x5b002a0005175049 */
+                                                                                                /* 0x001fc400fe2007f1 */
+        /*1b28*/                   SHL R92, R92, 0x2;                                           /* 0x3848000000275c5c */
+        /*1b30*/                   IADD R84, R84, 0x20;                                         /* 0x3810000002075454 */
+        /*1b38*/                   ISCADD R85, R81, -R89, 0x7;                                  /* 0x5c19038005975155 */
+                                                                                                /* 0x001fc480fe2407f1 */
+        /*1b48*/                   FMUL R65, R7, R72.reuse;                                     /* 0x5c68000004870741 */
+        /*1b50*/                   FMUL R66, R1, R72.reuse;                                     /* 0x5c68000004870142 */
+        /*1b58*/                   XMAD.PSL.CBCC R73, R80.H1, R74.H1, R73;                      /* 0x5b30249804a75049 */
+                                                                                                /* 0x101fc400fe2007f1 */
+        /*1b68*/                   IADD R80, R80, -0x1;                                         /* 0x3910007ffff75050 */
+        /*1b70*/                   ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT;                 /* 0x4b63038005175437 */
+        /*1b78*/                   FMUL R67, R5, R72.reuse;                                     /* 0x5c68000004870543 */
+                                                                                                /* 0x001fc480fe2407f1 */
+        /*1b88*/                   FMUL R68, R35, R72.reuse;                                    /* 0x5c68000004872344 */
+        /*1b90*/                   FMUL R69, R39, R72.reuse;                                    /* 0x5c68000004872745 */
+        /*1b98*/                   ISCADD R76, R73, c[0x0][0x140], 0x2;                         /* 0x4c1801000507494c */
+                                                                                                /* 0x001fc440fe2207f1 */
+        /*1ba8*/                   IADD R86, R80.reuse, 0x4;                                    /* 0x3810000000475056 */
+        /*1bb0*/                   IADD R87, R80.reuse, 0x8;                                    /* 0x3810000000875057 */
+        /*1bb8*/                   IADD R88, R80, 0xc;                                          /* 0x3810000000c75058 */
+                                                                                                /* 0x001f9800fe2407f1 */
+        /*1bc8*/                   FMUL R70, R33, R72.reuse;                                    /* 0x5c68000004872146 */
+        /*1bd0*/                   FMUL R71, R37, R72;                                          /* 0x5c68000004872547 */
+        /*1bd8*/                   IADD R76, R76, -R77;                                         /* 0x5c11000004d74c4c */
+                                                                                                /* 0x001fc080fca207f1 */
+        /*1be8*/                   IADD R75, R76.reuse, R89;                                    /* 0x5c10000005974c4b */
+        /*1bf0*/                   IADD R78, R76, R91.reuse;                                    /* 0x5c10000005b74c4e */
+        /*1bf8*/         {         IADD R79, R75, R91;                                          /* 0x5c10000005b74b4f */
+        /*1c08*/                   CAL 0x1f10;        }                                         /* 0x101fc482fe2007f5 */
+                                                                                                /* 0xe260000030000040 */
+        /*1c10*/                   FMUL R64, R2, R72.reuse;                                     /* 0x5c68000004870240 */
+        /*1c18*/                   FMUL R65, R6, R72.reuse;                                     /* 0x5c68000004870641 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1c28*/                   FMUL R66, R0, R72.reuse;                                     /* 0x5c68000004870042 */
+        /*1c30*/                   FMUL R67, R4, R72.reuse;                                     /* 0x5c68000004870443 */
+        /*1c38*/                   FMUL R68, R34, R72.reuse;                                    /* 0x5c68000004872244 */
+                                                                                                /* 0x001fc080fe2407f1 */
+        /*1c48*/                   FMUL R69, R38, R72.reuse;                                    /* 0x5c68000004872645 */
+        /*1c50*/                   FMUL R70, R32, R72.reuse;                                    /* 0x5c68000004872046 */
+        /*1c58*/         {         FMUL R71, R36, R72;                                          /* 0x5c68000004872447 */
+        /*1c68*/                   CAL 0x1f10;        }                                         /* 0x101fc482fe2007f5 */
+                                                                                                /* 0xe26000002a000040 */
+        /*1c70*/                   FMUL R64, R11, R72.reuse;                                    /* 0x5c68000004870b40 */
+        /*1c78*/                   FMUL R65, R15, R72.reuse;                                    /* 0x5c68000004870f41 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1c88*/                   FMUL R66, R9, R72.reuse;                                     /* 0x5c68000004870942 */
+        /*1c90*/                   FMUL R67, R13, R72.reuse;                                    /* 0x5c68000004870d43 */
+        /*1c98*/                   FMUL R68, R43, R72.reuse;                                    /* 0x5c68000004872b44 */
+                                                                                                /* 0x001fc080fe2407f1 */
+        /*1ca8*/                   FMUL R69, R47, R72.reuse;                                    /* 0x5c68000004872f45 */
+        /*1cb0*/                   FMUL R70, R41, R72.reuse;                                    /* 0x5c68000004872946 */
+        /*1cb8*/         {         FMUL R71, R45, R72;                                          /* 0x5c68000004872d47 */
+        /*1cc8*/                   CAL 0x1f10;        }                                         /* 0x101fc482fe2007f5 */
+                                                                                                /* 0xe260000024000040 */
+        /*1cd0*/                   FMUL R64, R10, R72.reuse;                                    /* 0x5c68000004870a40 */
+        /*1cd8*/                   FMUL R65, R14, R72.reuse;                                    /* 0x5c68000004870e41 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1ce8*/                   FMUL R66, R8, R72.reuse;                                     /* 0x5c68000004870842 */
+        /*1cf0*/                   FMUL R67, R12, R72.reuse;                                    /* 0x5c68000004870c43 */
+        /*1cf8*/                   FMUL R68, R42, R72.reuse;                                    /* 0x5c68000004872a44 */
+                                                                                                /* 0x001fc080fe2407f1 */
+        /*1d08*/                   FMUL R69, R46, R72.reuse;                                    /* 0x5c68000004872e45 */
+        /*1d10*/                   FMUL R70, R40, R72.reuse;                                    /* 0x5c68000004872846 */
+        /*1d18*/         {         FMUL R71, R44, R72;                                          /* 0x5c68000004872c47 */
+        /*1d28*/                   CAL 0x1f10;        }                                         /* 0x001fc400fe2007f5 */
+                                                                                                /* 0xe26000001e000040 */
+        /*1d30*/                   IADD R80, R80, 0x1c;                                         /* 0x3810000001c75050 */
+        /*1d38*/                   IADD R86, R86, 0x1c;                                         /* 0x3810000001c75656 */
+                                                                                                /* 0x105fc400fe2007f1 */
+        /*1d48*/                   IADD R87, R87, 0x1c;                                         /* 0x3810000001c75757 */
+        /*1d50*/                   IADD R88, R88, 0x1c;                                         /* 0x3810000001c75858 */
+        /*1d58*/                   IADD R76, R76, R85.reuse;                                    /* 0x5c10000005574c4c */
+                                                                                                /* 0x001fc480fe2407f1 */
+        /*1d68*/                   IADD R75, R75, R85.reuse;                                    /* 0x5c10000005574b4b */
+        /*1d70*/                   IADD R78, R78, R85.reuse;                                    /* 0x5c10000005574e4e */
+        /*1d78*/                   IADD R79, R79, R85;                                          /* 0x5c10000005574f4f */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1d88*/                   FMUL R64, R19, R72.reuse;                                    /* 0x5c68000004871340 */
+        /*1d90*/                   FMUL R65, R23, R72.reuse;                                    /* 0x5c68000004871741 */
+        /*1d98*/                   FMUL R66, R17, R72.reuse;                                    /* 0x5c68000004871142 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1da8*/                   FMUL R67, R21, R72.reuse;                                    /* 0x5c68000004871543 */
+        /*1db0*/                   FMUL R68, R51, R72.reuse;                                    /* 0x5c68000004873344 */
+        /*1db8*/                   FMUL R69, R55, R72.reuse;                                    /* 0x5c68000004873745 */
+                                                                                                /* 0x001fd400fe0407f1 */
+        /*1dc8*/                   FMUL R70, R49, R72.reuse;                                    /* 0x5c68000004873146 */
+        /*1dd0*/         {         FMUL R71, R53, R72;                                          /* 0x5c68000004873547 */
+        /*1dd8*/                   CAL 0x1f10;        }                                         /* 0xe260000013000040 */
+                                                                                                /* 0x101fc480fe2417f1 */
+        /*1de8*/                   FMUL R64, R18, R72.reuse;                                    /* 0x5c68000004871240 */
+        /*1df0*/                   FMUL R65, R22, R72.reuse;                                    /* 0x5c68000004871641 */
+        /*1df8*/                   FMUL R66, R16, R72.reuse;                                    /* 0x5c68000004871042 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1e08*/                   FMUL R67, R20, R72.reuse;                                    /* 0x5c68000004871443 */
+        /*1e10*/                   FMUL R68, R50, R72.reuse;                                    /* 0x5c68000004873244 */
+        /*1e18*/                   FMUL R69, R54, R72.reuse;                                    /* 0x5c68000004873645 */
+                                                                                                /* 0x001fd400fe0407f1 */
+        /*1e28*/                   FMUL R70, R48, R72.reuse;                                    /* 0x5c68000004873046 */
+        /*1e30*/         {         FMUL R71, R52, R72;                                          /* 0x5c68000004873447 */
+        /*1e38*/                   CAL 0x1f10;        }                                         /* 0xe26000000d000040 */
+                                                                                                /* 0x101fc480fe2417f1 */
+        /*1e48*/                   FMUL R64, R27, R72.reuse;                                    /* 0x5c68000004871b40 */
+        /*1e50*/                   FMUL R65, R31, R72.reuse;                                    /* 0x5c68000004871f41 */
+        /*1e58*/                   FMUL R66, R25, R72.reuse;                                    /* 0x5c68000004871942 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1e68*/                   FMUL R67, R29, R72.reuse;                                    /* 0x5c68000004871d43 */
+        /*1e70*/                   FMUL R68, R59, R72.reuse;                                    /* 0x5c68000004873b44 */
+        /*1e78*/                   FMUL R69, R63, R72.reuse;                                    /* 0x5c68000004873f45 */
+                                                                                                /* 0x001fd400fe0407f1 */
+        /*1e88*/                   FMUL R70, R57, R72.reuse;                                    /* 0x5c68000004873946 */
+        /*1e90*/         {         FMUL R71, R61, R72;                                          /* 0x5c68000004873d47 */
+        /*1e98*/                   CAL 0x1f10;        }                                         /* 0xe260000007000040 */
+                                                                                                /* 0x101fc480fe2417f1 */
+        /*1ea8*/                   FMUL R64, R26, R72.reuse;                                    /* 0x5c68000004871a40 */
+        /*1eb0*/                   FMUL R65, R30, R72.reuse;                                    /* 0x5c68000004871e41 */
+        /*1eb8*/                   FMUL R66, R24, R72.reuse;                                    /* 0x5c68000004871842 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1ec8*/                   FMUL R67, R28, R72.reuse;                                    /* 0x5c68000004871c43 */
+        /*1ed0*/                   FMUL R68, R58, R72.reuse;                                    /* 0x5c68000004873a44 */
+        /*1ed8*/                   FMUL R69, R62, R72.reuse;                                    /* 0x5c68000004873e45 */
+                                                                                                /* 0x001fd400fe0407f1 */
+        /*1ee8*/                   FMUL R70, R56, R72.reuse;                                    /* 0x5c68000004873846 */
+        /*1ef0*/         {         FMUL R71, R60, R72;                                          /* 0x5c68000004873c47 */
+        /*1ef8*/                   CAL 0x1f10;        }                                         /* 0xe260000001000040 */
+                                                                                                /* 0x001fc400fe0007f5 */
+        /*1f08*/                   EXIT;                                                        /* 0xe30000000007000f */
+        /*1f10*/         {         IADD R80, R80, 0x1;                                          /* 0x3810000000175050 */
+        /*1f18*/                   STS.128 [R93], R64;        }                                 /* 0xef5e000000075d40 */
+                                                                                                /* 0x001fc000fe2007f0 */
+        /*1f28*/         {         IADD R86, R86, 0x1;                                          /* 0x3810000000175656 */
+        /*1f30*/                   STS.128 [R93+0x80], R68;        }                            /* 0xef5e000008075d44 */
+        /*1f38*/         {         IADD R87, R87, 0x1;                                          /* 0x3810000000175757 */
+        /*1f48*/                   LDS R64, [R92];        }                                     /* 0x001fc400fe0007f1 */
+                                                                                                /* 0xef4c000000075c40 */
+        /*1f50*/         {         IADD R88, R88, 0x1;                                          /* 0x3810000000175858 */
+        /*1f58*/                   LDS R65, [R92+0x80];        }                                /* 0xef4c000008075c41 */
+                                                                                                /* 0x101fc000fe2407f0 */
+        /*1f68*/         {         IADD R76, R76, R77.reuse;                                    /* 0x5c10000004d74c4c */
+        /*1f70*/                   LDS R66, [R92+0x100];        }                               /* 0xef4c000010075c42 */
+        /*1f78*/         {         IADD R75, R75, R77.reuse;                                    /* 0x5c10000004d74b4b */
+        /*1f88*/                   LDS R67, [R92+0x180];        }                               /* 0x001fc480fe0007f1 */
+                                                                                                /* 0xef4c000018075c43 */
+        /*1f90*/         {         IADD R78, R78, R77.reuse;                                    /* 0x5c10000004d74e4e */
+        /*1f98*/                   LDS R68, [R92+0x200];        }                               /* 0xef4c000020075c44 */
+                                                                                                /* 0x081fc000fe2007f0 */
+        /*1fa8*/         {         IADD R79, R79, R77;                                          /* 0x5c10000004d74f4f */
+        /*1fb0*/                   LDS R69, [R92+0x280];        }                               /* 0xef4c000028075c45 */
+        /*1fb8*/         {         ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5;           /* 0x4b63028005275007 */
+        /*1fc8*/                   LDS R70, [R92+0x300];        }                               /* 0x001c4400fe0007f1 */
+                                                                                                /* 0xef4c000030075c46 */
+        /*1fd0*/         {         ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6;                 /* 0x4b6303000527500f */
+        /*1fd8*/                   LDS R71, [R92+0x380];        }                               /* 0xef4c000038075c47 */
+                                                                                                /* 0x003fc400fd2207f2 */
+        /*1fe8*/                   ISETP.LT.AND P2, PT, R86.reuse, c[0x0][0x148], P5;           /* 0x4b63028005275617 */
+        /*1ff0*/                   ISETP.LT.AND P3, PT, R86, c[0x0][0x148], P6;                 /* 0x4b6303000527561f */
+        /*1ff8*/               @P0 STG.CG [R76], R64;                                           /* 0xeedc400000004c40 */
+                                                                                                /* 0x001fc000fe2207f0 */
+        /*2008*/         {         ISETP.LT.AND P0, PT, R87.reuse, c[0x0][0x148], P5;           /* 0x4b63028005275707 */
+        /*2010*/               @P1 STG.CG [R76+0x80], R65;        }                             /* 0xeedc400008014c41 */
+        /*2018*/         {         ISETP.LT.AND P1, PT, R87, c[0x0][0x148], P6;                 /* 0x4b6303000527570f */
+        /*2028*/               @P2 STG.CG [R75], R66;        }                                  /* 0x001fc440fe2007f1 */
+                                                                                                /* 0xeedc400000024b42 */
+        /*2030*/                   ISETP.LT.AND P2, PT, R88.reuse, c[0x0][0x148], P5;           /* 0x4b63028005275817 */
+        /*2038*/               @P3 STG.CG [R75+0x80], R67;                                      /* 0xeedc400008034b43 */
+                                                                                                /* 0x001fc400fe2007e9 */
+        /*2048*/                   ISETP.LT.AND P3, PT, R88, c[0x0][0x148], P6;                 /* 0x4b6303000527581f */
+        /*2050*/               @P0 STG.CG [R78], R68;                                           /* 0xeedc400000004e44 */
+        /*2058*/               @P1 STG.CG [R78+0x80], R69;                                      /* 0xeedc400008014e45 */
+                                                                                                /* 0x001fd4003e2007f2 */
+        /*2068*/               @P2 STG.CG [R79], R70;                                           /* 0xeedc400000024f46 */
+        /*2070*/               @P3 STG.CG [R79+0x80], R71;                                      /* 0xeedc400008034f47 */
+        /*2078*/                   RET;                                                         /* 0xe32000000007000f */
+                                                                                                /* 0x001f8000fc0007ff */
+        /*2088*/                   BRA 0x2088;                                                  /* 0xe2400fffff87000f */
+        /*2090*/                   NOP;                                                         /* 0x50b0000000070f00 */
+        /*2098*/                   NOP;                                                         /* 0x50b0000000070f00 */
+                                                                                                /* 0x001f8000fc0007e0 */
+        /*20a8*/                   NOP;                                                         /* 0x50b0000000070f00 */
+        /*20b0*/                   NOP;                                                         /* 0x50b0000000070f00 */
+        /*20b8*/                   NOP;                                                         /* 0x50b0000000070f00 */
+		................................
+
+
diff --git a/Assembler/MaxAs/t/MaxAs-MaxAs.t b/Assembler/MaxAs/t/MaxAs-MaxAs.t
new file mode 100644
index 0000000..ad9e988
--- /dev/null
+++ b/Assembler/MaxAs/t/MaxAs-MaxAs.t
@@ -0,0 +1,5 @@
+use strict;
+use warnings;
+
+use Test::More tests => 1;          # fixed plan: exactly one check is expected
+BEGIN { use_ok('MaxAs::MaxAs') };   # smoke test: the module must load, checked at compile time
diff --git a/Assembler/PascalAs/Changes b/Assembler/PascalAs/Changes
new file mode 100644
index 0000000..a6d8a13
--- /dev/null
+++ b/Assembler/PascalAs/Changes
@@ -0,0 +1,4 @@
+Revision history for Perl extension PascalAs::PascalAs.
+
+1.01  Thu Mar 26 17:09:57 2015
+	- original Perl packaged version
diff --git a/Assembler/PascalAs/Install.sh b/Assembler/PascalAs/Install.sh
new file mode 100755
index 0000000..57c8d24
--- /dev/null
+++ b/Assembler/PascalAs/Install.sh
@@ -0,0 +1,3 @@
+perl Makefile.PL &&     # generate the Makefile; stop here if configuration fails
+make &&                 # build the distribution; stop here if the build fails
+sudo make install       # install (as root) only after both steps above succeeded
diff --git a/Assembler/PascalAs/LICENSE b/Assembler/PascalAs/LICENSE
new file mode 100644
index 0000000..6c28fad
--- /dev/null
+++ b/Assembler/PascalAs/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Scott Gray
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Assembler/PascalAs/MANIFEST b/Assembler/PascalAs/MANIFEST
new file mode 100644
index 0000000..a25084c
--- /dev/null
+++ b/Assembler/PascalAs/MANIFEST
@@ -0,0 +1,38 @@
+bin/pascalas.pl
+Changes
+lib/PascalAs/Cubin.pm
+lib/PascalAs/PascalAs.pm
+lib/PascalAs/PascalAsGrammar.pm
+LICENSE
+Makefile.PL
+MANIFEST
+microbench/microbench.cpp
+microbench/microbench.cu
+microbench/microbench.sass
+microbench/shared.pl
+microbench/shared_lds.sass
+microbench/shared_sts16.sass
+microbench/throughput.pl
+microbench/throughput.sass
+microbench/throughput2.pl
+microbench/throughput2.sass
+microbench/throughput3.pl
+microbench/throughput4.pl
+microbench/throughput5.pl
+microbench/xmad.pl
+microbench/xmad2.sass
+README.md
+sgemm/batched_gemm.xlsx
+sgemm/cublas_sgemm.ptx
+sgemm/sgemm.cpp
+sgemm/sgemm.cu
+sgemm/sgemm.pl
+sgemm/sgemm.sln
+sgemm/sgemm.vcxproj
+sgemm/sgemm128.sass
+sgemm/sgemm64.sass
+sgemm/sgemm_final_128.sass
+sgemm/sgemm_final_64.sass
+sgemm/sgemm_pre_128.sass
+sgemm/sgemm_pre_64.sass
+t/MaxAs-MaxAs.t
diff --git a/Assembler/PascalAs/MYMETA.json b/Assembler/PascalAs/MYMETA.json
new file mode 100644
index 0000000..ee7458f
--- /dev/null
+++ b/Assembler/PascalAs/MYMETA.json
@@ -0,0 +1,42 @@
+{
+   "abstract" : "Assembler for NVIDIA Maxwell architecture",
+   "author" : [
+      "Scott Gray <sgray@nervanasys.com>"
+   ],
+   "dynamic_config" : 0,
+   "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001",
+   "license" : [
+      "mit"
+   ],
+   "meta-spec" : {
+      "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
+      "version" : "2"
+   },
+   "name" : "PascalAs-PascalAs",
+   "no_index" : {
+      "directory" : [
+         "t",
+         "inc"
+      ]
+   },
+   "prereqs" : {
+      "build" : {
+         "requires" : {
+            "ExtUtils::MakeMaker" : "0"
+         }
+      },
+      "configure" : {
+         "requires" : {
+            "ExtUtils::MakeMaker" : "0"
+         }
+      },
+      "runtime" : {
+         "requires" : {
+            "Carp" : "1.29",
+            "Data::Dumper" : "2.145"
+         }
+      }
+   },
+   "release_status" : "stable",
+   "version" : "1.06"
+}
diff --git a/Assembler/PascalAs/MYMETA.yml b/Assembler/PascalAs/MYMETA.yml
new file mode 100644
index 0000000..77a3de3
--- /dev/null
+++ b/Assembler/PascalAs/MYMETA.yml
@@ -0,0 +1,23 @@
+---
+abstract: 'Assembler for NVIDIA Maxwell architecture'
+author:
+  - 'Scott Gray <sgray@nervanasys.com>'
+build_requires:
+  ExtUtils::MakeMaker: '0'
+configure_requires:
+  ExtUtils::MakeMaker: '0'
+dynamic_config: 0
+generated_by: 'ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001'
+license: mit
+meta-spec:
+  url: http://module-build.sourceforge.net/META-spec-v1.4.html
+  version: '1.4'
+name: PascalAs-PascalAs
+no_index:
+  directory:
+    - t
+    - inc
+requires:
+  Carp: '1.29'
+  Data::Dumper: '2.145'
+version: '1.06'
diff --git a/Assembler/PascalAs/Makefile b/Assembler/PascalAs/Makefile
new file mode 100644
index 0000000..bef3fb2
--- /dev/null
+++ b/Assembler/PascalAs/Makefile
@@ -0,0 +1,878 @@
+# This Makefile is for the PascalAs::PascalAs extension to perl.
+#
+# It was generated automatically by MakeMaker version
+# 7.0401 (Revision: 70401) from the contents of
+# Makefile.PL. Don't edit this file, edit Makefile.PL instead.
+#
+#       ANY CHANGES MADE HERE WILL BE LOST!
+#
+#   MakeMaker ARGV: ()
+#
+
+#   MakeMaker Parameters:
+
+#     ABSTRACT_FROM => q[lib/PascalAs/PascalAs.pm]
+#     AUTHOR => [q[Scott Gray <sgray@nervanasys.com>]]
+#     BUILD_REQUIRES => {  }
+#     CONFIGURE_REQUIRES => {  }
+#     EXE_FILES => [q[bin/pascalas.pl]]
+#     LICENSE => q[MIT]
+#     NAME => q[PascalAs::PascalAs]
+#     PREREQ_PM => { Carp=>q[1.29], Data::Dumper=>q[2.145] }
+#     TEST_REQUIRES => {  }
+#     VERSION_FROM => q[lib/PascalAs/PascalAs.pm]
+
+# --- MakeMaker post_initialize section:
+
+
+# --- MakeMaker const_config section:
+
+# These definitions are from config.sh (via /usr/lib/x86_64-linux-gnu/perl/5.22/Config.pm).
+# They may have been overridden via Makefile.PL or on the command line.
+AR = ar
+CC = x86_64-linux-gnu-gcc
+CCCDLFLAGS = -fPIC
+CCDLFLAGS = -Wl,-E
+DLEXT = so
+DLSRC = dl_dlopen.xs
+EXE_EXT = 
+FULL_AR = /usr/bin/ar
+LD = x86_64-linux-gnu-gcc
+LDDLFLAGS = -shared -L/usr/local/lib -fstack-protector-strong
+LDFLAGS =  -fstack-protector-strong -L/usr/local/lib
+LIBC = libc-2.21.so
+LIB_EXT = .a
+OBJ_EXT = .o
+OSNAME = linux
+OSVERS = 3.16.0
+RANLIB = :
+SITELIBEXP = /usr/local/share/perl/5.22.1
+SITEARCHEXP = /usr/local/lib/x86_64-linux-gnu/perl/5.22.1
+SO = so
+VENDORARCHEXP = /usr/lib/x86_64-linux-gnu/perl5/5.22
+VENDORLIBEXP = /usr/share/perl5
+
+
+# --- MakeMaker constants section:
+AR_STATIC_ARGS = cr
+DIRFILESEP = /
+DFSEP = $(DIRFILESEP)
+NAME = PascalAs::PascalAs
+NAME_SYM = PascalAs_PascalAs
+VERSION = 1.06
+VERSION_MACRO = VERSION
+VERSION_SYM = 1_06
+DEFINE_VERSION = -D$(VERSION_MACRO)=\"$(VERSION)\"
+XS_VERSION = 1.06
+XS_VERSION_MACRO = XS_VERSION
+XS_DEFINE_VERSION = -D$(XS_VERSION_MACRO)=\"$(XS_VERSION)\"
+INST_ARCHLIB = blib/arch
+INST_SCRIPT = blib/script
+INST_BIN = blib/bin
+INST_LIB = blib/lib
+INST_MAN1DIR = blib/man1
+INST_MAN3DIR = blib/man3
+MAN1EXT = 1p
+MAN3EXT = 3pm
+INSTALLDIRS = site
+DESTDIR = 
+PREFIX = $(SITEPREFIX)
+PERLPREFIX = /usr
+SITEPREFIX = /usr/local
+VENDORPREFIX = /usr
+INSTALLPRIVLIB = /usr/share/perl/5.22
+DESTINSTALLPRIVLIB = $(DESTDIR)$(INSTALLPRIVLIB)
+INSTALLSITELIB = /usr/local/share/perl/5.22.1
+DESTINSTALLSITELIB = $(DESTDIR)$(INSTALLSITELIB)
+INSTALLVENDORLIB = /usr/share/perl5
+DESTINSTALLVENDORLIB = $(DESTDIR)$(INSTALLVENDORLIB)
+INSTALLARCHLIB = /usr/lib/x86_64-linux-gnu/perl/5.22
+DESTINSTALLARCHLIB = $(DESTDIR)$(INSTALLARCHLIB)
+INSTALLSITEARCH = /usr/local/lib/x86_64-linux-gnu/perl/5.22.1
+DESTINSTALLSITEARCH = $(DESTDIR)$(INSTALLSITEARCH)
+INSTALLVENDORARCH = /usr/lib/x86_64-linux-gnu/perl5/5.22
+DESTINSTALLVENDORARCH = $(DESTDIR)$(INSTALLVENDORARCH)
+INSTALLBIN = /usr/bin
+DESTINSTALLBIN = $(DESTDIR)$(INSTALLBIN)
+INSTALLSITEBIN = /usr/local/bin
+DESTINSTALLSITEBIN = $(DESTDIR)$(INSTALLSITEBIN)
+INSTALLVENDORBIN = /usr/bin
+DESTINSTALLVENDORBIN = $(DESTDIR)$(INSTALLVENDORBIN)
+INSTALLSCRIPT = /usr/bin
+DESTINSTALLSCRIPT = $(DESTDIR)$(INSTALLSCRIPT)
+INSTALLSITESCRIPT = /usr/local/bin
+DESTINSTALLSITESCRIPT = $(DESTDIR)$(INSTALLSITESCRIPT)
+INSTALLVENDORSCRIPT = /usr/bin
+DESTINSTALLVENDORSCRIPT = $(DESTDIR)$(INSTALLVENDORSCRIPT)
+INSTALLMAN1DIR = /usr/share/man/man1
+DESTINSTALLMAN1DIR = $(DESTDIR)$(INSTALLMAN1DIR)
+INSTALLSITEMAN1DIR = /usr/local/man/man1
+DESTINSTALLSITEMAN1DIR = $(DESTDIR)$(INSTALLSITEMAN1DIR)
+INSTALLVENDORMAN1DIR = /usr/share/man/man1
+DESTINSTALLVENDORMAN1DIR = $(DESTDIR)$(INSTALLVENDORMAN1DIR)
+INSTALLMAN3DIR = /usr/share/man/man3
+DESTINSTALLMAN3DIR = $(DESTDIR)$(INSTALLMAN3DIR)
+INSTALLSITEMAN3DIR = /usr/local/man/man3
+DESTINSTALLSITEMAN3DIR = $(DESTDIR)$(INSTALLSITEMAN3DIR)
+INSTALLVENDORMAN3DIR = /usr/share/man/man3
+DESTINSTALLVENDORMAN3DIR = $(DESTDIR)$(INSTALLVENDORMAN3DIR)
+PERL_LIB = /usr/share/perl/5.22
+PERL_ARCHLIB = /usr/lib/x86_64-linux-gnu/perl/5.22
+PERL_ARCHLIBDEP = /usr/lib/x86_64-linux-gnu/perl/5.22
+LIBPERL_A = libperl.a
+FIRST_MAKEFILE = Makefile
+MAKEFILE_OLD = Makefile.old
+MAKE_APERL_FILE = Makefile.aperl
+PERLMAINCC = $(CC)
+PERL_INC = /usr/lib/x86_64-linux-gnu/perl/5.22/CORE
+PERL_INCDEP = /usr/lib/x86_64-linux-gnu/perl/5.22/CORE
+PERL = "/usr/bin/perl"
+FULLPERL = "/usr/bin/perl"
+ABSPERL = $(PERL)
+PERLRUN = $(PERL)
+FULLPERLRUN = $(FULLPERL)
+ABSPERLRUN = $(ABSPERL)
+PERLRUNINST = $(PERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
+FULLPERLRUNINST = $(FULLPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
+ABSPERLRUNINST = $(ABSPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)"
+PERL_CORE = 0
+PERM_DIR = 755
+PERM_RW = 644
+PERM_RWX = 755
+
+MAKEMAKER   = /usr/share/perl/5.22/ExtUtils/MakeMaker.pm
+MM_VERSION  = 7.0401
+MM_REVISION = 70401
+
+# FULLEXT = Pathname for extension directory (eg Foo/Bar/Oracle).
+# BASEEXT = Basename part of FULLEXT. May be just equal FULLEXT. (eg Oracle)
+# PARENT_NAME = NAME without BASEEXT and no trailing :: (eg Foo::Bar)
+# DLBASE  = Basename part of dynamic library. May be just equal BASEEXT.
+MAKE = make
+FULLEXT = PascalAs/PascalAs
+BASEEXT = PascalAs
+PARENT_NAME = PascalAs
+DLBASE = $(BASEEXT)
+VERSION_FROM = lib/PascalAs/PascalAs.pm
+OBJECT = 
+LDFROM = $(OBJECT)
+LINKTYPE = dynamic
+BOOTDEP = 
+
+# Handy lists of source code files:
+XS_FILES = 
+C_FILES  = 
+O_FILES  = 
+H_FILES  = 
+MAN1PODS = 
+MAN3PODS = lib/PascalAs/PascalAs.pm
+
+# Where is the Config information that we are using/depend on
+CONFIGDEP = $(PERL_ARCHLIBDEP)$(DFSEP)Config.pm $(PERL_INCDEP)$(DFSEP)config.h
+
+# Where to build things
+INST_LIBDIR      = $(INST_LIB)/PascalAs
+INST_ARCHLIBDIR  = $(INST_ARCHLIB)/PascalAs
+
+INST_AUTODIR     = $(INST_LIB)/auto/$(FULLEXT)
+INST_ARCHAUTODIR = $(INST_ARCHLIB)/auto/$(FULLEXT)
+
+INST_STATIC      = 
+INST_DYNAMIC     = 
+INST_BOOT        = 
+
+# Extra linker info
+EXPORT_LIST        = 
+PERL_ARCHIVE       = 
+PERL_ARCHIVEDEP    = 
+PERL_ARCHIVE_AFTER = 
+
+
+TO_INST_PM = lib/PascalAs/Cubin.pm \
+	lib/PascalAs/PascalAs.pm \
+	lib/PascalAs/PascalAsGrammar.pm
+
+PM_TO_BLIB = lib/PascalAs/Cubin.pm \
+	blib/lib/PascalAs/Cubin.pm \
+	lib/PascalAs/PascalAs.pm \
+	blib/lib/PascalAs/PascalAs.pm \
+	lib/PascalAs/PascalAsGrammar.pm \
+	blib/lib/PascalAs/PascalAsGrammar.pm
+
+
+# --- MakeMaker platform_constants section:
+MM_Unix_VERSION = 7.0401
+PERL_MALLOC_DEF = -DPERL_EXTMALLOC_DEF -Dmalloc=Perl_malloc -Dfree=Perl_mfree -Drealloc=Perl_realloc -Dcalloc=Perl_calloc
+
+
+# --- MakeMaker tool_autosplit section:
+# Usage: $(AUTOSPLITFILE) FileToSplit AutoDirToSplitInto
+AUTOSPLITFILE = $(ABSPERLRUN)  -e 'use AutoSplit;  autosplit($$$$ARGV[0], $$$$ARGV[1], 0, 1, 1)' --
+
+
+
+# --- MakeMaker tool_xsubpp section:
+
+
+# --- MakeMaker tools_other section:
+SHELL = /bin/sh
+CHMOD = chmod
+CP = cp
+MV = mv
+NOOP = $(TRUE)
+NOECHO = @
+RM_F = rm -f
+RM_RF = rm -rf
+TEST_F = test -f
+TOUCH = touch
+UMASK_NULL = umask 0
+DEV_NULL = > /dev/null 2>&1
+MKPATH = $(ABSPERLRUN) -MExtUtils::Command -e 'mkpath' --
+EQUALIZE_TIMESTAMP = $(ABSPERLRUN) -MExtUtils::Command -e 'eqtime' --
+FALSE = false
+TRUE = true
+ECHO = echo
+ECHO_N = echo -n
+UNINST = 0
+VERBINST = 0
+MOD_INSTALL = $(ABSPERLRUN) -MExtUtils::Install -e 'install([ from_to => {@ARGV}, verbose => '\''$(VERBINST)'\'', uninstall_shadows => '\''$(UNINST)'\'', dir_mode => '\''$(PERM_DIR)'\'' ]);' --
+DOC_INSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'perllocal_install' --
+UNINSTALL = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'uninstall' --
+WARN_IF_OLD_PACKLIST = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'warn_if_old_packlist' --
+MACROSTART = 
+MACROEND = 
+USEMAKEFILE = -f
+FIXIN = $(ABSPERLRUN) -MExtUtils::MY -e 'MY->fixin(shift)' --
+CP_NONEMPTY = $(ABSPERLRUN) -MExtUtils::Command::MM -e 'cp_nonempty' --
+
+
+# --- MakeMaker makemakerdflt section:
+makemakerdflt : all
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker dist section:
+TAR = tar
+TARFLAGS = cvf
+ZIP = zip
+ZIPFLAGS = -r
+COMPRESS = gzip --best
+SUFFIX = .gz
+SHAR = shar
+PREOP = $(NOECHO) $(NOOP)
+POSTOP = $(NOECHO) $(NOOP)
+TO_UNIX = $(NOECHO) $(NOOP)
+CI = ci -u
+RCS_LABEL = rcs -Nv$(VERSION_SYM): -q
+DIST_CP = best
+DIST_DEFAULT = tardist
+DISTNAME = PascalAs-PascalAs
+DISTVNAME = PascalAs-PascalAs-1.06
+
+
+# --- MakeMaker macro section:
+
+
+# --- MakeMaker depend section:
+
+
+# --- MakeMaker cflags section:
+
+
+# --- MakeMaker const_loadlibs section:
+
+
+# --- MakeMaker const_cccmd section:
+
+
+# --- MakeMaker post_constants section:
+
+
+# --- MakeMaker pasthru section:
+
+PASTHRU = LIBPERL_A="$(LIBPERL_A)"\
+	LINKTYPE="$(LINKTYPE)"\
+	LD="$(LD)"\
+	PREFIX="$(PREFIX)"
+
+
+# --- MakeMaker special_targets section:
+.SUFFIXES : .xs .c .C .cpp .i .s .cxx .cc $(OBJ_EXT)
+
+.PHONY: all config static dynamic test linkext manifest blibdirs clean realclean disttest distdir
+
+
+
+# --- MakeMaker c_o section:
+
+
+# --- MakeMaker xs_c section:
+
+
+# --- MakeMaker xs_o section:
+
+
+# --- MakeMaker top_targets section:
+all :: pure_all manifypods
+	$(NOECHO) $(NOOP)
+
+
+pure_all :: config pm_to_blib subdirs linkext
+	$(NOECHO) $(NOOP)
+
+subdirs :: $(MYEXTLIB)
+	$(NOECHO) $(NOOP)
+
+config :: $(FIRST_MAKEFILE) blibdirs
+	$(NOECHO) $(NOOP)
+
+help :
+	perldoc ExtUtils::MakeMaker
+
+
+# --- MakeMaker blibdirs section:
+blibdirs : $(INST_LIBDIR)$(DFSEP).exists $(INST_ARCHLIB)$(DFSEP).exists $(INST_AUTODIR)$(DFSEP).exists $(INST_ARCHAUTODIR)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists $(INST_SCRIPT)$(DFSEP).exists $(INST_MAN1DIR)$(DFSEP).exists $(INST_MAN3DIR)$(DFSEP).exists
+	$(NOECHO) $(NOOP)
+
+# Backwards compat with 6.18 through 6.25
+blibdirs.ts : blibdirs
+	$(NOECHO) $(NOOP)
+
+$(INST_LIBDIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_LIBDIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_LIBDIR)
+	$(NOECHO) $(TOUCH) $(INST_LIBDIR)$(DFSEP).exists
+
+$(INST_ARCHLIB)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_ARCHLIB)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHLIB)
+	$(NOECHO) $(TOUCH) $(INST_ARCHLIB)$(DFSEP).exists
+
+$(INST_AUTODIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_AUTODIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_AUTODIR)
+	$(NOECHO) $(TOUCH) $(INST_AUTODIR)$(DFSEP).exists
+
+$(INST_ARCHAUTODIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_ARCHAUTODIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_ARCHAUTODIR)
+	$(NOECHO) $(TOUCH) $(INST_ARCHAUTODIR)$(DFSEP).exists
+
+$(INST_BIN)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_BIN)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_BIN)
+	$(NOECHO) $(TOUCH) $(INST_BIN)$(DFSEP).exists
+
+$(INST_SCRIPT)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_SCRIPT)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_SCRIPT)
+	$(NOECHO) $(TOUCH) $(INST_SCRIPT)$(DFSEP).exists
+
+$(INST_MAN1DIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_MAN1DIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN1DIR)
+	$(NOECHO) $(TOUCH) $(INST_MAN1DIR)$(DFSEP).exists
+
+$(INST_MAN3DIR)$(DFSEP).exists :: Makefile.PL
+	$(NOECHO) $(MKPATH) $(INST_MAN3DIR)
+	$(NOECHO) $(CHMOD) $(PERM_DIR) $(INST_MAN3DIR)
+	$(NOECHO) $(TOUCH) $(INST_MAN3DIR)$(DFSEP).exists
+
+
+
+# --- MakeMaker linkext section:
+
+linkext :: $(LINKTYPE)
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker dlsyms section:
+
+
+# --- MakeMaker dynamic_bs section:
+
+BOOTSTRAP =
+
+
+# --- MakeMaker dynamic section:
+
+dynamic :: $(FIRST_MAKEFILE) $(BOOTSTRAP) $(INST_DYNAMIC)
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker dynamic_lib section:
+
+
+# --- MakeMaker static section:
+
+## $(INST_PM) has been moved to the all: target.
+## It remains here for awhile to allow for old usage: "make static"
+static :: $(FIRST_MAKEFILE) $(INST_STATIC)
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker static_lib section:
+
+
+# --- MakeMaker manifypods section:
+
+POD2MAN_EXE = $(PERLRUN) "-MExtUtils::Command::MM" -e pod2man "--"
+POD2MAN = $(POD2MAN_EXE)
+
+
+manifypods : pure_all  \
+	lib/PascalAs/PascalAs.pm
+	$(NOECHO) $(POD2MAN) --section=$(MAN3EXT) --perm_rw=$(PERM_RW) -u \
+	  lib/PascalAs/PascalAs.pm $(INST_MAN3DIR)/PascalAs::PascalAs.$(MAN3EXT) 
+
+
+
+
+# --- MakeMaker processPL section:
+
+
+# --- MakeMaker installbin section:
+
+EXE_FILES = bin/pascalas.pl
+
+pure_all :: $(INST_SCRIPT)/pascalas.pl
+	$(NOECHO) $(NOOP)
+
+realclean ::
+	$(RM_F) \
+	  $(INST_SCRIPT)/pascalas.pl 
+
+$(INST_SCRIPT)/pascalas.pl : bin/pascalas.pl $(FIRST_MAKEFILE) $(INST_SCRIPT)$(DFSEP).exists $(INST_BIN)$(DFSEP).exists
+	$(NOECHO) $(RM_F) $(INST_SCRIPT)/pascalas.pl
+	$(CP) bin/pascalas.pl $(INST_SCRIPT)/pascalas.pl
+	$(FIXIN) $(INST_SCRIPT)/pascalas.pl
+	-$(NOECHO) $(CHMOD) $(PERM_RWX) $(INST_SCRIPT)/pascalas.pl
+
+
+
+# --- MakeMaker subdirs section:
+
+# none
+
+# --- MakeMaker clean_subdirs section:
+clean_subdirs :
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker clean section:
+
+# Delete temporary files but do not touch installed files. We don't delete
+# the Makefile here so a later make realclean still has a makefile to use.
+
+clean :: clean_subdirs
+	- $(RM_F) \
+	  $(BASEEXT).bso $(BASEEXT).def \
+	  $(BASEEXT).exp $(BASEEXT).x \
+	  $(BOOTSTRAP) $(INST_ARCHAUTODIR)/extralibs.all \
+	  $(INST_ARCHAUTODIR)/extralibs.ld $(MAKE_APERL_FILE) \
+	  *$(LIB_EXT) *$(OBJ_EXT) \
+	  *perl.core MYMETA.json \
+	  MYMETA.yml blibdirs.ts \
+	  core core.*perl.*.? \
+	  core.[0-9] core.[0-9][0-9] \
+	  core.[0-9][0-9][0-9] core.[0-9][0-9][0-9][0-9] \
+	  core.[0-9][0-9][0-9][0-9][0-9] lib$(BASEEXT).def \
+	  mon.out perl \
+	  perl$(EXE_EXT) perl.exe \
+	  perlmain.c pm_to_blib \
+	  pm_to_blib.ts so_locations \
+	  tmon.out 
+	- $(RM_RF) \
+	  blib 
+	  $(NOECHO) $(RM_F) $(MAKEFILE_OLD)
+	- $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) $(DEV_NULL)
+
+
+# --- MakeMaker realclean_subdirs section:
+realclean_subdirs :
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker realclean section:
+# Delete temporary files (via clean) and also delete dist files
+realclean purge ::  clean realclean_subdirs
+	- $(RM_F) \
+	  $(FIRST_MAKEFILE) $(MAKEFILE_OLD) 
+	- $(RM_RF) \
+	  $(DISTVNAME) 
+
+
+# --- MakeMaker metafile section:
+metafile : create_distdir
+	$(NOECHO) $(ECHO) Generating META.yml
+	$(NOECHO) $(ECHO) '---' > META_new.yml
+	$(NOECHO) $(ECHO) 'abstract: '\''Assembler for NVIDIA Maxwell architecture'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'author:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  - '\''Scott Gray <sgray@nervanasys.com>'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'build_requires:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  ExtUtils::MakeMaker: '\''0'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'configure_requires:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  ExtUtils::MakeMaker: '\''0'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'dynamic_config: 1' >> META_new.yml
+	$(NOECHO) $(ECHO) 'generated_by: '\''ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'license: mit' >> META_new.yml
+	$(NOECHO) $(ECHO) 'meta-spec:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  url: http://module-build.sourceforge.net/META-spec-v1.4.html' >> META_new.yml
+	$(NOECHO) $(ECHO) '  version: '\''1.4'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'name: PascalAs-PascalAs' >> META_new.yml
+	$(NOECHO) $(ECHO) 'no_index:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  directory:' >> META_new.yml
+	$(NOECHO) $(ECHO) '    - t' >> META_new.yml
+	$(NOECHO) $(ECHO) '    - inc' >> META_new.yml
+	$(NOECHO) $(ECHO) 'requires:' >> META_new.yml
+	$(NOECHO) $(ECHO) '  Carp: '\''1.29'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) '  Data::Dumper: '\''2.145'\''' >> META_new.yml
+	$(NOECHO) $(ECHO) 'version: '\''1.06'\''' >> META_new.yml
+	-$(NOECHO) $(MV) META_new.yml $(DISTVNAME)/META.yml
+	$(NOECHO) $(ECHO) Generating META.json
+	$(NOECHO) $(ECHO) '{' > META_new.json
+	$(NOECHO) $(ECHO) '   "abstract" : "Assembler for NVIDIA Maxwell architecture",' >> META_new.json
+	$(NOECHO) $(ECHO) '   "author" : [' >> META_new.json
+	$(NOECHO) $(ECHO) '      "Scott Gray <sgray@nervanasys.com>"' >> META_new.json
+	$(NOECHO) $(ECHO) '   ],' >> META_new.json
+	$(NOECHO) $(ECHO) '   "dynamic_config" : 1,' >> META_new.json
+	$(NOECHO) $(ECHO) '   "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001",' >> META_new.json
+	$(NOECHO) $(ECHO) '   "license" : [' >> META_new.json
+	$(NOECHO) $(ECHO) '      "mit"' >> META_new.json
+	$(NOECHO) $(ECHO) '   ],' >> META_new.json
+	$(NOECHO) $(ECHO) '   "meta-spec" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '      "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",' >> META_new.json
+	$(NOECHO) $(ECHO) '      "version" : "2"' >> META_new.json
+	$(NOECHO) $(ECHO) '   },' >> META_new.json
+	$(NOECHO) $(ECHO) '   "name" : "PascalAs-PascalAs",' >> META_new.json
+	$(NOECHO) $(ECHO) '   "no_index" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '      "directory" : [' >> META_new.json
+	$(NOECHO) $(ECHO) '         "t",' >> META_new.json
+	$(NOECHO) $(ECHO) '         "inc"' >> META_new.json
+	$(NOECHO) $(ECHO) '      ]' >> META_new.json
+	$(NOECHO) $(ECHO) '   },' >> META_new.json
+	$(NOECHO) $(ECHO) '   "prereqs" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '      "build" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '         "requires" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '            "ExtUtils::MakeMaker" : "0"' >> META_new.json
+	$(NOECHO) $(ECHO) '         }' >> META_new.json
+	$(NOECHO) $(ECHO) '      },' >> META_new.json
+	$(NOECHO) $(ECHO) '      "configure" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '         "requires" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '            "ExtUtils::MakeMaker" : "0"' >> META_new.json
+	$(NOECHO) $(ECHO) '         }' >> META_new.json
+	$(NOECHO) $(ECHO) '      },' >> META_new.json
+	$(NOECHO) $(ECHO) '      "runtime" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '         "requires" : {' >> META_new.json
+	$(NOECHO) $(ECHO) '            "Carp" : "1.29",' >> META_new.json
+	$(NOECHO) $(ECHO) '            "Data::Dumper" : "2.145"' >> META_new.json
+	$(NOECHO) $(ECHO) '         }' >> META_new.json
+	$(NOECHO) $(ECHO) '      }' >> META_new.json
+	$(NOECHO) $(ECHO) '   },' >> META_new.json
+	$(NOECHO) $(ECHO) '   "release_status" : "stable",' >> META_new.json
+	$(NOECHO) $(ECHO) '   "version" : "1.06"' >> META_new.json
+	$(NOECHO) $(ECHO) '}' >> META_new.json
+	-$(NOECHO) $(MV) META_new.json $(DISTVNAME)/META.json
+
+
+# --- MakeMaker signature section:
+signature :
+	cpansign -s
+
+
+# --- MakeMaker dist_basics section:
+distclean :: realclean distcheck
+	$(NOECHO) $(NOOP)
+
+distcheck :
+	$(PERLRUN) "-MExtUtils::Manifest=fullcheck" -e fullcheck
+
+skipcheck :
+	$(PERLRUN) "-MExtUtils::Manifest=skipcheck" -e skipcheck
+
+manifest :
+	$(PERLRUN) "-MExtUtils::Manifest=mkmanifest" -e mkmanifest
+
+veryclean : realclean
+	$(RM_F) *~ */*~ *.orig */*.orig *.bak */*.bak *.old */*.old
+
+
+
+# --- MakeMaker dist_core section:
+
+dist : $(DIST_DEFAULT) $(FIRST_MAKEFILE)
+	$(NOECHO) $(ABSPERLRUN) -l -e 'print '\''Warning: Makefile possibly out of date with $(VERSION_FROM)'\''' \
+	  -e '    if -e '\''$(VERSION_FROM)'\'' and -M '\''$(VERSION_FROM)'\'' < -M '\''$(FIRST_MAKEFILE)'\'';' --
+
+tardist : $(DISTVNAME).tar$(SUFFIX)
+	$(NOECHO) $(NOOP)
+
+uutardist : $(DISTVNAME).tar$(SUFFIX)
+	uuencode $(DISTVNAME).tar$(SUFFIX) $(DISTVNAME).tar$(SUFFIX) > $(DISTVNAME).tar$(SUFFIX)_uu
+	$(NOECHO) $(ECHO) 'Created $(DISTVNAME).tar$(SUFFIX)_uu'
+
+$(DISTVNAME).tar$(SUFFIX) : distdir
+	$(PREOP)
+	$(TO_UNIX)
+	$(TAR) $(TARFLAGS) $(DISTVNAME).tar $(DISTVNAME)
+	$(RM_RF) $(DISTVNAME)
+	$(COMPRESS) $(DISTVNAME).tar
+	$(NOECHO) $(ECHO) 'Created $(DISTVNAME).tar$(SUFFIX)'
+	$(POSTOP)
+
+zipdist : $(DISTVNAME).zip
+	$(NOECHO) $(NOOP)
+
+$(DISTVNAME).zip : distdir
+	$(PREOP)
+	$(ZIP) $(ZIPFLAGS) $(DISTVNAME).zip $(DISTVNAME)
+	$(RM_RF) $(DISTVNAME)
+	$(NOECHO) $(ECHO) 'Created $(DISTVNAME).zip'
+	$(POSTOP)
+
+shdist : distdir
+	$(PREOP)
+	$(SHAR) $(DISTVNAME) > $(DISTVNAME).shar
+	$(RM_RF) $(DISTVNAME)
+	$(NOECHO) $(ECHO) 'Created $(DISTVNAME).shar'
+	$(POSTOP)
+
+
+# --- MakeMaker distdir section:
+create_distdir :
+	$(RM_RF) $(DISTVNAME)
+	$(PERLRUN) "-MExtUtils::Manifest=manicopy,maniread" \
+		-e "manicopy(maniread(),'$(DISTVNAME)', '$(DIST_CP)');"
+
+distdir : create_distdir distmeta 
+	$(NOECHO) $(NOOP)
+
+
+
+# --- MakeMaker dist_test section:
+disttest : distdir
+	cd $(DISTVNAME) && $(ABSPERLRUN) Makefile.PL 
+	cd $(DISTVNAME) && $(MAKE) $(PASTHRU)
+	cd $(DISTVNAME) && $(MAKE) test $(PASTHRU)
+
+
+
+# --- MakeMaker dist_ci section:
+
+ci :
+	$(PERLRUN) "-MExtUtils::Manifest=maniread" \
+	  -e "@all = keys %{ maniread() };" \
+	  -e "print(qq{Executing $(CI) @all\n}); system(qq{$(CI) @all});" \
+	  -e "print(qq{Executing $(RCS_LABEL) ...\n}); system(qq{$(RCS_LABEL) @all});"
+
+
+# --- MakeMaker distmeta section:
+distmeta : create_distdir metafile
+	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'exit unless -e q{META.yml};' \
+	  -e 'eval { maniadd({q{META.yml} => q{Module YAML meta-data (added by MakeMaker)}}) }' \
+	  -e '    or print "Could not add META.yml to MANIFEST: $$$${'\''@'\''}\n"' --
+	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'exit unless -f q{META.json};' \
+	  -e 'eval { maniadd({q{META.json} => q{Module JSON meta-data (added by MakeMaker)}}) }' \
+	  -e '    or print "Could not add META.json to MANIFEST: $$$${'\''@'\''}\n"' --
+
+
+
+# --- MakeMaker distsignature section:
+distsignature : create_distdir
+	$(NOECHO) cd $(DISTVNAME) && $(ABSPERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{SIGNATURE} => q{Public-key signature (added by MakeMaker)}}) }' \
+	  -e '    or print "Could not add SIGNATURE to MANIFEST: $$$${'\''@'\''}\n"' --
+	$(NOECHO) cd $(DISTVNAME) && $(TOUCH) SIGNATURE
+	cd $(DISTVNAME) && cpansign -s
+
+
+
+# --- MakeMaker install section:
+
+install :: pure_install doc_install
+	$(NOECHO) $(NOOP)
+
+install_perl :: pure_perl_install doc_perl_install
+	$(NOECHO) $(NOOP)
+
+install_site :: pure_site_install doc_site_install
+	$(NOECHO) $(NOOP)
+
+install_vendor :: pure_vendor_install doc_vendor_install
+	$(NOECHO) $(NOOP)
+
+pure_install :: pure_$(INSTALLDIRS)_install
+	$(NOECHO) $(NOOP)
+
+doc_install :: doc_$(INSTALLDIRS)_install
+	$(NOECHO) $(NOOP)
+
+pure__install : pure_site_install
+	$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site
+
+doc__install : doc_site_install
+	$(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site
+
+pure_perl_install :: all
+	$(NOECHO) umask 022; $(MOD_INSTALL) \
+		"$(INST_LIB)" "$(DESTINSTALLPRIVLIB)" \
+		"$(INST_ARCHLIB)" "$(DESTINSTALLARCHLIB)" \
+		"$(INST_BIN)" "$(DESTINSTALLBIN)" \
+		"$(INST_SCRIPT)" "$(DESTINSTALLSCRIPT)" \
+		"$(INST_MAN1DIR)" "$(DESTINSTALLMAN1DIR)" \
+		"$(INST_MAN3DIR)" "$(DESTINSTALLMAN3DIR)"
+	$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
+		"$(SITEARCHEXP)/auto/$(FULLEXT)"
+
+
+pure_site_install :: all
+	$(NOECHO) umask 02; $(MOD_INSTALL) \
+		read "$(SITEARCHEXP)/auto/$(FULLEXT)/.packlist" \
+		write "$(DESTINSTALLSITEARCH)/auto/$(FULLEXT)/.packlist" \
+		"$(INST_LIB)" "$(DESTINSTALLSITELIB)" \
+		"$(INST_ARCHLIB)" "$(DESTINSTALLSITEARCH)" \
+		"$(INST_BIN)" "$(DESTINSTALLSITEBIN)" \
+		"$(INST_SCRIPT)" "$(DESTINSTALLSITESCRIPT)" \
+		"$(INST_MAN1DIR)" "$(DESTINSTALLSITEMAN1DIR)" \
+		"$(INST_MAN3DIR)" "$(DESTINSTALLSITEMAN3DIR)"
+	$(NOECHO) $(WARN_IF_OLD_PACKLIST) \
+		"$(PERL_ARCHLIB)/auto/$(FULLEXT)"
+
+pure_vendor_install :: all
+	$(NOECHO) umask 022; $(MOD_INSTALL) \
+		"$(INST_LIB)" "$(DESTINSTALLVENDORLIB)" \
+		"$(INST_ARCHLIB)" "$(DESTINSTALLVENDORARCH)" \
+		"$(INST_BIN)" "$(DESTINSTALLVENDORBIN)" \
+		"$(INST_SCRIPT)" "$(DESTINSTALLVENDORSCRIPT)" \
+		"$(INST_MAN1DIR)" "$(DESTINSTALLVENDORMAN1DIR)" \
+		"$(INST_MAN3DIR)" "$(DESTINSTALLVENDORMAN3DIR)"
+
+
+doc_perl_install :: all
+
+doc_site_install :: all
+	$(NOECHO) $(ECHO) Appending installation info to "$(DESTINSTALLSITEARCH)/perllocal.pod"
+	-$(NOECHO) umask 02; $(MKPATH) "$(DESTINSTALLSITEARCH)"
+	-$(NOECHO) umask 02; $(DOC_INSTALL) \
+		"Module" "$(NAME)" \
+		"installed into" $(INSTALLSITELIB) \
+		LINKTYPE "$(LINKTYPE)" \
+		VERSION "$(VERSION)" \
+		EXE_FILES "$(EXE_FILES)" \
+		>> "$(DESTINSTALLSITEARCH)/perllocal.pod"
+
+doc_vendor_install :: all
+
+
+uninstall :: uninstall_from_$(INSTALLDIRS)dirs
+	$(NOECHO) $(NOOP)
+
+uninstall_from_perldirs ::
+
+uninstall_from_sitedirs ::
+	$(NOECHO) $(UNINSTALL) "$(SITEARCHEXP)/auto/$(FULLEXT)/.packlist"
+
+uninstall_from_vendordirs ::
+
+
+# --- MakeMaker force section:
+# Phony target to force checking subdirectories.
+FORCE :
+	$(NOECHO) $(NOOP)
+
+
+# --- MakeMaker perldepend section:
+
+
+# --- MakeMaker makefile section:
+# We take a very conservative approach here, but it's worth it.
+# We move Makefile to Makefile.old here to avoid gnu make looping.
+$(FIRST_MAKEFILE) : Makefile.PL $(CONFIGDEP)
+	$(NOECHO) $(ECHO) "Makefile out-of-date with respect to $?"
+	$(NOECHO) $(ECHO) "Cleaning current config before rebuilding Makefile..."
+	-$(NOECHO) $(RM_F) $(MAKEFILE_OLD)
+	-$(NOECHO) $(MV)   $(FIRST_MAKEFILE) $(MAKEFILE_OLD)
+	- $(MAKE) $(USEMAKEFILE) $(MAKEFILE_OLD) clean $(DEV_NULL)
+	$(PERLRUN) Makefile.PL 
+	$(NOECHO) $(ECHO) "==> Your Makefile has been rebuilt. <=="
+	$(NOECHO) $(ECHO) "==> Please rerun the $(MAKE) command.  <=="
+	$(FALSE)
+
+
+
+# --- MakeMaker staticmake section:
+
+# --- MakeMaker makeaperl section ---
+MAP_TARGET    = perl
+FULLPERL      = "/usr/bin/perl"
+
+$(MAP_TARGET) :: static $(MAKE_APERL_FILE)
+	$(MAKE) $(USEMAKEFILE) $(MAKE_APERL_FILE) $@
+
+$(MAKE_APERL_FILE) : $(FIRST_MAKEFILE) pm_to_blib
+	$(NOECHO) $(ECHO) Writing \"$(MAKE_APERL_FILE)\" for this $(MAP_TARGET)
+	$(NOECHO) $(PERLRUNINST) \
+		Makefile.PL DIR="" \
+		MAKEFILE=$(MAKE_APERL_FILE) LINKTYPE=static \
+		MAKEAPERL=1 NORECURS=1 CCCDLFLAGS=
+
+
+# --- MakeMaker test section:
+
+TEST_VERBOSE=0
+TEST_TYPE=test_$(LINKTYPE)
+TEST_FILE = test.pl
+TEST_FILES = t/*.t
+TESTDB_SW = -d
+
+testdb :: testdb_$(LINKTYPE)
+
+test :: $(TEST_TYPE) subdirs-test
+
+subdirs-test ::
+	$(NOECHO) $(NOOP)
+
+
+test_dynamic :: pure_all
+	PERL_DL_NONLAZY=1 $(FULLPERLRUN) "-MExtUtils::Command::MM" "-MTest::Harness" "-e" "undef *Test::Harness::Switches; test_harness($(TEST_VERBOSE), '$(INST_LIB)', '$(INST_ARCHLIB)')" $(TEST_FILES)
+
+testdb_dynamic :: pure_all
+	PERL_DL_NONLAZY=1 $(FULLPERLRUN) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE)
+
+test_ : test_dynamic
+
+test_static :: test_dynamic
+testdb_static :: testdb_dynamic
+
+
+# --- MakeMaker ppd section:
+# Creates a PPD (Perl Package Description) for a binary distribution.
+ppd :
+	$(NOECHO) $(ECHO) '<SOFTPKG NAME="$(DISTNAME)" VERSION="$(VERSION)">' > $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    <ABSTRACT>Assembler for NVIDIA Maxwell architecture</ABSTRACT>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    <AUTHOR>Scott Gray &lt;sgray@nervanasys.com&gt;</AUTHOR>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    <IMPLEMENTATION>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <REQUIRE NAME="Carp::" VERSION="1.29" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <REQUIRE NAME="Data::Dumper" VERSION="2.145" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <ARCHITECTURE NAME="x86_64-linux-gnu-thread-multi-5.22" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '        <CODEBASE HREF="" />' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '    </IMPLEMENTATION>' >> $(DISTNAME).ppd
+	$(NOECHO) $(ECHO) '</SOFTPKG>' >> $(DISTNAME).ppd
+
+
+# --- MakeMaker pm_to_blib section:
+
+pm_to_blib : $(FIRST_MAKEFILE) $(TO_INST_PM)
+	$(NOECHO) $(ABSPERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', q[$(PM_FILTER)], '\''$(PERM_DIR)'\'')' -- \
+	  lib/PascalAs/Cubin.pm blib/lib/PascalAs/Cubin.pm \
+	  lib/PascalAs/PascalAs.pm blib/lib/PascalAs/PascalAs.pm \
+	  lib/PascalAs/PascalAsGrammar.pm blib/lib/PascalAs/PascalAsGrammar.pm 
+	$(NOECHO) $(TOUCH) pm_to_blib
+
+
+# --- MakeMaker selfdocument section:
+
+
+# --- MakeMaker postamble section:
+
+
+# End.
diff --git a/Assembler/PascalAs/Makefile.PL b/Assembler/PascalAs/Makefile.PL
new file mode 100644
index 0000000..6acdeda
--- /dev/null
+++ b/Assembler/PascalAs/Makefile.PL
@@ -0,0 +1,14 @@
+require 5.10.0;
+use ExtUtils::MakeMaker;
+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
+# the contents of the Makefile that is written.
+WriteMakefile(
+    NAME              => 'PascalAs::PascalAs',
+    VERSION_FROM      => 'lib/PascalAs/PascalAs.pm', # finds $VERSION
+    EXE_FILES         => ['bin/pascalas.pl'],
+    PREREQ_PM         => {'Carp' => 1.29, 'Data::Dumper' => 2.145}, # quoted: "=>" does not auto-quote names containing "::"
+    LICENSE           => 'MIT',
+    # The old "$] >= 5.005" guard around the next two keys was always true after "require 5.10.0" above.
+    ABSTRACT_FROM     => 'lib/PascalAs/PascalAs.pm', # retrieve abstract from module
+    AUTHOR            => 'Scott Gray <sgray@nervanasys.com>',
+);
diff --git a/Assembler/PascalAs/README.md b/Assembler/PascalAs/README.md
new file mode 100644
index 0000000..318aba8
--- /dev/null
+++ b/Assembler/PascalAs/README.md
@@ -0,0 +1,28 @@
+# MaxAs
+Assembler for NVIDIA Maxwell architecture
+
+To install (system-wide):
+
+    sudo cpanm git://github.com/NervanaSystems/maxas.git
+
+or
+
+    perl Makefile.PL
+    make
+    sudo make install
+
+
+See wiki pages for more information:
+
+- [Introduction](https://github.com/NervanaSystems/maxas/wiki/Introduction)
+- [Getting Started](https://github.com/NervanaSystems/maxas/wiki/Getting-Started)
+- [Control Codes](https://github.com/NervanaSystems/maxas/wiki/Control-Codes)
+- [SGEMM walkthrough](https://github.com/NervanaSystems/maxas/wiki/SGEMM)
+
+Related work with lots of additional shader assembly (sass) examples:
+
+- [NervanaGPU](https://github.com/NervanaSystems/nervanagpu)
+
+This project is released under the [MIT License](http://opensource.org/licenses/MIT).
+
+-- Scott Gray
diff --git a/Assembler/PascalAs/bin/pascalas.pl b/Assembler/PascalAs/bin/pascalas.pl
new file mode 100755
index 0000000..a0f1372
--- /dev/null
+++ b/Assembler/PascalAs/bin/pascalas.pl
@@ -0,0 +1,286 @@
+#!/usr/bin/perl
+use strict;
+use PascalAs::Cubin;
+use PascalAs::PascalAs;
+use Data::Dumper;
+use File::Spec;
+
+require 5.10.0;
+
+$Data::Dumper::Sortkeys = 1;
+
+my $mode = shift;
+
+# List mode: print a one-line summary of the cubin, then every kernel and every symbol in name order
+if ($mode =~ /^\-?\-l/i)
+{
+    my $path = shift or usage();
+
+    my $elf = PascalAs::Cubin->new($path);
+
+    my $smArch    = $elf->arch;
+    my $elfBits   = $elf->class;
+    my $addrBits  = $elf->address_size;
+    my $kernelMap = $elf->listKernels;
+    my $symbolMap = $elf->listSymbols;
+
+    printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $path, $smArch, $elfBits, $addrBits;
+
+    # Per-kernel fields, in the order the format string below consumes them.
+    my @fields = qw(Linkage ParamCnt size RegCnt SharedSize BarCnt);
+
+    printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $_, @{$kernelMap->{$_}}{@fields}
+        for sort keys %$kernelMap;
+
+    # Symbols are listed by name only.
+    printf "Symbol: %s\n", $_ for sort keys %$symbolMap;
+}
+# Test mode: check that the assembler can reproduce the op codes this cubin or sass file contains
+elsif ($mode =~ /^\-?\-t/i)
+{
+    my $reg  = (@ARGV && $ARGV[0] =~ /^\-?\-r/i) ? shift : undef;    # optional flags, accepted in this order only;
+    my $all  = (@ARGV && $ARGV[0] =~ /^\-?\-a/i) ? shift : undef;    # "my $x = shift if COND" is undefined behaviour
+    my $file = shift or usage();
+    my $fh;
+    # text file: read it directly as a sass dump (3-arg open, so the name is never parsed as a mode or pipe)
+    if (-T $file)
+    {
+        open $fh, '<', $file or die "$file: $!";
+    }
+    # anything else: load it as a cubin and disassemble it with cuobjdump
+    else
+    {
+        my $cubin = PascalAs::Cubin->new($file);
+        my $arch  = $cubin->arch;
+
+        open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!";
+        my $first = <$fh>;    # consumes the first output line; it is only inspected for a fatal error
+        if ($first =~ /cuobjdump fatal/)
+        {
+            print $first;
+            exit(1);
+        }
+    }
+    exit(PascalAs::PascalAs::Test($fh, $reg, $all) ? 1 : 0);    # exit status is 1 when Test() returns true
+}
+# Extract mode: write one kernel of a cubin as an asm file (header comments, then the cuobjdump sass passed through Extract)
+elsif ($mode =~ /^\-?\-e/i)
+{
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    my $cubinFile = shift or usage();
+    my $asmFile   = shift;
+    my $cubin     = PascalAs::Cubin->new($cubinFile);
+    my $arch      = $cubin->arch;
+    my $kernels   = $cubin->listKernels;
+
+    # default to the first kernel in sorted name order if none was specified.
+    $kernelName ||= (sort keys %$kernels)[0];
+
+    my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName";
+    my $dumpCmd = "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile";
+    open my $in, "$dumpCmd |" or die "$dumpCmd: $!";    # the message now names the command actually run (it used to say sm_60)
+    my $first = <$in>;
+    if ($first =~ /cuobjdump fatal/)
+    {
+        print $first;
+        exit(1);
+    }
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";    # 3-arg open: the name is never parsed as a mode
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+
+    print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n";
+
+    print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt));
+
+    print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n";
+
+    print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params};
+
+    print $out "#\n# Instructions:\n\n";
+
+    PascalAs::PascalAs::Extract($in, $out, $kernel->{Params});
+
+    close $out if $asmFile;
+    close $in;
+}
+# Sass mode: convert a sass dump (a text file, not a cubin) into an asm file via Extract
+elsif ($mode =~ /^\-?\-s/i)
+{
+    my $sassFile  = shift or usage();
+    my $asmFile   = shift;
+
+    open my $in, '<', $sassFile or die "$sassFile: $!";    # 3-arg open: file names are never parsed as modes or pipes
+
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;    # no destination given: write to standard output
+    }
+
+    PascalAs::PascalAs::Extract($in, $out, []);    # called with an empty parameter list
+
+    close $out if $asmFile;
+    close $in;
+}
+# Insert mode: assemble an asm file and write the resulting kernel back into a cubin
+elsif ($mode =~ /^\-?\-i/i)
+{
+    my $nowarn;
+    if ($ARGV[0] =~ /^\-?\-w/i)
+    {
+        $nowarn = shift;
+    }
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    my $noReuse   = (@ARGV && $ARGV[0] =~ /^\-?\-n/i) ? shift : undef;    # not "my ... if COND" (undefined behaviour)
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift(@ARGV) // '';    # assigned as a variable below, so quotes in it cannot break or inject code
+        eval "package PascalAs::PascalAs::CODE; our \$$name = \$value; 1" or die "bad -D$name: $@";
+    }
+
+    my $asmFile   = shift or usage();
+    my $cubinFile = shift or usage();
+    my $newCubin  = shift || $cubinFile;
+
+    my $file;
+    if (open my $fh, '<', $asmFile)
+    {
+        local $/;    # slurp mode, limited to this block
+        $file = <$fh>;
+        close $fh;
+    }
+    else { die "$asmFile: $!" }
+
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    # take the kernel name from a "# Kernel: <name>" line at the very start of the file unless -k supplied one
+    ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName;
+    die "asm file missing kernel name or is badly formatted" unless $kernelName;
+
+    my $kernel = PascalAs::PascalAs::Assemble($file, $include, !$noReuse, $nowarn);
+
+    my $cubin  = PascalAs::Cubin->new($cubinFile);
+    $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName";
+
+    $cubin->modifyKernel(%$kernel);
+
+    $cubin->write($newCubin);
+
+    # "%%" prints a literal percent sign; the kernel name is passed as data instead of being spliced into the format.
+    printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n",
+        $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)};
+}
+# Preprocess mode: run only the preprocessor over an asm file and write the result to a file or to standard output
+elsif ($mode =~ /^\-?\-p/i)
+{
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift(@ARGV) // '';    # assigned as a variable below, so quotes in it cannot break or inject code
+        eval "package PascalAs::PascalAs::CODE; our \$$name = \$value; 1" or die "bad -D$name: $@";
+    }
+    my $debug     = (@ARGV && $ARGV[0] =~ /^\-?\-d/i) ? shift : undef;    # not "my ... if COND" (undefined behaviour)
+    my $asmFile   = shift or usage();
+    my $asmFile2  = shift;
+
+    die "source and destination probably shouldn't be the same file\n" if defined $asmFile2 && $asmFile eq $asmFile2;
+
+    open my $fh, '<', $asmFile or die "$asmFile: $!";
+    # slurp the whole file; "local $/" is confined to the do-block so it is back to normal before Preprocess() runs
+    my $file = do { local $/; <$fh> };
+    close $fh;
+
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    if ($asmFile2)
+    {
+        open $fh, '>', $asmFile2 or die "$asmFile2: $!";
+    }
+    else
+    {
+        $fh = \*STDOUT;
+    }
+    print $fh PascalAs::PascalAs::Preprocess($file, $include, $debug);
+    close $fh;
+}
+# Version mode: print $PascalAs::PascalAs::VERSION followed by a newline
+elsif ($mode =~ /^\-?\-v/i)
+{
+    print "$PascalAs::PascalAs::VERSION\n";
+}
+else    # no mode pattern matched: echo what was given, then show the help (usage() exits with status 1)
+{
+    print "$mode\n";
+    usage();
+}
+
+exit(0);
+
+
+
+sub usage    # print the command-line help to standard output, then exit with status 1 (never returns)
+{
+    print <<EOF;
+Usage:
+
+  List kernels and symbols:
+
+    pascalas.pl --list|-l <cubin_file>
+
+  Test a cubin or sass file to see if the assembler can reproduce all of the contained opcodes.
+  Also useful for extending the missing grammar rules.  Defaults to only showing failures without --all.
+  With the --reg flag it will show register bank conflicts not hidden by reuse flags.
+
+    pascalas.pl --test|-t [--reg|-r] [--all|-a] <cubin_file | cuobjdump_sass_file>
+
+  Extract a single kernel into an asm file from a cubin, or (with --sass) from a cuobjdump sass dump.
+  Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin.
+
+    pascalas.pl --extract|-e [--kernel|-k kernel_name] <cubin_file> [asm_file]
+    pascalas.pl --sass|-s <cuobjdump_sass_file> [asm_file]
+
+  Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes.
+  Include the debug flag to print out detailed scheduler info.
+
+    pascalas.pl --pre|-p [-D<name> <value> ...] [--debug|-d] <asm_file> [new_asm_file]
+
+  Insert the kernel asm back into the cubin.  Overwrite existing or create new cubin.
+  Optionally you can skip register reuse flag auto insertion.  This allows you to observe
+  performance without any reuse or you can use it to set the flags manually in your sass.
+
+    pascalas.pl --insert|-i [--kernel|-k kernel_name] [--noreuse|-n] [-D<name> <value> ...] <asm_file> <cubin_file> [new_cubin_file]
+
+  Display version information and exit:
+
+    pascalas.pl --version|-v
+
+EOF
+    exit(1);
+}
+__END__
diff --git a/Assembler/PascalAs/blib/arch/.exists b/Assembler/PascalAs/blib/arch/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/blib/arch/auto/MaxAs/MaxAs/.exists b/Assembler/PascalAs/blib/arch/auto/MaxAs/MaxAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/blib/arch/auto/PascalAs/PascalAs/.exists b/Assembler/PascalAs/blib/arch/auto/PascalAs/PascalAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/blib/bin/.exists b/Assembler/PascalAs/blib/bin/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/blib/lib/MaxAs/.exists b/Assembler/PascalAs/blib/lib/MaxAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/blib/lib/MaxAs/Cubin.pm b/Assembler/PascalAs/blib/lib/MaxAs/Cubin.pm
new file mode 100644
index 0000000..5900958
--- /dev/null
+++ b/Assembler/PascalAs/blib/lib/MaxAs/Cubin.pm
@@ -0,0 +1,684 @@
+package MaxAs::Cubin;
+
+use strict;
+use Data::Dumper;
+
+my @Elf32_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    L   entry
+    L   phOffset
+    L   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+my @Elf64_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    Q   entry
+    Q   phOffset
+    Q   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+my @Elf32_PrgHdr = qw(
+    L   type
+    L   offset
+    L   vaddr
+    L   paddr
+    L   fileSize
+    L   memSize
+    L   flags
+    L   align
+);
+my @Elf64_PrgHdr = qw(
+    L   type
+    L   flags
+    Q   offset
+    Q   vaddr
+    Q   paddr
+    Q   fileSize
+    Q   memSize
+    Q   align
+);
+my @Elf32_SecHdr = qw(
+    L   name
+    L   type
+    L   flags
+    L   addr
+    L   offset
+    L   size
+    L   link
+    L   info
+    L   align
+    L   entSize
+);
+my @Elf64_SecHdr = qw(
+    L   name
+    L   type
+    Q   flags
+    Q   addr
+    Q   offset
+    Q   size
+    L   link
+    L   info
+    Q   align
+    Q   entSize
+);
+my @Elf32_SymEnt = qw(
+    L   name
+    L   value
+    L   size
+    C   info
+    C   other
+    S   shIndx
+);
+my @Elf64_SymEnt = qw(
+    L   name
+    C   info
+    C   other
+    S   shIndx
+    Q   value
+    Q   size
+);
+my @symBind = qw(LOCAL GLOBAL WEAK);
+
+# Split the Elf Header defs into template strings (T) and corresponding hash keys columns (C): entries of at most 3 characters are the pack/unpack codes, longer entries are the field names.
+my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC);
+
+$elfHdrT[1] = join '', grep { length($_) <= 3} @Elf32_Hdr;    # index 1: 32-bit layouts (index = ELF fileClass value)
+$prgHdrT[1] = join '', grep { length($_) <= 3} @Elf32_PrgHdr;
+$secHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SecHdr;
+$symHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SymEnt;
+
+$elfHdrT[2] = join '', grep { length($_) <= 3} @Elf64_Hdr;    # index 2: 64-bit layouts
+$prgHdrT[2] = join '', grep { length($_) <= 3} @Elf64_PrgHdr;
+$secHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SecHdr;
+$symHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SymEnt;
+
+$elfHdrC[1] = [ grep { length($_) > 3} @Elf32_Hdr    ];    # field names, in the same order as the template codes
+$prgHdrC[1] = [ grep { length($_) > 3} @Elf32_PrgHdr ];
+$secHdrC[1] = [ grep { length($_) > 3} @Elf32_SecHdr ];
+$symHdrC[1] = [ grep { length($_) > 3} @Elf32_SymEnt ];
+
+$elfHdrC[2] = [ grep { length($_) > 3} @Elf64_Hdr    ];
+$prgHdrC[2] = [ grep { length($_) > 3} @Elf64_PrgHdr ];
+$secHdrC[2] = [ grep { length($_) > 3} @Elf64_SecHdr ];
+$symHdrC[2] = [ grep { length($_) > 3} @Elf64_SymEnt ];
+
+# Load a cubin ELF file: parse its headers, sections and symbols and collect per-kernel metadata. Takes ($package, $file); dies if the file cannot be opened or the arch is below sm_50.
+sub new
+{
+    my ($package, $file) = @_;
+
+    my $cubin = bless { fileName => $file }, $package;
+
+    open my $fh, '<', $file or die "$file: $!";    # 3-arg open: the path is never parsed as a mode or pipe
+    binmode($fh);
+
+    # Read in assuming 32 bit header
+    my $data;
+    read $fh, $data, 0x34;
+    my $elfHdr = $cubin->{elfHdr} = {};
+    @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data;
+
+    # 1: 32bit, 2: 64bit
+    my $class = $elfHdr->{fileClass};
+
+    # re-read in with 64 bit header if needed
+    if ($class == 2)
+    {
+        seek $fh, 0, 0;
+        read $fh, $data, 0x40;    # the 64-bit header template covers exactly 0x40 bytes (was 0x46; the extra bytes were never unpacked)
+        @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data;
+
+        $cubin->{Class} = 64;
+    }
+    else
+    {
+        $cubin->{Class} = 32;
+    }
+
+    # verify sm_50 cubin
+    $cubin->{Arch} = $elfHdr->{flags} & 0xFF;
+    die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50;
+
+    $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32;
+
+    # Read in Program Headers
+    seek $fh, $elfHdr->{phOffset}, 0;
+    foreach (1 .. $elfHdr->{phNum})
+    {
+        read $fh, $data, $elfHdr->{phEntSize};
+
+        my %prgHdr = (Indx => $_ - 1);
+        @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data;
+        push @{$cubin->{prgHdrs}}, \%prgHdr;
+    }
+
+    # Read in Section Headers
+    seek $fh, $elfHdr->{shOffset}, 0;
+    foreach (1 .. $elfHdr->{shNum})
+    {
+        read $fh, $data, $elfHdr->{shEntSize};
+
+        my %secHdr = (Indx => $_ - 1);
+        @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data;
+        push @{$cubin->{secHdrs}}, \%secHdr;
+    }
+
+    # Read in Section data
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $data = '';
+        # Skip sections with no data (type NULL or NOBITS)
+        if ($secHdr->{size} && $secHdr->{type} != 8)
+        {
+            seek $fh, $secHdr->{offset}, 0;
+            read $fh, $data, $secHdr->{size};
+        }
+        # Convert string tables to maps
+        if ($secHdr->{type} == 3) # STRTAB
+        {
+            my $strTab = $secHdr->{StrTab} = {};
+            my $indx   = 0;
+            foreach my $str (split "\0", $data)
+            {
+                $strTab->{$indx} = $str;    # keyed by the byte offset of the string within the table
+                $indx += 1 + length($str);
+            }
+        }
+        # Read in Symbol data
+        if ($secHdr->{type} == 2) # SYMTAB
+        {
+            my $offset = 0;
+            while ($offset < $secHdr->{size})
+            {
+                my $symEnt = {};
+                @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize});
+                $offset += $secHdr->{entSize};
+
+                push @{$secHdr->{SymTab}}, $symEnt;
+            }
+        }
+        # Cache raw data for further processing and writing
+        $secHdr->{Data} = unpack 'H*', $data;
+    }
+    close $fh;
+
+    # Update section headers with their names.  Map names directly to headers.
+    my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab};
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $secHdr->{Name} = $shStrTab->{$secHdr->{name}};
+        $cubin->{$secHdr->{Name}} = $secHdr;
+    }
+
+    # Update symbols with their names
+    # For the Global functions, extract kernel meta data
+    # Populate the kernel hash
+    my $strTab = $cubin->{'.strtab'}{StrTab};
+    foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}})
+    {
+        $symEnt->{Name} = $strTab->{$symEnt->{name}};
+
+        # Attach symbol to section
+        my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}];
+        $secHdr->{SymbolEnt} = $symEnt;
+
+        # Look for symbols tagged FUNC
+        if (($symEnt->{info} & 0x0f) == 0x02)
+        {
+            # Create a hash of kernels for output
+            my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr;
+
+            # Extract local/global/weak binding info
+            $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4];
+
+            # Extract the kernel instructions
+            $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ];
+
+            # Extract the max barrier resource identifier used and add 1. Should be 0-16.
+            # If a register is used as a barrier resource id, then this value is the max of 16.
+            $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20;
+
+            # Extract the number of allocated registers for this kernel.
+            $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24;
+
+            # Extract the size of shared memory this kernel uses.
+            my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"};
+            $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0;
+
+            # Attach constant0 section
+            $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"};
+
+            # Extract the kernel parameter data.
+            my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"};
+            if ($paramSec)
+            {
+                # Extract raw param data
+                my @data = unpack "L*", pack "H*", $paramSec->{Data};
+
+                $paramSec->{ParamData} = \@data;
+                $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ];
+
+                # Find the first param delimiter
+                my $idx = 0;
+                $idx++ while $idx < @data && $data[$idx] != 0x00080a04;
+
+                my $first = $data[$idx+2] & 0xFFFF;
+                #my $size  = $data[$idx+2] >> 16;
+                $idx += 4;
+
+                my @params;
+                while ($idx < @data && $data[$idx] == 0x000c1704)
+                {
+                    # Get the ordinal, offset, size and pointer alignment for each param
+                    my $ord    = $data[$idx+2] & 0xFFFF;
+                    my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16);
+                    my $psize  = $data[$idx+3] >> 18;
+                    my $align  = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0;
+                    unshift @params, "$ord:$offset:$psize:$align";
+                    $idx += 4;
+                }
+                my @staticParams = @data[0 .. ($idx-1)];    # everything up to here is written back unchanged by modifyKernel
+
+                my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize);
+                while ($idx < @data)
+                {
+                    my $code = $data[$idx] & 0xffff;
+                    my $size = $data[$idx] >> 16;
+                    $idx++;
+
+                    # EIATTR_MAXREG_COUNT
+                    if ($code == 0x1b03)
+                    {
+                        $maxregCount = $size;
+                    }
+                    # EIATTR_S2RCTAID_INSTR_OFFSETS
+                    elsif ($code == 0x1d04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @ctaidOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_EXIT_INSTR_OFFSETS
+                    elsif ($code == 0x1c04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @exitOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_CTAIDZ_USED
+                    elsif ($code == 0x0401)
+                    {
+                        $ctaidzUsed = 1;
+                    }
+                    # EIATTR_REQNTID
+                    elsif ($code == 0x1004)
+                    {
+                        while ($size > 0)
+                        {
+                            push @reqntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_MAX_THREADS
+                    elsif ($code == 0x0504)
+                    {
+                        while ($size > 0)
+                        {
+                            push @maxntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_CRS_STACK_SIZE
+                    elsif ($code == 0x1e04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @stackSize, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    else
+                    {
+                        # NOTE(review): only the header word is skipped here; any payload words are parsed as further attributes.
+                        printf STDERR "Unknown Code 0x%02x (size:%d)\n", $code, $size;
+                    }
+                }
+                $kernelSec->{Params}   = \@params;
+                $kernelSec->{ParamCnt} = scalar @params;
+
+                $paramSec->{StaticParams} = \@staticParams;
+                $paramSec->{MAXREG_COUNT} = $maxregCount;
+                $paramSec->{ExitOffsets}  = \@exitOffsets;
+                $paramSec->{CTAIDOffsets} = \@ctaidOffsets;
+                $paramSec->{CTAIDZUsed}   = $ctaidzUsed;
+                $paramSec->{REQNTID}      = \@reqntid;
+                $paramSec->{MAXNTID}      = \@maxntid;
+                $paramSec->{STACKSIZE}    = \@stackSize;
+            }
+            # print Dumper($paramSec);
+            # exit();
+        }
+        # Note GLOBALs found in this cubin
+        elsif (($symEnt->{info} & 0x10) == 0x10)
+        {
+            $cubin->{Symbols}{$symEnt->{Name}} = $symEnt;
+        }
+    }
+
+    # print "phOffset: $elfHdr->{phOffset}\n";
+    # print "shOffset: $elfHdr->{shOffset}\n";
+    # foreach my $secHdr (@{$cubin->{secHdrs}})
+    # {
+    #     print "secHdr($secHdr->{Indx}): $secHdr->{offset}, $secHdr->{size}, $secHdr->{align} ($secHdr->{Name})\n";
+    # }
+    # my $p = 0;
+    # foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    # {
+    #     print "prgHdr($p): type: $prgHdr->{type}, offset: $prgHdr->{offset}, fileSize: $prgHdr->{fileSize}, memSize: $prgHdr->{memSize}, align: $prgHdr->{align}\n";
+    #     $p++;
+    # }
+    # exit();
+
+    # print Dumper($cubin->{prgHdrs});
+    # exit();
+    return $cubin;
+}
+sub class    # ELF class of the loaded file in bits (32 or 64), as recorded by new()
+{
+    return $_[0]{Class};
+}
+sub arch    # SM architecture number that new() took from the low byte of the ELF header flags
+{
+    return $_[0]{Arch};
+}
+sub address_size    # 64 when ELF header flag bit 0x400 was set at load time, otherwise 32
+{
+    return $_[0]{AddressSize};
+}
+sub listKernels    # hash ref mapping kernel name to its section header (undef if new() found no FUNC symbols)
+{
+    return $_[0]{Kernels};
+}
+sub listSymbols    # hash ref mapping symbol name to its symbol entry, for the non-FUNC symbols recorded by new()
+{
+    return $_[0]{Symbols};
+}
+sub getKernel    # section header of the kernel with the given name, or undef when there is no such kernel
+{
+    my $self = shift;
+    return $self->{Kernels}{ $_[0] };
+}
+
+sub modifyKernel    # apply newly assembled kernel data to a kernel section: instructions, register/barrier counts and the rebuilt param section
+{
+    my ($cubin, %params) = @_;    # %params keys read below: Kernel, RegCnt, BarCnt, ExitOffsets, CTAIDOffsets, CTAIDZUsed, KernelData
+
+    my $kernelSec    = $params{Kernel};
+    my $newReg       = $params{RegCnt};
+    my $newBar       = $params{BarCnt};
+    my $exitOffsets  = $params{ExitOffsets};
+    my $ctaidOffsets = $params{CTAIDOffsets};
+    my $ctaidzUsed   = $params{CTAIDZUsed};
+    my $newData      = $params{KernelData};
+    my $newSize      = @$newData * 8;    # each element is packed with "Q" below, i.e. 8 bytes
+
+    die "255 register max" if $newReg > 255;
+    die "new kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63;
+    die "16 is max barrier count" if $newBar > 16;
+
+    my $paramSec    = $kernelSec->{ParamSec};
+    my $kernelName  = $kernelSec->{SymbolEnt}{Name};
+    my $maxregCount = $paramSec->{MAXREG_COUNT};
+    my $stackSize   = $paramSec->{STACKSIZE};
+
+    # update the kernel
+    $kernelSec->{KernelData} = $newData;
+    $kernelSec->{Data}       = unpack "H*", pack "Q*", @$newData;    # Data is kept as a hex string, as in new()
+
+    if ($newReg != $kernelSec->{RegCnt})
+    {
+        print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n";
+        $kernelSec->{RegCnt} = $newReg;
+        $kernelSec->{info}  &= ~0xff000000;    # register count lives in bits 24-31 of the section info field
+        $kernelSec->{info}  |= $newReg << 24;
+    }
+    if ($newBar != $kernelSec->{BarCnt})
+    {
+        print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n";
+        $kernelSec->{BarCnt} = $newBar;
+        $kernelSec->{flags} &= ~0x01f00000;    # barrier count lives in bits 20-24 of the section flags field
+        $kernelSec->{flags} |=  $newBar << 20;
+    }
+
+    my @paramData = @{$paramSec->{StaticParams}};    # start from the leading words new() kept verbatim
+
+    if (defined $maxregCount)
+    {
+        push @paramData, ($maxregCount << 16) | 0x1b03;
+    }
+
+    my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets;
+    my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}};
+
+    if ($newCTAIDs ne $oldCTAIDs)
+    {
+        print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n";
+    }
+    if (@$ctaidOffsets)
+    {
+        push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04;    # count << 18 puts the payload size in bytes (4 per word) in the upper 16 bits
+        push @paramData, @$ctaidOffsets;
+    }
+
+    my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets;
+    my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}};
+
+    if ($newExits ne $oldExits)
+    {
+        print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n";
+    }
+    if (@$exitOffsets)
+    {
+        push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04;
+        push @paramData, @$exitOffsets;
+    }
+
+    if ($ctaidzUsed != $paramSec->{CTAIDZUsed})
+    {
+        print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n";
+    }
+    if ($ctaidzUsed)
+    {
+        push @paramData, 0x0401;
+    }
+
+    if (@{$paramSec->{REQNTID}})    # the remaining attributes are copied from the values new() parsed
+    {
+        push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004;
+        push @paramData, @{$paramSec->{REQNTID}};
+    }
+    if (@{$paramSec->{MAXNTID}})
+    {
+        push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504;
+        push @paramData, @{$paramSec->{MAXNTID}};
+    }
+
+    if (@$stackSize)
+    {
+        push @paramData, (scalar(@$stackSize) << 18) | 0x1e04;
+        push @paramData, @$stackSize;
+    }
+
+    my $newParamSize  = scalar(@paramData)*4;    # the words are packed with "L", i.e. 4 bytes each
+    $paramSec->{Data} = unpack "H*", pack "L*", @paramData;
+    if ($newParamSize != $paramSec->{size})
+    {
+        print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n";
+        $cubin->updateSize($paramSec, $newParamSize);
+    }
+
+    if ($newSize != $kernelSec->{size})
+    {
+        print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n";
+        $cubin->updateSize($kernelSec, $newSize, 1);    # third argument is updateSize's $updatePrgSize flag
+    }
+}
+
+sub updateSize
+{
+    my ($cubin, $sec, $newSize, $updatePrgSize) = @_;
+    # Set $sec's size to $newSize, then recompute all section, header-table and program-header offsets.
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+    # $updatePrgSize: when true, each type-1 program header whose range covers $sec is resized by $delta too.
+    # update section header ($delta is the signed size change)
+    my $delta = $newSize - $sec->{size};
+    $sec->{size} = $newSize;
+
+    # update symtab section: every entry is re-packed so the .symtab Data carries the new symbol size
+    if ($sec->{SymbolEnt})
+    {
+        $sec->{SymbolEnt}{size} = $newSize;
+        my $symSection = $cubin->{'.symtab'};
+        $symSection->{Data} = '';
+        foreach my $symEnt (@{$symSection->{SymTab}})
+        {
+            $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}};
+        }
+    }
+    # Sections are laid out one after another, starting right after the ELF header.
+    my $pos = $elfHdr->{ehSize};
+    my %sizeMap;
+
+    # update section header offsets
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        # skip first header (align == 0); this also keeps the modulo below from dividing by zero
+        next if $secHdr->{align} == 0;
+
+        # NOBITS data sections are size 0
+        my $size = $secHdr->{type} == 8 ? 0 : $secHdr->{size};
+
+        # Add any needed padding between sections
+        my $pad = $pos % $secHdr->{align};
+        if ($pad > 0)
+        {
+            $pos += $secHdr->{align} - $pad;
+        }
+        # map old offset to new (looked up below to relocate the program headers)
+        $sizeMap{$secHdr->{offset}} = $pos;
+
+        # update offset
+        $secHdr->{offset} = $pos;
+
+        # advance position by size
+        $pos += $size;
+    }
+
+    # compute total section header size (relies on the program headers directly following the section headers)
+    my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset};
+
+    # map old offset to new
+    $sizeMap{$elfHdr->{shOffset}} = $pos;
+    $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize;
+
+    $elfHdr->{shOffset} = $pos;
+    $elfHdr->{phOffset} = $pos + $shSize;
+    # NOTE(review): a program header whose old offset is not a key of %sizeMap ends up with offset undef.
+    # update program header offsets and sizes
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        # Not sure how best to adjust these so just assume they'll track other offsets.
+        $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}};
+
+        # If the kernel sizes changes, also update the associated ProgramHeader.
+        # Note that this size is the kernel size plus any constant section sizes.
+        if ($updatePrgSize && $prgHdr->{type} == 1 &&
+            $sec->{offset} >= $prgHdr->{offset} &&
+            $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta)
+        {
+            $prgHdr->{fileSize} += $delta;
+            $prgHdr->{memSize}  += $delta;
+        }
+    }
+}
+
+# Write the (modified) cubin to $file: ELF header, section data, section headers, then program headers.
+sub write
+{
+    my ($cubin, $file) = @_;
+    # 3-arg open so $file is always treated as a plain path, never parsed for a mode or pipe
+    open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
+    binmode($fh);
+
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+
+    # write elf header
+    print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};
+    my $pos = $elfHdr->{ehSize};
+
+    # write section data
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        # Skip NULL and NOBITS data sections
+        next if $secHdr->{size} == 0 || $secHdr->{type} == 8;
+
+        # Add any needed padding between sections
+        my $pad = $pos % $secHdr->{align};
+        if ($pad > 0)
+        {
+            $pad = $secHdr->{align} - $pad;
+            print $fh join '', "\0" x $pad;
+            $pos += $pad;
+        }
+
+        print $fh pack 'H*', $secHdr->{Data};
+        $pos += $secHdr->{size};
+    }
+    # NOTE(review): headers are emitted at the current position unpadded; verify this matches elfHdr shOffset.
+    # write section headers
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
+    }
+
+    #write program headers
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
+    }
+    close $fh or die "Error: could not close $file: $!"; # buffered write errors only surface at close
+}
+
+__END__
+
diff --git a/Assembler/PascalAs/blib/lib/MaxAs/MaxAs.pm b/Assembler/PascalAs/blib/lib/MaxAs/MaxAs.pm
new file mode 100644
index 0000000..f421cf3
--- /dev/null
+++ b/Assembler/PascalAs/blib/lib/MaxAs/MaxAs.pm
@@ -0,0 +1,1407 @@
+package MaxAs::MaxAs;
+
+require 5.10.0;
+
+use strict;
+use Data::Dumper;
+use MaxAs::MaxAsGrammar;
+use File::Spec;
+use Carp;
+
+our $VERSION = '1.06';
+
+# these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump
+my %relOffset  = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT);
+
+# these ops use absolute addresses
+my %absOffset  = map { $_ => 1 } qw(JCAL);
+# union of the two sets above: every op whose target operand is a label/address
+my %jumpOp     = (%relOffset, %absOffset);
+
+# These instructions use r0 but do not write to r0 (r0 is treated as a read when tracking live times)
+my %noDest     = map { $_ => 1 } qw(ST STG STS STL RED);
+
+# Map register operand slots to their reuse flag bit (the bits are OR-ed into a control code's reuse field)
+my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4);
+
+# Preprocess and Assemble a source file; returns a hashref of kernel op codes plus register/barrier/reuse stats
+sub Assemble
+{
+    my ($file, $include, $doReuse, $nowarn) = @_;
+    # $doReuse: compute reuse flags here (otherwise take them from the source); $nowarn: silence register warnings
+    my $regMap = {};
+    $file = Preprocess($file, $include, 0, $regMap);
+    my $vectors = delete $regMap->{__vectors};
+    my $regBank = delete $regMap->{__regbank};
+
+    # initialize cubin counts
+    my $regCnt = 0;
+    my $barCnt = 0;
+
+    my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse);
+
+    # initialize the first control instruction
+    push @instructs, $ctrl = {};
+
+    foreach my $line (split "\n", $file)
+    {
+        # keep track of line nums in the physical file
+        $lineNum++;
+
+        next unless preProcessLine($line);
+
+        # match an instruction
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # Save us from crashing the display driver
+            die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n"
+                if exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0;
+
+            # track branches/jumps/calls/etc for label remapping
+            push @branches, @instructs+0 if exists $jumpOp{$inst->{op}};
+
+            # push the control code onto the control instruction
+            push @{$ctrl->{ctrl}}, $inst->{ctrl};
+
+            # now point the instruction to its associated control instruction
+            $inst->{ctrl} = $ctrl;
+
+            # add the op name and full instruction text
+            push @instructs, $inst;
+
+            # add a 4th control instruction for every 3 instructions
+            push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
+        }
+        # match a label
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            # map the label name to the index of the instruction about to be inserted
+            $labels{$1} = @instructs+0;
+        }
+        else
+        {
+            die "badly formed line at $lineNum: $line\n";
+        }
+    }
+    # add the final BRA op and align the number of instructions to a multiple of 8
+    push @{$ctrl->{ctrl}}, 0x007ff;
+    push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' };
+    while (@instructs & 7)
+    {
+        push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
+        push @{$ctrl->{ctrl}}, 0x007e0;
+        push @instructs, { op => 'NOP', inst => 'NOP;' };
+    }
+
+    # remap labels
+    foreach my $i (@branches)
+    {
+        if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1})
+            { die "instruction has invalid label: $instructs[$i]{inst}"; }
+
+        $instructs[$i]{jump} = $labels{$1};
+
+        if (exists $relOffset{$instructs[$i]{op}})
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; }
+        else
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; }
+    }
+
+    # calculate optimal register reuse
+    # This affects register bank decisions so do it before analyzing register use
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            if ($doReuse)
+            {
+                # get any vector registers for r0
+                my @r0 = getVecRegisters($vectors, $capData);
+
+                # There are 2 reuse slots per register slot
+                # The reuse hash points to most recent instruction index where register was last used in this slot
+
+                # For writes to a register, clear any reuse opportunity
+                if (@r0 && !exists $noDest{$op})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        if (my $reuse = $reuse{$slot})
+                        {
+                            # if writing with a vector op, clear all linked registers
+                            delete $reuse->{$_} foreach @r0;
+                        }
+                    }
+                }
+                # clear cache if jumping elsewhere
+                %reuse = () if exists $jumpOp{$op};
+
+                # only track register reuse for instruction types this works with
+                if ($gram->{type}{reuse})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        next unless exists $capData->{$slot};
+
+                        my $r = $capData->{$slot};
+                        next if $r eq 'RZ';
+                        next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction
+
+                        my $reuse = $reuse{$slot} ||= {};
+
+                        # if this register was previously marked for potential reuse
+                        if (my $p = $reuse->{$r})
+                        {
+                            # flag the previous instruction's ctrl reuse array slot
+                            $instructs[$p]{ctrl}{reuse}[($p & 3) - 1] |= $reuseSlots{$slot};
+
+                            #print "reuse $slot $r $instructs[$p]{inst}\n";
+                        }
+                        # list full, delete the oldest
+                        elsif (keys %$reuse > 2)
+                        {
+                            my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0];
+                            delete $reuse->{$oldest};
+                        }
+                        # mark the new instruction for potential reuse
+                        $reuse->{$r} = $i;
+                    }
+                }
+            }
+            # if reuse is disabled then pull value from code.
+            elsif ($gram->{type}{reuse})
+            {
+                $ctrl->{reuse}[($i & 3) - 1] = genReuseCode($capData);
+            }
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    # Assign registers to requested banks if possible
+    foreach my $r (sort keys %$regBank)
+    {
+        my $bank  = $regBank->{$r};
+        my $avail = $regMap->{$r};
+        foreach my $pos (0 .. $#$avail)
+        {
+            if ($bank == ($avail->[$pos] & 3))
+            {
+                # assign it, while removing the assigned register from the pool
+                $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+                last;
+            }
+        }
+    }
+
+    # calculate register live times and preferred banks for non-fixed registers.
+    # LiveTime only half implemented...
+    my (%liveTime, %pairedBanks, %reuseHistory);
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData   = parseInstruct($inst, $gram) or next;
+            my $reuseType = $gram->{type}{reuse};
+
+            # liveTimes and bank conflicts with source operands
+            my (%addReuse, %delReuse);
+            foreach my $slot (qw(r8 r20 r39))
+            {
+                my $r = $capData->{$slot} or next;
+                next if $r eq 'RZ';
+
+                my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r};
+
+                # All registers should be written prior to being read..
+                if (my $liveTime = $liveTime{$liveR})
+                {
+                    # for each read set the current instruction index as the high value
+                    $liveTime->[$#$liveTime][1] = $i;
+                    push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                }
+                else
+                {
+                    warn "register used without initialization ($r): $inst\n" unless $nowarn;
+                    push @{$liveTime{$liveR}}, [$i,$i];
+                }
+
+                # Is this register active in the reuse cache?
+                my $slotHist  = $reuseHistory{$slot} ||= {};
+                my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0;
+
+                #print "IADD3-1: $slot:$r (!$selfReuse && $regMap->{$r})\n" if $op eq 'IADD3';
+
+                # If this is an auto reg, look at the open banks.
+                # No need to look at banks if this register is in the reuse cache.
+                if (!$selfReuse && ref $regMap->{$r})
+                {
+                    # Look at other source operands in this instruction and flag what banks are being used
+                    foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39))
+                    {
+                        my $r2 = $capData->{$slot2};
+                        next if $r2 eq 'RZ' || $r2 eq $r;
+
+                        my $slotHist2 = $reuseHistory{$slot2} ||= {};
+
+                        #print "IADD3-2: $slot:$r $slot2:$r2 (!$reuseType && !$slotHist2->{$r2})\n" if $op eq 'IADD3';
+
+                        # Dont be concerned with non-reuse type instructions or
+                        # If this operand is in the reuse cache, we don't care what bank it's on.
+                        if (!$reuseType || !exists $slotHist2->{$r2})
+                        {
+                            # if the operand is also an auto-allocated register then link them
+                            # Once we choose the bank for one we want to update that choice for the other register.
+                            if (ref $regMap->{$r2})
+                            {
+                                push @{$pairedBanks{$r}{pairs}}, $r2;
+                                $pairedBanks{$r}{banks} ||= [];
+                            }
+                            # For a fixed register, calculate the bank, flag it, and update the count of banks to avoid.
+                            else
+                            {
+                                my $bank = substr($regMap->{$r2},1) & 3;
+                                #print "IADD3-3: $r2:$bank\n" if $op eq 'IADD3';
+
+                                $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++;
+                                $pairedBanks{$r}{pairs} ||= [];
+                            }
+                            # Update the total use count for this register.
+                            # This will be the number of times the register is pulled out of the bank.
+                            $pairedBanks{$r}{useCnt}++;
+                        }
+                    }
+                }
+                # update the reuse history so we know which bank conflicts we can ignore.
+                if ($reuseType)
+                {
+                    # flag these slots for addition or removal from reuseHistory
+                    if ($ctrl->{reuse}[($i & 3) - 1] & $reuseSlots{$slot})
+                        { $addReuse{$slot} = $r; }
+                    else
+                        { $delReuse{$slot} = $r; }
+                }
+            }
+            # update reuse history after we're done with the instruction (when the flag is actually in effect).
+            # we don't want to update it in the middle since that can interfere with the checks,
+            $reuseHistory{$_}{$addReuse{$_}} = 1    foreach keys %addReuse;
+            delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse;
+
+            # liveTimes for destination operands and vector registers
+            foreach my $r0 (getVecRegisters($vectors, $capData))
+            {
+                # fixed register mappings can have aliases so use the actual register value for those.
+                my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0};
+
+                # If not writing treat just like a read
+                if (exists $noDest{$op})
+                {
+                    if (my $liveTime = $liveTime{$liveR})
+                    {
+                        $liveTime->[$#$liveTime][1] = $i;
+                        push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                    }
+                    else
+                    {
+                        warn "register used without initialization ($r0): $inst\n" unless $nowarn;
+                        push @{$liveTime{$liveR}}, [$i,$i];
+                    }
+                }
+                # If writing, push a new bracket on this register's stack.
+                elsif (my $liveTime = $liveTime{$liveR})
+                {
+                    if ($i > $liveTime->[$#$liveTime][1])
+                    {
+                        push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                    }
+                }
+                else
+                {
+                    # Initialize the liveTime stack for this register.
+                    push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                }
+            }
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+    #print Dumper(\%liveTime); exit(1);
+
+    # assign unassigned registers
+    # sort by most restricted, then most used, then name
+    foreach my $r (sort {
+                    $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} ||
+                    $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} ||
+                    $a cmp $b
+                  } keys %pairedBanks)
+    {
+        my $banks = $pairedBanks{$r}{banks};
+        my $avail = $regMap->{$r};
+
+        #printf "%10s: (%d,%d) %d,%d,%d,%d, %s\n", $r, $pairedBanks{$r}{bnkCnt}, $pairedBanks{$r}{useCnt}, @{$banks}[0,1,2,3], join ',', @$avail;
+
+        # Pick a bank with zero or the smallest number of conflicts
+        BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..3))
+        {
+            # pick an available register that matches the requested bank
+            foreach my $pos (0 .. $#$avail)
+            {
+                if ($bank == ($avail->[$pos] & 3))
+                {
+                    # assign it, while removing the assigned register from the pool
+                    $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+
+                    # update bank info for any unassigned pair
+                    $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}};
+                    last BANK;
+                }
+            }
+        }
+    }
+    # Now assign any remaining to first available
+    foreach my $r (sort keys %$regMap)
+    {
+        if (ref($regMap->{$r}) eq 'ARRAY')
+        {
+            $regMap->{$r} = 'R' . shift @{$regMap->{$r}};
+        }
+    }
+    #print map "$regMap->{$_}: $_\n", sort { substr($regMap->{$a},1) <=> substr($regMap->{$b},1) } keys %$regMap;
+
+    # apply the register mapping and assemble the instructions to op codes
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        # save the original and replace the register names with numbers
+        $instructs[$i]{orig} = $instructs[$i]{inst};
+        $instructs[$i]{inst} =~ s/(?<!\.)\b(\w+)\b(?!\[)/ exists($regMap->{$1}) ? $regMap->{$1} : $1 /ge;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            # update the register count
+            foreach my $r (qw(r0 r8 r20 r39))
+            {
+                next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ';
+
+                # get numeric portion of regname
+                my $val = substr $capData->{$r}, 1;
+
+                my @r0 = getVecRegisters($vectors, $capData);
+                my @r8 = getAddrVecRegisters($vectors, $capData);
+
+                # count vector registers for memory instructions: r0 data vectors and r8 address vectors.
+                my $regInc = $r eq 'r0' ? scalar(@r0) || 1
+                           : $r eq 'r8' ? scalar(@r8) || 1 : 1; # one declaration: a second `my` masked the r0 case
+
+                if ($val + $regInc > $regCnt)
+                {
+                    $regCnt = $val + $regInc;
+                    #print "$val $regCnt $regInc\n";
+                }
+            }
+            # update the barrier resource count
+            if ($op eq 'BAR')
+            {
+                if (exists $capData->{i8w4})
+                {
+                    $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt;
+                }
+                # if a barrier value is a register, assume the maximum
+                elsif (exists $capData->{r8})
+                {
+                    $barCnt = 16;
+                }
+            }
+            # Generate the op code.
+            my ($code, $reuse) = genCode($op, $gram, $capData);
+            $instructs[$i]{code} = $code;
+
+            # cache this for final pass when we want to calculate reuse stats.
+            if ($gram->{type}{reuse})
+                { $instructs[$i]{caps} = $capData; }
+            # use the parsed value of reuse for non-reuse type instructions
+            else
+                { $ctrl->{reuse}[($i & 3) - 1] = $reuse; }
+
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    # final pass to piece together control codes
+    my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed);
+    foreach my $i (0 .. $#instructs)
+    {
+        # op code
+        if ($i & 3)
+        {
+            push @codes, $instructs[$i]{code};
+
+            if ($instructs[$i]{caps})
+            {
+                # calculate stats on registers
+                registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 3) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn);
+            }
+            if ($instructs[$i]{inst} =~ m'EXIT')
+            {
+                push @exitOffsets, (scalar(@codes)-1)*8;
+            }
+            elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)')
+            {
+                push @ctaidOffsets, (scalar(@codes)-1)*8;
+                $ctaidzUsed = 1 if $1 eq 'Z';
+            }
+        }
+        # control code
+        else
+        {
+            my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)};
+            push @codes,
+                ($ctrl->[0] <<  0) | ($ctrl->[1] << 21) | ($ctrl->[2] << 42) | # ctrl codes
+                ($ruse->[0] << 17) | ($ruse->[1] << 38) | ($ruse->[2] << 59);  # reuse codes
+        }
+    }
+
+    # return the kernel data
+    return {
+        RegCnt       => $regCnt,
+        BarCnt       => $barCnt,
+        ExitOffsets  => \@exitOffsets,
+        CTAIDOffsets => \@ctaidOffsets,
+        CTAIDZUsed   => $ctaidzUsed,
+        ConflictCnt  => $reuseHistory{conflicts},
+        ReuseCnt     => $reuseHistory{reuse},
+        ReuseTot     => $reuseHistory{total},
+        ReusePct     => ($reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0),
+        KernelData   => \@codes,
+    };
+}
+
+# Useful for testing op code coverage of existing code, extracting new codes and flags; returns the failure count
+sub Test
+{
+    my ($fh, $printConflicts, $all) = @_;
+    # $fh: sass listing to re-encode; $printConflicts: pass instruction text to registerHealth; $all: also list passes
+    my @instructs;
+    my %reuseHistory;
+    my ($pass, $fail) = (0,0);
+
+    while (my $line = <$fh>)
+    {
+        my (@ctrl, @reuse);
+
+        next unless processSassCtrlLine($line, \@ctrl, \@reuse);
+
+        foreach my $fileReuse (@reuse)
+        {
+            $line = <$fh>;
+
+            my $inst = processSassLine($line) or next;
+
+            $inst->{reuse} = $fileReuse;
+            my $fileCode = $inst->{code};
+
+            if (exists $relOffset{$inst->{op}})
+            {
+                # these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump
+                $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e;
+            }
+
+            my $match = 0;
+            foreach my $gram (@{$grammar{$inst->{op}}})
+            {
+                my $capData = parseInstruct($inst->{inst}, $gram) or next;
+                my @caps;
+
+                # Run in test mode to list what capture groups were captured
+                my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps);
+
+                # Detect register bank conflicts but only for reuse type instructions.
+                # If a bank conflict is avoided by a reuse flag then ignore it.
+                registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? $inst->{inst} : '') if $gram->{type}{reuse};
+
+                $inst->{caps}      = join ', ', sort @caps;
+                $inst->{codeDiff}  = $fileCode  ^ $code;
+                $inst->{reuseDiff} = $fileReuse ^ $reuse;
+
+                # compare calculated and file values
+                if ($code == $fileCode && $reuse == $fileReuse)
+                {
+                    $inst->{grade} = 'PASS';
+                    push @instructs, $inst if $all;
+                    $pass++;
+                }
+                else
+                {
+                    $inst->{grade} = 'FAIL';
+                    push @instructs, $inst;
+                    $fail++;
+                }
+                $match = 1;
+                last;
+            }
+            unless ($match)
+            {
+                $inst->{grade}     = 'FAIL';
+                $inst->{codeDiff}  = $fileCode;
+                $inst->{reuseDiff} = $fileReuse;
+                push @instructs, $inst;
+                $fail++;
+            }
+        }
+    }
+    my %maxLen; # widest instruction text per op, used to size the report column
+    foreach (@instructs)
+    {
+        $maxLen{$_->{op}} = length($_->{ins}) if length($_->{ins}) > $maxLen{$_->{op}};
+    }
+    my ($lastOp, $template);
+    foreach my $inst (sort {
+        $a->{op}        cmp $b->{op}        ||
+        $a->{codeDiff}  <=> $b->{codeDiff}  ||
+        $a->{reuseDiff} <=> $b->{reuseDiff} ||
+        $a->{ins}       cmp $b->{ins}
+        } @instructs)
+    {
+        if ($lastOp ne $inst->{op})
+        {
+            $lastOp   = $inst->{op};
+            $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s   %s\n";
+            printf "\n%s %-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s   %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures);
+        }
+        printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)};
+    }
+    my $reusePct = $reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0;
+    # a literal percent sign must be written %% in a printf format ("% (" is an invalid conversion)
+    printf "\nRegister Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\nOp Code Coverage Totals: Pass: $pass Fail: $fail\n",
+        $reuseHistory{conflicts}, $reusePct, $reuseHistory{reuse}, $reuseHistory{total};
+
+    return $fail;
+}
+
+# Convert cuobjdump sass read from $in to the working format printed to $out ($params: "ord:offset:size:align" strings)
+sub Extract
+{
+    my ($in, $out, $params) = @_;
+
+    my %paramMap;
+    my %constants =
+    (
+        blockDimX => 'c[0x0][0x8]',
+        blockDimY => 'c[0x0][0xc]',
+        blockDimZ => 'c[0x0][0x10]',
+        gridDimX  => 'c[0x0][0x14]',
+        gridDimY  => 'c[0x0][0x18]',
+        gridDimZ  => 'c[0x0][0x1c]',
+    );
+    print $out "<CONSTANT_MAPPING>\n";
+    # %paramMap is the reverse lookup (constant address => name) used to rewrite instruction text below
+    foreach my $const (sort keys %constants)
+    {
+        print $out "    $const : $constants{$const}\n";
+        $paramMap{$constants{$const}} = $const;
+    }
+    print $out "\n";
+
+    foreach my $p (@$params)
+    {
+        my ($ord,$offset,$size,$align) = split ':', $p;
+        # params wider than 4 bytes get one name per 4-byte word: param_N[0], param_N[1], ...
+        if ($size > 4)
+        {
+            my $num = 0;
+            $offset = hex $offset;
+            while ($size > 0)
+            {
+                my $param = sprintf 'param_%d[%d]', $ord, $num;
+                my $const = sprintf 'c[0x0][0x%x]', $offset;
+                $paramMap{$const} = $param;
+                print $out "    $param : $const\n";
+                $size   -= 4;
+                $offset += 4;
+                $num    += 1;
+            }
+        }
+        else
+        {
+            my $param = sprintf 'param_%d', $ord;
+            my $const = sprintf 'c[0x0][%s]', $offset;
+            $paramMap{$const} = $param;
+            print $out "    $param : $const\n";
+        }
+    }
+    print $out "</CONSTANT_MAPPING>\n\n";
+    # %labels maps a jump target address to its generated TARGETn label
+    my %labels;
+    my $labelnum = 1;
+
+    my @data;
+    FILE: while (my $line = <$in>)
+    {
+        my (@ctrl, @ruse);
+        next unless processSassCtrlLine($line, \@ctrl, \@ruse);
+        # one further input line is read for each control code found on the control line
+        CTRL: foreach my $ctrl (@ctrl)
+        {
+            $line = <$in>;
+
+            my $inst = processSassLine($line) or next CTRL;
+
+            # Convert branch/jump/call addresses to labels
+            if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)')
+            {
+                my $target = hex($1);
+
+                # skip the final BRA (target equals its own {num} or {num}-8) and stop processing the file
+                last FILE if $inst->{op} eq 'BRA' && ($target == $inst->{num} || $target == $inst->{num}-8);
+
+                # check to see if we've already generated a label for this target address
+                my $label = $labels{$target};
+                unless ($label)
+                {
+                    # generate a label name and cache it
+                    $label = $labels{$target} = "TARGET$labelnum";
+                    $labelnum++;
+                }
+                # replace address with name
+                $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/;
+            }
+            $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $paramMap{$1 . $2} || $1 . $2 /eg;
+
+            $inst->{ctrl} = printCtrl($ctrl);
+
+            push @data, $inst;
+        }
+    }
+    # make a second pass now that we have the complete instruction address to label mapping
+    foreach my $inst (@data)
+    {
+        print $out "$labels{$inst->{num}}:\n" if exists $labels{$inst->{num}};
+        printf $out "%s %5s%s\n", @{$inst}{qw(ctrl pred ins)};
+    }
+}
+# Patterns for the preprocessor tags; the m flag lets ^ match at every line start and the s flag lets . cross newlines.
+my $CommentRe  = qr'^[\t ]*<COMMENT>.*?^\s*</COMMENT>\n?'ms;   # a whole COMMENT section, tags included
+my $IncludeRe  = qr'^[\t ]*<INCLUDE\s+file="([^"]+)"\s*/?>\n?'ms;   # $1 = value of the file attribute
+my $CodeRe     = qr'^[\t ]*<CODE(\d*)>(.*?)^\s*<\/CODE\1>\n?'ms;   # $1 = optional tag number (closing tag must repeat it), $2 = body
+my $ConstMapRe = qr'^[\t ]*<CONSTANT_MAPPING>(.*?)^\s*</CONSTANT_MAPPING>\n?'ms;   # $1 = section body
+my $RegMapRe   = qr'^[\t ]*<REGISTER_MAPPING>(.*?)^\s*</REGISTER_MAPPING>\n?'ms;   # $1 = section body
+my $ScheduleRe = qr'^[\t ]*<SCHEDULE_BLOCK>(.*?)^\s*</SCHEDULE_BLOCK>\n?'ms;   # $1 = section body
+my $InlineRe   = qr'\[(\+|\-)(.+?)\1\]'ms;   # inline code in [+ ... +] or [- ... -]: $1 = sign, $2 = code
+# IncludeFile($file, $include): return the full text of $file; if it cannot be opened as given, retry its file name under @$include.
+sub IncludeFile
+{
+    my ($file, $include) = @_;
+    my (undef, undef, $name) = File::Spec->splitpath($file);   # file-name part only; @$include is presumably a volume/directory pair - confirm against caller
+    local $/;                                                  # undef record separator: <$fh> returns the entire file
+    my $fh;   # opened read-only with three-arg open so characters such as > or | in the name are never taken as an open mode
+    if (!open $fh, '<', $file)
+    {
+        open $fh, '<', File::Spec->catpath(@$include, $name) or die "Could not open file for INCLUDE: $file ($!)\n";
+    }
+    my $content = <$fh>;
+    close $fh;
+    return $content;
+}
+# Preprocess($file, $include, $debug, $regMap): expand the preprocessor tags found in the source text $file and return the resulting text.
+sub Preprocess
+{
+    my ($file, $include, $debug, $regMap) = @_;
+    # $file holds the source text itself; $include is only forwarded to IncludeFile(), $debug only to Scheduler().
+    my $constMap = {};
+    my $removeRegMap;
+    if ($regMap)
+        { $removeRegMap = 1; }
+    else
+        { $regMap = {}; }
+    # A caller-supplied $regMap means the REGISTER_MAPPING section is stripped from the returned text (see below).
+    # include nested files
+    1 while $file =~ s|$IncludeRe| IncludeFile($1, $include) |eg;
+
+    # Strip out comments
+    $file =~ s|$CommentRe||g;
+
+    # NOTE(review): the next two steps string-eval code taken from the input text (in package MaxAs::MaxAs::CODE); only feed trusted sources.
+    # Execute the CODE sections (old way to run code, to be deprecated)
+    1 while $file =~ s|$CodeRe|
+        my $out = eval "package MaxAs::MaxAs::CODE; $2";
+        $@ ? die("CODE:\n$2\n\nError: $@\n") : $out |eg;
+
+    # Execute the inline code (new way)
+    $file =~ s|$InlineRe|
+        my ($type, $code) = ($1, $2);
+        my $out = eval "package MaxAs::MaxAs::CODE; $code";
+        $@ ? die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg;
+    # (only the [+ ... +] form inserts the value of the code; the [- ... -] form is replaced by nothing)
+    #Pull in the constMap
+    $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg;
+
+    my @newFile;
+    foreach my $line (split "\n", $file)
+    {
+        # skip comments
+        if ($line !~ m'^\s*(?:#|//).*')
+        {
+            $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg;
+        }
+        push @newFile, $line;
+    }
+    $file = join "\n", @newFile;
+
+    # Pull in the reg map first as the Scheduler will need it to handle vector instructions
+    # Remove the regmap if we're going on to assemble
+    $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg;
+
+    # Pick out the SCHEDULE_BLOCK sections
+    my @schedBlocks = $file =~ /$ScheduleRe/g;
+
+    # Schedule them
+    foreach my $i (0 .. $#schedBlocks)
+    {
+        # XMAD macros should only appear in SCHEDULE_BLOCKs
+        $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]);
+        # the second argument is the 1-based block number
+        $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug);
+    }
+
+    # Replace the results
+    $file =~ s|$ScheduleRe| shift @schedBlocks |eg;
+
+    return $file;
+}
+
+# break the registers down into source and destination categories for the scheduler (entries are capture names looked up in the parsed operand data)
+my %srcReg   = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X);   # captures treated as reads
+my %destReg  = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC);        # captures treated as writes
+my %regops   = (%srcReg, %destReg);                            # union of both sets
+my @itypes   = qw(class lat rlat tput dual);                   # fields copied from the grammar entry's type hash onto each instruction
+# Scheduler($block, $blockNum, $regMap, $debug): reorder the instructions of one SCHEDULE_BLOCK and return the rescheduled text.
+sub Scheduler
+{
+    my ($block, $blockNum, $regMap, $debug) = @_;
+    # $block is the block text, $blockNum its number for error messages; dies on labels, bad lines and unknown instructions.
+    my $vectors = $regMap->{__vectors};
+    my $lineNum = 0;
+    # $lineNum counts lines inside this block; once the first loop is done it holds the number of the block's last line.
+    my (@instructs, @comments, $ordered, $first);
+    foreach my $line (split "\n", $block)
+    {
+        # keep track of line nums within the block
+        $lineNum++;
+
+        unless (preProcessLine($line))
+        {
+            push @comments, $line if $line =~ m'\S';
+            next;
+        }
+
+        # match an instruction
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # if the first instruction in the block is waiting on a dep, it should go first.
+            $inst->{first}   = !$first++ && ($inst->{ctrl} & 0x1f800) ? 0 : 1;
+
+            # if the instruction has a stall of zero set, it's meant to be last (to mesh with next block)
+            #$inst->{first}   = $inst->{ctrl} & 0x0000f ? 1 : 2;
+            $inst->{exeTime} = 0;
+            $inst->{order}   = $ordered++ if $ordered;
+            push @instructs, $inst;
+        }
+        # match a label
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n";
+        }
+        # open an ORDERED block
+        elsif ($line =~ m'^<ORDERED>')
+        {
+            die "you cannot use nested <ORDERED> tags" if $ordered;
+            $ordered = 1;
+        }
+        # close an ORDERED block
+        elsif ($line =~ m'^</ORDERED>')
+        {
+            die "missing opening <ORDERED> for closing </ORDERED> tag" if !$ordered;
+            $ordered = 0;
+        }
+        else
+        {
+            die "badly formed line at block: $blockNum line: $lineNum: $line\n";
+        }
+    }
+    # Build the dependency graph: %writes / %reads map a register name to the instructions that wrote / read it so far.
+    my (%writes, %reads, @ready, @schedule, $orderedParent);
+    # assemble the instructions to op codes
+    foreach my $instruct (@instructs)
+    {
+        my $match = 0;
+        foreach my $gram (@{$grammar{$instruct->{op}}})
+        {
+            my $capData = parseInstruct($instruct->{inst}, $gram) or next;
+            my (@dest, @src);
+
+            # copy over instruction types for easier access
+            @{$instruct}{@itypes} = @{$gram->{type}}{@itypes};
+
+            # A predicate prefix is treated as a source reg
+            push @src, $instruct->{predReg} if $instruct->{pred};
+
+            # Handle P2R and R2P specially
+            if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7})
+            {
+                my $list = $instruct->{op} eq 'R2P' ? \@dest : \@src;
+                my $mask = hex($capData->{i20w7});
+                foreach my $p (0..6)
+                {
+                    if ($mask & (1 << $p))
+                    {
+                        push @$list, "P$p";
+                    }
+                    # make this instruction dependent on any predicates it's not setting
+                    # this is to prevent a race condition for any predicate sets that are pending
+                    elsif ($instruct->{op} eq 'R2P')
+                    {
+                        push @src, "P$p";
+                    }
+                }
+                # These instructions can't be dual issued
+                $instruct->{nodual} = 1;
+            }
+
+            # Populate our register source and destination lists, skipping any zero or true values
+            foreach my $operand (grep { exists $regops{$_} } sort keys %$capData)
+            {
+                # figure out which list to populate
+                my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src;
+
+                # Filter out RZ and PT
+                my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT';
+
+                if ($capData->{$operand} ne $badVal)
+                {
+                    # add the value to list with the correct prefix
+                    push @$list,
+                        $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) :
+                        $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) :
+                        $operand eq 'CC' ? 'CC' :
+                        $operand eq 'X'  ? 'CC' :
+                        getRegNum($regMap, $capData->{$operand});
+                }
+            }
+            $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39});
+
+            # Find Read-After-Write dependencies
+            foreach my $src (grep { exists $writes{$_} } @src)
+            {
+                # Memory operations get delayed access to registers but not to the predicate
+                my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat};
+
+                # the parent should be the most recently added dest op to the stack
+                foreach my $parent (@{$writes{$src}})
+                {
+                    # add this instruction as a child of the parent
+                    # set the edge to the total latency of reg source availability
+                    #print "R $parent->{inst}\n\t\t$instruct->{inst}\n";
+                    my $latency = $src =~ m'^P\d' ? 13 : $parent->{lat};
+                    push @{$parent->{children}}, [$instruct, $latency - $regLatency];
+                    $instruct->{parents}++;
+
+                    # if the destination was conditionally executed, we also need to keep going back till it wasn't
+                    last unless $parent->{pred};
+                }
+            }
+
+            # Find Write-After-Read dependencies
+            foreach my $dest (grep { exists $reads{$_} } @dest)
+            {
+                # Flag this instruction as dependent to any previous read
+                foreach my $reader (@{$reads{$dest}})
+                {
+                    # no need to stall for these types of dependencies
+                    #print "W $reader->{inst} \t\t\t $instruct->{inst}\n";
+                    push @{$reader->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                # Once dependence is marked we can clear out the read list (unless this write was conditional).
+                # The assumption here is that you would never want to write out a register without
+                # subsequently reading it in some way prior to writing it again.
+                delete $reads{$dest} unless $instruct->{pred};
+            }
+
+            # Enforce instruction ordering where requested
+            if ($instruct->{order})
+            {
+                if ($orderedParent)
+                {
+                    push @{$orderedParent->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                $orderedParent = $instruct;
+            }
+            elsif ($orderedParent)
+                {  $orderedParent = 0; }
+
+            # For a dest reg, push it onto the write stack
+            unshift @{$writes{$_}}, $instruct foreach @dest;
+
+            # For a src reg, push it into the read list
+            push @{$reads{$_}}, $instruct foreach @src;
+
+            # if this instruction has no dependencies it's ready to go
+            push @ready, $instruct if !exists $instruct->{parents};
+
+            $match = 1;
+            last;
+        }
+        die "Unable to recognize instruction at block: $blockNum line: $instruct->{lineNum}: $instruct->{inst}\n" unless $match;
+    }
+    %writes = ();
+    %reads  = ();
+
+    if (@ready)
+    {
+        # update dependent counts for sorting heuristic
+        my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" };
+
+        countUniqueDescendants($readyParent, {});
+        updateDepCounts($readyParent, {});
+
+        # sort the initial ready list
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    # Process the ready list, adding new instructions to the list as we go.
+    my $clock = 0;
+    while (my $instruct = shift @ready)
+    {
+        my $stall = $instruct->{stall};
+
+        # apply the stall to the previous instruction
+        if (@schedule && $stall < 16)
+        {
+            my $prev = $schedule[$#schedule];
+
+            # if stall is greater than 4 then also yield
+            # the yield flag is required to get stall counts 12-15 working correctly.
+            $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0;
+            $prev->{ctrl} |= $stall;
+            $clock += $stall;
+        }
+        # First scheduled instruction, or a stall bigger than 15 (we assume the user is managing it with a barrier)
+        else
+        {
+            $instruct->{ctrl} &= 0x1fff0;
+            $instruct->{ctrl} |= 1;
+            $clock += 1;
+        }
+        print "$clock: $instruct->{inst}\n" if $debug;
+
+        # add a new instruction to the schedule
+        push @schedule, $instruct;
+
+        # update each child with a new earliest execution time
+        if (my $children = $instruct->{children})
+        {
+            foreach (@$children)
+            {
+                my ($child, $latency) = @$_;
+
+                # update the earliest clock value this child can safely execute
+                my $earliest = $clock + $latency;
+                $child->{exeTime} = $earliest if $child->{exeTime} < $earliest;
+
+                print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug;
+
+                # decrement parent count and add to ready queue if none remaining.
+                push @ready, $child if --$child->{parents} < 1;
+            }
+            delete $instruct->{children};
+        }
+
+        # update stall and mix values in the ready queue on each iteration
+        foreach my $ready (@ready)
+        {
+            # calculate how many instructions this would cause the just added instruction to stall.
+            $stall = $ready->{exeTime} - $clock;
+            $stall = 1 if $stall < 1;
+
+            # if using the same compute resource as the prior instruction then limit the throughput
+            if ($ready->{class} eq $instruct->{class})
+            {
+                $stall = $ready->{tput} if $stall < $ready->{tput};
+            }
+            # dual issue with a simple instruction (tput <= 2)
+            # can't dual issue two instructions that both load a constant
+            elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 && !$instruct->{nodual} &&
+                   $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const}))
+            {
+                $stall = 0;
+            }
+            $ready->{stall} = $stall;
+
+            # add an instruction class mixing heuristic that catches anything not handled by the stall
+            $ready->{mix} = $ready->{class} ne $instruct->{class} || 0;
+        }
+
+        # sort the ready list by stall time, mixing heuristic, dependencies and line number
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $a->{stall}   <=> $b->{stall}  ||
+            $b->{mix}     <=> $a->{mix}    ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "\tf,ext,stl,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+    # The @comments collected above are not emitted: the line that did so is disabled.
+    my $out;
+    #$out .= "$_\n" foreach @comments;
+    $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule;
+    return $out;
+}
+# setConstMap($constMap, $constMapText): load the "name : value" lines of a CONSTANT_MAPPING body into %$constMap.
+sub setConstMap
+{
+    my ($constMap, $constMapText) = @_;
+
+    for my $raw (split "\n", $constMapText)
+    {
+        # work on a copy with any # or // comment removed
+        (my $entry = $raw) =~ s{(?:#|//).*}{};
+        # trim whitespace at both ends
+        $entry =~ s|^\s+||;
+        $entry =~ s|\s+$||;
+        # nothing left: blank or comment-only line
+        next unless length $entry;
+
+        # the colon separator may be surrounded by whitespace
+        my ($name, $value) = split /\s*:\s*/, $entry;
+        $constMap->{$name} = $value;
+    }
+
+    return;
+}
+# setRegisterMap($regMap, $regmapText): parse a REGISTER_MAPPING body into %$regMap (names, __vectors, __regbank); dies on malformed lines.
+sub setRegisterMap
+{
+    my ($regMap, $regmapText) = @_;
+
+    my $vectors = $regMap->{__vectors} ||= {};
+    my $regBank = $regMap->{__regbank} ||= {};
+    my %aliases;
+
+    foreach my $line (split "\n", $regmapText)
+    {
+        # strip leading space
+        $line =~ s|^\s+||;
+        # strip comments
+        $line =~ s{(?:#|//).*}{};
+        # strip trailing space
+        $line =~ s|\s+$||;
+        # skip blank lines
+        next unless $line =~ m'\S';
+
+        my $auto  = $line =~ /~/;   # "nums ~ names": every name gets the whole number list as candidates
+        my $share = $line =~ /=/;   # "num = names": every name maps to the same single register
+
+        my ($regNums, $regNames) = split '\s*[:~=]\s*', $line;
+
+        my (@numList, @nameList, %vecAliases);
+        foreach my $num (split '\s*,\s*', $regNums)
+        {
+            my ($start, $stop) = split '\s*\-\s*', $num;
+            die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop;
+            push @numList, ($start .. $stop||$start);
+        }
+        foreach my $fullName (split '\s*,\s*', $regNames)
+        {
+            if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$')
+            {
+                my ($name1, $name2, $bank) = ($1, $3, $4);
+                foreach (split '\s*\|\s*', $2)
+                {
+                    my ($start, $stop) = split '\s*\-\s*';
+                    foreach my $r (map "$name1$_$name2", $start .. $stop||$start)
+                    {
+                        # define an alias for use in vector instructions that omits the number portion
+                        $aliases{$r} = "$name1$name2" unless exists $aliases{$r};
+                        push @nameList, $r;
+                        $regBank->{$r} = $bank if $auto && defined $bank;
+                        warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank;
+                    }
+                }
+            }
+            elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$')
+            {
+                push @nameList, $1;
+                $regBank->{$1} = $2 if $auto && defined $2;
+                warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2;
+            }
+            else
+            {
+                die "Bad register name: '$fullName' at: $line\n";
+            }
+        }
+        die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList;
+        die "Missmatched register mapping at: $line\n" if $share && @numList > 1;
+        # detect if this list is monotonically ascending with no gaps
+        # (every adjacent pair is compared, the last one included, so a list such as 0,1,2,5 is rejected)
+        my $i = 0;
+        while ($i < $#numList)
+        {
+            last if $numList[$i] + 1 != $numList[$i+1];
+            $i++;
+        }
+        my $ascending = $i == $#numList;
+
+        foreach my $n (0..$#nameList)
+        {
+            die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]};
+
+            if ($auto)
+            {
+                # assign possible values to be assigned on assembly
+                $regMap->{$nameList[$n]} = \@numList;
+            }
+            elsif ($share)
+            {
+                # each name shares the same single register
+                $regMap->{$nameList[$n]} = 'R' . $numList[0];
+            }
+            else
+            {
+                $regMap->{$nameList[$n]} = 'R' . $numList[$n];
+                # flag any even register as a potential vector
+                if ($ascending && ($numList[$n] & 1) == 0)
+                {
+                    # constrain potential range to vector alignment
+                    my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3);
+                    if ($end <= $#nameList)
+                    {
+                        $vectors->{$nameList[$n]} = [ @nameList[$n .. $end] ];
+                        #setup an alias for the base name without the number
+                        if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}})
+                        {
+                            $regMap->{$aliases{$nameList[$n]}}  = $regMap->{$nameList[$n]};
+                            $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]};
+                            delete $aliases{$nameList[$n]};
+                        }
+                    }
+                }
+            }
+        }
+    }
+    #print Dumper($regMap); exit(1);
+}
+# preProcessLine($line): strip leading whitespace from the caller's $line and report whether it holds more than a comment.
+sub preProcessLine
+{
+    # the edit goes through the @_ alias, so the caller's variable is changed
+    $_[0] =~ s|^\s+||;
+
+    # from here on only a private copy is touched: the comment text stays in the caller's line
+    my ($copy) = @_;
+
+    # remove a # or // comment from the copy
+    $copy =~ s{(?:#|//).*}{};
+
+    # true when something other than whitespace is left
+    return $copy =~ m'\S';
+}
+
+# Walk the dependency graph below $node and record, per node, the set of distinct descendants in its deps hash.
+# Nodes are identified by lineNum; the return value is the node's own lineNum followed by the keys of that hash.
+sub countUniqueDescendants
+{
+    my ($node, $edges) = @_;
+    my $self = $node->{lineNum};
+
+    # a node without a children list contributes only itself
+    my $kids = $node->{children} or return $self;
+
+    for my $edge (@$kids)
+    {
+        my ($kid, $latency) = @$edge;
+        next unless $latency;                           # skip WaR deps (zero latency)
+        next if $edges->{"$self^$kid->{lineNum}"}++;    # skip traversed edges
+
+        # everything the child reports (itself included) becomes a key of this node's deps hash
+        my @below = countUniqueDescendants($kid, $edges);
+        $node->{deps}{$_}++ for @below;
+    }
+
+    return ($self, keys %{$node->{deps}});
+}
+# Replace each node's deps hash by the number of keys in it (a plain number is easier to sort on).
+sub updateDepCounts
+{
+    my ($node, $edges) = @_;
+
+    # recurse into every child, following each parent^child edge only once
+    for my $edge (@{ $node->{children} || [] })
+    {
+        my $kid = $edge->[0];
+        updateDepCounts($kid, $edges) unless $edges->{"$node->{lineNum}^$kid->{lineNum}"}++;
+    }
+
+    # a hash ref becomes its key count; anything else is forced to a number
+    # (a node reached through several parents is visited again, by which time deps is already a number)
+    my $deps = $node->{deps};
+    $node->{deps} = ref($deps) ? scalar(keys %$deps) : $deps + 0;
+}
+
+# Detect register bank conflicts among the r8/r20/r39 operands of one instruction, update the reuse stats, return the number of conflicting registers
+sub registerHealth
+{
+    my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_;
+    # $reuseHistory is updated in place: counters total/reuse/conflicts plus one hash of reused registers per slot.
+    my (@banks, @conflicts);
+
+    foreach my $slot (qw(r8 r20 r39))
+    {
+        my $r = $capData->{$slot} or next;
+        next if $r eq 'RZ';
+
+        my $slotHist = $reuseHistory->{$slot} ||= {};
+
+        $reuseHistory->{total}++;
+
+        # if this register is in active reuse then ignore for bank conflict checking.
+        if (exists $slotHist->{$r})
+        {
+            $reuseHistory->{reuse}++;
+        }
+        else
+        {
+            # extract number from reg and take the modulo-4 value.  This is the bank id.
+            my $bank = substr($r,1) & 3;
+
+            # check for conflict
+            if ($banks[$bank] && $banks[$bank] ne $r)
+            {
+                push @conflicts, $banks[$bank] if !@conflicts;
+                push @conflicts, $r;
+
+                $reuseHistory->{conflicts}++;
+            }
+            $banks[$bank] = $r;
+        }
+
+        # update the history
+        if ($reuseFlags & $reuseSlots{$slot})
+            { $slotHist->{$r} = 1; }
+        else
+            { delete $slotHist->{$r};  }
+    }
+    if ($inst && @conflicts && !$nowarn)
+    {
+        # the instruction text is passed as an argument so that a % inside it cannot be read as a format directive
+        printf "CONFLICT at 0x%04x (%s): %s\n", $instAddr, join(',', @conflicts), $inst;
+    }
+    return scalar @conflicts;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+MaxAs::MaxAs - Assembler for NVIDIA Maxwell architecture
+
+=head1 SYNOPSIS
+
+    maxas.pl [opts]
+
+=head1 DESCRIPTION
+
+See the documentation at: https://github.com/NervanaSystems/maxas
+
+=head1 SEE ALSO
+
+See the documentation at: https://github.com/NervanaSystems/maxas
+
+
+=head1 AUTHOR
+
+Scott Gray, E<lt>sgray@nervanasys.comE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Scott Gray
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+=cut
diff --git a/Assembler/PascalAs/blib/lib/MaxAs/MaxAsGrammar.pm b/Assembler/PascalAs/blib/lib/MaxAs/MaxAsGrammar.pm
new file mode 100644
index 0000000..fc61543
--- /dev/null
+++ b/Assembler/PascalAs/blib/lib/MaxAs/MaxAsGrammar.pm
@@ -0,0 +1,1437 @@
+package MaxAs::MaxAsGrammar;
+
+use strict;
+use Carp;
+use Exporter;
+use Data::Dumper;
+our @ISA = qw(Exporter);
+
+our @EXPORT = qw(
+    %grammar %flags
+    parseInstruct genCode genReuseCode
+    processAsmLine processSassLine processSassCtrlLine
+    replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters
+);
+
+require 5.10.0;
+
+# Helper functions for operands.  getI($orig, $pos, $mask): encode an immediate operand and return it shifted to bit $pos.
+sub getI
+{
+    my ($orig, $pos, $mask) = @_;
+    my $val = $orig;
+    my $neg = $val =~ s|^\-||;      # remember and remove a leading minus sign
+
+    # parse out our custom index immediates for addresses
+    if ($val  =~ m'^(\d+)[xX]<([^>]+)>')
+    {
+        # allow any perl expression and multiply result by leading decimal.
+        # also allow global scalar variables in the expression.
+        my $mul = $1;
+        my $exp = $2;
+        # strip leading zeros (don't interpret numbers as octal)
+        $exp =~ s/(?<!\d)0+(?=[1-9])//g;
+        my @globals = $exp =~ m'\$\w+'g;
+        my $our = @globals ? ' our (' . join(',',@globals) . ');' : '';
+        $val = $mul * eval "package MaxAs::MaxAs::CODE;$our $exp";
+        die "Bad index immediate expression ($orig): $@" if $@;   # a broken expression used to encode silently as 0
+    }
+    # hexadecimal value; the 0X spelling is accepted as well because the operand grammar ($hex) allows it
+    elsif ($val  =~ m'^0[xX][0-9a-fA-F]+')
+    {
+        $val = hex($val);
+    }
+    # otherwise val is a simple decimal value that doesn't need to be modified
+
+    if ( $neg )
+    {
+        # if the mask removes the sign bit the "neg" flag adds it back on the code somewhere else
+        $val = -$val;
+        $val &= $mask;
+    }
+    if (($val & $mask) != $val)
+    {
+        die sprintf "Immediate value out of range(0x%x): 0x%x ($orig)\n", $mask, $val;
+    }
+    return $val << $pos;
+}
+sub getF
+{
+    my ($val, $pos, $type, $trunc) = @_;   # $type: pack template (f or d); $trunc: number of low bits to drop, if any
+    # hexadecimal value is taken as the raw bit pattern; the 0X spelling is accepted like 0x
+    if ($val  =~ m'^0[xX][0-9a-fA-F]+')
+    {
+        $val = hex($val);
+    }
+    # support infinity
+    elsif ($val =~ m'INF'i)
+    {
+        $val = $trunc ? ($type eq 'f' ? 0x7f800 : 0x7ff00) : 0x7f800000;
+    }
+    else
+    {
+        $val = unpack(($type eq 'f' ? 'L' : 'Q'), pack $type, $val);
+        # the line above reinterprets the packed float (or double) as an unsigned 32-bit (or 64-bit) integer
+        # strip off sign bit if truncating.  It will added elsewhere in the code by the flag capture.
+        $val = ($val >> $trunc) & 0x7ffff if $trunc;
+    }
+    return $val << $pos;
+}
+sub getR
+{
+    # Encode a register operand (R followed by a number below 255, or RZ) and shift it to bit offset $pos.
+    my ($val, $pos) = @_;
+
+    # anything else, including a register number of 255 or more, is rejected
+    die "Bad register name found: $val\n"
+        unless $val =~ m'^R(\d+|Z)$' && $1 < 255;
+
+    # RZ is stored as 0xff; a numbered register is stored as its number
+    my $num = $1 eq 'Z' ? 0xff : $1;
+    return $num << $pos;
+}
+sub getP
+{
+    # Encode a predicate operand (P0 to P6, or PT) and shift it to bit offset $pos.
+    my ($val, $pos) = @_;
+
+    # only a single digit below 7, or T, may follow the P
+    die "Bad predicate name found: $val\n"
+        unless $val =~ m'^P(\d|T)$' && $1 < 7;
+
+    # PT is stored as 7; P0 to P6 are stored as their number
+    my $num = $1 eq 'T' ? 7 : $1;
+    return $num << $pos;
+}
+sub getC { ((hex($_[0]) >> 2) & 0x7fff) << 20 } # hex string -> value shifted right by 2, masked to 15 bits, placed at bit 20
+
+# Map operands into their value and position in the op code: each sub takes the operand text and returns its bits already shifted into place.
+my %operands =
+(
+    p0      => sub { getP($_[0], 0)  },
+    p3      => sub { getP($_[0], 3)  },
+    p12     => sub { getP($_[0], 12) },
+    p29     => sub { getP($_[0], 29) },
+    p39     => sub { getP($_[0], 39) },
+    p45     => sub { getP($_[0], 45) },
+    p48     => sub { getP($_[0], 48) },
+    p58     => sub { getP($_[0], 58) },
+    r0      => sub { getR($_[0], 0)  },
+    r8      => sub { getR($_[0], 8)  },
+    r20     => sub { getR($_[0], 20) },
+    r28     => sub { getR($_[0], 28) },
+    r39s20  => sub { getR($_[0], 39) },
+    r39     => sub { getR($_[0], 39) },
+    r39a    => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to wipe it out, register must be in sequence with r20
+    c20     => sub { getC($_[0])     },
+    c39     => sub { getC($_[0])     },
+    c34     => sub { hex($_[0]) << 34 },
+    c36     => sub { hex($_[0]) << 36 },
+    f20w32  => sub { getF($_[0], 20, 'f')        },
+    f20     => sub { getF($_[0], 20, 'f', 12)    },
+    d20     => sub { getF($_[0], 20, 'd', 44)    },
+    i8w4    => sub { getI($_[0], 8,  0xf)        }, # iNwM entries: immediate placed at bit N and checked against an M-bit mask
+    i20     => sub { getI($_[0], 20, 0x7ffff)    },
+    i20w6   => sub { getI($_[0], 20, 0x3f)       },
+    i20w7   => sub { getI($_[0], 20, 0x7f)       },
+    i20w8   => sub { getI($_[0], 20, 0xff)       },
+    i20w12  => sub { getI($_[0], 20, 0xfff)      },
+    i20w24  => sub { getI($_[0], 20, 0xffffff)   },
+    i20w32  => sub { getI($_[0], 20, 0xffffffff) },
+    i31w4   => sub { getI($_[0], 31, 0xf)        },
+    i34w13  => sub { getI($_[0], 34, 0x1fff)     },
+    i36w20  => sub { getI($_[0], 36, 0xfffff)    },
+    i39w8   => sub { getI($_[0], 39, 0xff)       },
+    i28w8   => sub { getI($_[0], 28, 0xff)       },
+    i28w20  => sub { getI($_[0], 28, 0xfffff)    },
+    i48w8   => sub { getI($_[0], 48, 0xff)       },
+    i51w5   => sub { getI($_[0], 51, 0x1f)       },
+    i53w5   => sub { getI($_[0], 53, 0x1f)       },
+);
+
+# Rules for operands and their closely tied flags; named captures such as r0 or i20w7 carry the same names as the keys of %operands
+my $hex     = qr"0[xX][0-9a-fA-F]+";
+my $iAddr   = qr"\d+[xX]<[^>]+>";
+my $immed   = qr"$hex|$iAddr|\d+"o;
+my $reg     = qr"[a-zA-Z_]\w*"; # must start with letter or underscore
+my $p       = qr"P[0-6T]";
+my $noPred  = qr"(?<noPred>)";
+my $pred    = qr"\@(?<predNot>\!)?P(?<predNum>[0-6]) ";
+my $p0      = qr"(?<p0>$p)"o;
+my $p3      = qr"(?<p3>$p)"o;
+my $p12     = qr"(?<p12not>\!)?(?<p12>$p)"o;
+my $p29     = qr"(?<p29not>\!)?(?<p29>$p)"o;
+my $p39     = qr"(?<p39not>\!)?(?<p39>$p)"o;
+my $p45     = qr"(?<p45>$p)"o;
+my $p48     = qr"(?<p48>$p)"o;
+my $p58     = qr"(?<p58>$p)"o;
+my $r0      = qr"(?<r0>$reg)";
+my $r0cc    = qr"(?<r0>$reg)(?<CC>\.CC)?";
+my $r8      = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse1>\.reuse)?";
+my $r20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse2>\.reuse)?";
+my $r28     = qr"(?<r28>$reg)";
+my $r39s20  = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r39s20>(?<r20>$reg))\|?(?:\.(?<r39part>H0|H1))?(?<reuse2>\.reuse)?";
+my $r39     = qr"(?<r39neg>\-)?(?<r39>$reg)(?:\.(?<r39part>H0|H1))?(?<reuse3>\.reuse)?";
+my $r39a    = qr"(?<r39a>(?<r39>$reg))(?<reuse3>\.reuse)?";
+my $c20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3))?"o;
+my $c20x    = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20partx>H0|H1|B0|B1|B2|B3))?"o;
+my $c20s39  = qr"(?<r39neg>\-)?c\[(?<c34>$hex)\]\s*\[(?<c39>$hex)\]"o;
+my $f20w32  = qr"(?<f20w32>(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))";
+my $f20     = qr"(?<f20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;
+my $d20     = qr"(?<d20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;
+my $i8w4    = qr"(?<i8w4>$immed)"o;
+my $i20     = qr"(?<i20>(?<neg>\-)?$immed)(?<r20neg>\.NEG)?"o;
+my $i20w6   = qr"(?<i20w6>$immed)"o;
+my $i20w7   = qr"(?<i20w7>$immed)"o;
+my $i20w8   = qr"(?<i20w8>$immed)"o;
+my $i20w12  = qr"(?<i20w12>$immed)"o;
+my $i20w24  = qr"(?<i20w24>\-?$immed)"o;
+my $i20w32  = qr"(?<i20w32>\-?$immed)"o;
+my $i39w8   = qr"(?<i39w8>\-?$immed)"o;
+my $i28w8   = qr"(?<i28w8>$immed)"o;
+my $i28w20  = qr"(?<i28w20>\-?$immed)"o;
+my $i31w4   = qr"(?<i31w4>$immed)"o;
+my $i34w13  = qr"(?<i34w13>$immed)"o;
+my $i36w20  = qr"(?<i36w20>$immed)"o;
+my $i48w8   = qr"(?<i48w8>$immed)"o;
+my $i51w5   = qr"(?<i51w5>$immed)"o;
+my $i53w5   = qr"(?<i53w5>$immed)"o;
+my $ir20    = qr"$i20|$r20"o;           # composite rules: any one of the listed operand forms
+my $cr20    = qr"$c20|$r20"o;
+my $icr20   = qr"$i20|$c20|$r20"o;
+my $fcr20   = qr"$f20|$c20|$r20"o;
+my $cr39    = qr"$c20s39|$r39"o;
+my $dr20    = qr"$d20|$r20"o;
+
+# Instruction specific rules for capturing various flags (dotted suffixes that follow the op name)
+my $u32   = qr"(?<U32>\.U32)?";
+my $ftz   = qr"(?<FTZ>\.FTZ)?";
+my $sat   = qr"(?<SAT>\.SAT)?";
+my $rnd   = qr"(?:\.(?<rnd>RN|RM|RP|RZ))?";
+my $round = qr"(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?";
+my $fcmp  = qr"(?<cmp>\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)"; # capture includes the dot; the trailing empty alternative allows no suffix
+my $icmp  = qr"\.(?<cmp>LT|EQ|LE|GT|NE|GE)"; # suffix is mandatory; capture excludes the dot
+my $bool  = qr"\.(?<bool>AND|OR|XOR|PASS_B)";
+my $bool2 = qr"\.(?<bool2>AND|OR|XOR)";
+my $func  = qr"\.(?<func>COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)";
+my $rro   = qr"\.(?<func>SINCOS|EX2)"; # shares the func capture name with $func
+my $add3  = qr"(?:\.(?<type>X|RS|LS))?";
+my $lopz  = qr"(?:\.(?<z>NZ|Z) $p48,|(?<noz>))"o; # either .NZ/.Z followed by a predicate and comma, or the empty noz marker capture
+my $X     = qr"(?<X>\.X)?";
+my $tld   = qr"(?<NODEP>NODEP\.)?(?:(?<reuse1>T)|(?<reuse2>P))"; # T is captured as reuse1, P as reuse2
+my $chnls = qr"(?<chnls>R|RGBA)";
+my $sr    = qr"SR_(?<sr>\S+)"; # captures everything after SR_ up to the next whitespace
+my $shf   = qr"(?<W>\.W)?(?:\.(?<type>U64|S64))?(?<HI>\.HI)?";
+my $xmad  = qr"(?:\.(?<type1>U16|S16))?(?:\.(?<type2>U16|S16))?(?:\.(?<mode>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?";
+my $xmadc = qr"(?:\.(?<type1>U16|S16))?(?:\.(?<type2>U16|S16))?(?:\.(?<modec>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?"; # same as $xmad except the mode is captured as modec
+my $vmad8 = qr"\.(?<sign1>[SU])(?<size1>8|16)\.(?<sign2>[SU])(?<size2>8|16)(?<PO>\.PO)?(?<SHR_7>\.SHR_7)?(?<SHR_15>\.SHR_15)?(?<SAT>\.SAT)?";
+my $vmad16= qr"\.(?<sign1>[SU])(?<size1>16)\.(?<sign2>[SU])(?<size2>16)";
+my $hilo  = qr"(?:\.(?<mode>XHI|XLO))?";
+my $vaddType = qr"(?:\.(?<UD>UD))?(?:\.(?<SD>SD))?(?:\.(?<sign1>[SU])(?<size1>8|16|32))?(?:\.(?<sign2>[SU])(?<size2>8|16|32))?";
+my $vaddMode = qr"(?:\.(?<mode>MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?";
+my $vmnmx = qr"(?:\.(?<MX>MX))?";
+my $x2x   = qr"\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)"; # destination type first, then source type
+my $prmt  = qr"(?:\.(?<mode>F4E|B4E|RC8|ECL|ECR|RC16))?";
+my $shfl  = qr"\.(?<mode>IDX|UP|DOWN|BFLY)";
+my $bar   = qr"\.(?<mode>SYNC|ARV|RED)(?:\.(?<red>POPC|AND|OR))? (?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?(<r20>)|(?<nor20>))(?(<red>), $p39|(?<nop39>))"o; # conditionals: nor20 is set when r20 did not match; a $p39 operand is required only when red matched, otherwise nop39 is set
+my $b2r   = qr"\.RESULT $r0(?:, $p45|(?<nop45>))"o; # nop45 marker capture is set when no predicate operand follows
+my $dbar  = qr"(?<SB>SB0|SB1|SB2|SB3|SB4|SB5)";
+my $dbar2 = qr" \{(?<db5>5)?,?(?<db4>4)?,?(?<db3>3)?,?(?<db2>2)?,?(?<db1>1)?,?(?<db0>0)?\}"; # DEPBAR list form such as {5,3,0}: digits in descending order, each optional; braces are escaped because an unescaped literal left brace in a regex is deprecated since Perl 5.22 and rejected by some later releases
+my $mbar  = qr"\.(?<mode>CTA|GL|SYS)";
+my $addr  = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i20w24)?\]"o; # bracketed address: optional register (nor8 marker capture when absent) and optional i20w24 offset
+my $addr2 = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i28w20)?\]"o; # same as $addr but the offset is captured as i28w20
+my $ldc   = qr"c\[(?<c36>$hex)\]\s*$addr"o; # c[hex] followed by a bracketed $addr
+my $atom  = qr"(?<E>\.E)?(?:\.(?<mode>ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?<type>|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)"; # mode is mandatory; the leading empty alternative in type allows no type suffix
+my $vote  = qr"\.(?<mode>ALL|ANY|EQ)"o;
+my $memType  = qr"(?<type>\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)"; # capture includes the dot; the empty alternative allows no type suffix
+my $memCache = qr"(?<E>\.E)?(?<U>\.U)?(?:\.(?<cache>CG|CI|CS|CV|IL|WT))?";
+
+
+
+# class: hardware resource that shares characteristics with types
+# lat  : pipeline depth where relevant, placeholder for memory ops
+# blat : barrier latency, typical fetch time for memory operations. Highly variable.
+# rlat : operand read latency for memory ops
+# rhold: clock cycles that a memory op typically holds onto a register before it's free to be written by another op.
+# tput : throughput, clock cycles an op takes when two ops of the same class are issued in succession.
+# dual : whether this instruction type can be dual issued
+# reuse: whether this instruction type accepts register reuse flags.
+
+# Some of these values are guesses and need to be updated from micro benchmarks.
+# We may need to split these classes up further.
+my $s2rT  = {class => 's2r',   lat => 2,   blat => 25,  rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+my $smemT = {class => 'mem',   lat => 2,   blat => 30,  rlat => 2, rhold => 20, tput => 1,   dual => 1, reuse => 0}; # shares class 'mem' with $gmemT; differs in blat and rlat
+my $gmemT = {class => 'mem',   lat => 2,   blat => 200, rlat => 4, rhold => 20, tput => 1,   dual => 1, reuse => 0};
+my $x32T  = {class => 'x32',   lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 1};
+my $x64T  = {class => 'x64',   lat => 2,   blat => 128, rlat => 0, rhold => 0,  tput => 128, dual => 0, reuse => 1};
+my $shftT = {class => 'shift', lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1};
+my $cmpT  = {class => 'cmp',   lat => 13,  blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1};
+my $qtrT  = {class => 'qtr',   lat => 8,   blat => 0,   rlat => 4, rhold => 0,  tput => 1,   dual => 1, reuse => 0};
+my $rroT  = {class => 'rro',   lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+my $voteT = {class => 'vote',  lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+
+
+# Create map of op names to rules: each op maps to a list of { type, code, rule } entries, where rule is an anchored regex for one operand form of the instruction text
+our %grammar =
+(
+    #Floating Point Instructions
+    FADD     => [ { type => $x32T,  code => 0x5c58000000000000, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $fcr20;"o,               } ],
+    FADD32I  => [ { type => $x32T,  code => 0x0800000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FCHK     => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o,                     } ], #Partial?
+    FCMP     => [ { type => $cmpT,  code => 0x5ba0000000000000, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $fcr20, $r39;"o,            } ],
+    FFMA     => [
+                  { type => $x32T,  code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $fcr20, $r39;"o,         },
+                  { type => $x32T,  code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o,     }, # NOTE(review): same code value as the form above, unlike the XMAD forms - confirm
+                ],
+    FMNMX    => [ { type => $shftT, code => 0x5c60000000000000, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $fcr20, $p39;"o,                } ],
+    FMUL     => [ { type => $x32T,  code => 0x5c68000000000000, rule => qr"^$pred?FMUL$ftz$rnd$sat $r0, $r8, $fcr20;"o,               } ],
+    FMUL32I  => [ { type => $x32T,  code => 0x1e00000000000000, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FSET     => [ { type => $shftT, code => 0x5800000000000000, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $fcr20, $p39;"o,       } ],
+    FSETP    => [ { type => $cmpT,  code => 0x5bb0000000000000, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ],
+    MUFU     => [ { type => $qtrT,  code => 0x5080000000000000, rule => qr"^$pred?MUFU$func $r0, $r8;"o,                              } ],
+    RRO      => [ { type => $rroT,  code => 0x5c90000000000000, rule => qr"^$pred?RRO$rro $r0, $r20;"o,                               } ],
+    DADD     => [ { type => $x64T,  code => 0x5c70000000000000, rule => qr"^$pred?DADD$rnd $r0, $r8, $dr20;"o,                        } ],
+    DFMA     => [ { type => $x64T,  code => 0x5b70000000000000, rule => qr"^$pred?DFMA$rnd $r0, $r8, $dr20, $r39;"o,                  } ],
+    DMNMX    => [ { type => $cmpT,  code => 0x5c50000000000000, rule => qr"^$pred?DMNMX $r0, $r8, $dr20, $p39;"o,                     } ],
+    DMUL     => [ { type => $x64T,  code => 0x5c80000000000000, rule => qr"^$pred?DMUL$rnd $r0, $r8, $dr20;"o,                        } ],
+    DSET     => [ { type => $cmpT,  code => 0x5900000000000000, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o,            } ],
+    DSETP    => [ { type => $cmpT,  code => 0x5b80000000000000, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o,      } ],
+    FSWZADD  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o,                                    } ], #TODO
+
+    HADD2     => [ { type => $x32T,  code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o,               } ],
+    HMUL2     => [ { type => $x32T,  code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o,               } ],
+    HFMA2     => [ { type => $x32T,  code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o,         } ],
+    HSETP2    => [ { type => $cmpT,  code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial
+
+    #Integer Instructions
+    BFE       => [ { type => $shftT,  code => 0x5c01000000000000, rule => qr"^$pred?BFE$u32 $r0, $r8, $icr20;"o,                          } ],
+    BFI       => [ { type => $shftT,  code => 0x5bf0000000000000, rule => qr"^$pred?BFI $r0, $r8, $ir20, $cr39;"o,                        } ],
+    FLO       => [ { type => $s2rT,   code => 0x5c30000000000000, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o,                              } ],
+    IADD      => [ { type => $x32T,   code => 0x5c10000000000000, rule => qr"^$pred?IADD$sat$X $r0cc, $r8, $icr20;"o,                         } ],
+    IADD32I   => [ { type => $x32T,   code => 0x1c00000000000000, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o,                         } ],
+    IADD3     => [ { type => $x32T,   code => 0x5cc0000000000000, rule => qr"^$pred?IADD3$add3 $r0cc, $r8, $icr20, $r39;"o,                 } ],
+    ICMP      => [ { type => $cmpT,   code => 0x5b41000000000000, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $icr20, $r39;"o,              } ],
+    IMNMX     => [ { type => $shftT,  code => 0x5c21000000000000, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $icr20, $p39;"o,                  } ],
+    ISET      => [ { type => $shftT,  code => 0x5b51000000000000, rule => qr"^$pred?ISET$icmp$u32$X$bool $r0, $r8, $icr20, $p39;"o,       } ],
+    ISETP     => [ { type => $cmpT,   code => 0x5b61000000000000, rule => qr"^$pred?ISETP$icmp$u32$X$bool $p3, $p0, $r8, $icr20, $p39;"o, } ],
+    ISCADD    => [ { type => $shftT,  code => 0x5c18000000000000, rule => qr"^$pred?ISCADD $r0, $r8, $icr20, $i39w8;"o,                   } ],
+    ISCADD32I => [ { type => $shftT,  code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o,               } ],
+    LEA       => [
+                   { type => $cmpT,   code => 0x5bd0000000000000, rule => qr"^$pred?LEA $p48, $r0cc, $r8, $icr20;"o,                      },
+                   { type => $shftT,  code => 0x5bd7000000000000, rule => qr"^$pred?LEA $r0cc, $r8, $icr20, $i39w8;"o,                    },
+                   { type => $shftT,  code => 0x5bdf004000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $r20, $r39, $i28w8;"o,          },
+                   { type => $shftT,  code => 0x0a07000000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $c20, $r39, $i51w5;"o,          },
+                 ],
+    LOP       => [ { type => $x32T,   code => 0x5c40000000000000, rule => qr"^$pred?LOP$bool$lopz $r0, $r8, (?<INV>~)?$icr20(?<INV>\.INV)?;"o, } ], # INV is captured from either a ~ prefix or a .INV suffix
+    LOP32I    => [ { type => $x32T,   code => 0x0400000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o,                     } ],
+    LOP3      => [
+                   { type => $x32T,   code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o,            },
+                   { type => $x32T,   code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o,            },
+                 ],
+    POPC      => [ { type => $s2rT,   code => 0x5c08000000000000, rule => qr"^$pred?POPC $r0, $r20;"o,                                    } ],
+    SHF       => [
+                   { type => $shftT,  code => 0x5bf8000000000000, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $ir20, $r39;"o,                  },
+                   { type => $shftT,  code => 0x5cf8000000000000, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $ir20, $r39;"o,                  },
+                 ],
+    SHL       => [ { type => $shftT,  code => 0x5c48000000000000, rule => qr"^$pred?SHL(?<W>\.W)? $r0, $r8, $icr20;"o,                    } ],
+    SHR       => [ { type => $shftT,  code => 0x5c29000000000000, rule => qr"^$pred?SHR$u32 $r0, $r8, $icr20;"o,                          } ],
+    XMAD      => [
+                   { type => $x32T,   code => 0x5b00000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $ir20, $r39;"o,                 },
+                   { type => $x32T,   code => 0x5900000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $r39s20, $c20s39;"o,            },
+                   { type => $x32T,   code => 0x5e00000000000000, rule => qr"^$pred?XMAD$xmadc $r0cc, $r8, $c20x, $r39;"o,                  },
+                 ],
+    # XMAD replaces these; the catch-all rules below accept anything up to the semicolon and carry a zero code
+    IMAD      => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMAD[^;]*;"o,   } ], #TODO
+    IMADSP    => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO
+    IMUL      => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMUL[^;]*;"o,   } ], #TODO
+
+    #Conversion Instructions
+    F2F => [ { type => $qtrT,  code => 0x5ca8000000000000, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ],
+    F2I => [ { type => $qtrT,  code => 0x5cb0000000000000, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o,         } ],
+    I2F => [ { type => $qtrT,  code => 0x5cb8000000000000, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o,               } ],
+    I2I => [ { type => $qtrT,  code => 0x5ce0000000000000, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o,               } ],
+
+    #Movement Instructions
+    MOV    => [ { type => $x32T,  code => 0x5c98078000000000, rule => qr"^$pred?MOV $r0, $icr20;"o,                   } ],
+    MOV32I => [ { type => $x32T,  code => 0x010000000000f000, rule => qr"^$pred?MOV32I $r0, (?:$i20w32|$f20w32);"o,   } ],
+    PRMT   => [ { type => $x32T,  code => 0x5bc0000000000000, rule => qr"^$pred?PRMT$prmt $r0, $r8, $icr20, $cr39;"o, } ],
+    SEL    => [ { type => $x32T,  code => 0x5ca0000000000000, rule => qr"^$pred?SEL $r0, $r8, $icr20, $p39;"o,        } ],
+    SHFL   => [ { type => $smemT, code => 0xef10000000000000, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ],
+
+    #Predicate/CC Instructions
+    PSET   => [ { type => $cmpT,  code => 0x5088000000000000, rule => qr"^$pred?PSET$bool2$bool $r0, $p12, $p29, $p39;"o,       } ],
+    PSETP  => [ { type => $cmpT,  code => 0x5090000000000000, rule => qr"^$pred?PSETP$bool2$bool $p3, $p0, $p12, $p29, $p39;"o, } ],
+    CSET   => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o,  } ], #TODO
+    CSETP  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO
+    P2R    => [ { type => $x32T,  code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o,   } ],
+    R2P    => [ { type => $cmpT,  code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o,   } ],
+
+    #Texture Instructions
+    # Handle the commonly used 1D texture functions.. but save the others for later
+    TLD    => [ { type => $gmemT, code => 0xdd38000000000000, rule => qr"^$pred?TLD\.B\.LZ\.$tld $r0, $r8, $r20, $hex, \dD, $i31w4;"o, } ], #Partial
+    TLDS   => [ { type => $gmemT, code => 0xda0000000ff00000, rule => qr"^$pred?TLDS\.LZ\.$tld $r28, $r0, $r8, $i36w20, \dD, $chnls;"o,} ], #Partial
+    TEX    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o,   } ], #TODO
+    TLD4   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o,  } ], #TODO
+    TXQ    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o,   } ], #TODO
+    TEXS   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEXS[^;]*;"o,  } ], #TODO
+    TLD4S  => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4S[^;]*;"o, } ], #TODO
+
+    #Compute Load/Store Instructions
+    LD     => [ { type => $gmemT, code => 0x8000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr, $p58;"o,      } ],
+    ST     => [ { type => $gmemT, code => 0xa000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0, $p58;"o,      } ],
+    LDG    => [ { type => $gmemT, code => 0xeed0000000000000, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o,           } ],
+    STG    => [ { type => $gmemT, code => 0xeed8000000000000, rule => qr"^$pred?STG$memCache$memType $addr, $r0;"o,           } ],
+    LDS    => [ { type => $smemT, code => 0xef48000000000000, rule => qr"^$pred?LDS$memCache$memType $r0, $addr;"o,           } ],
+    STS    => [ { type => $smemT, code => 0xef58000000000000, rule => qr"^$pred?STS$memCache$memType $addr, $r0;"o,           } ],
+    LDL    => [ { type => $gmemT, code => 0xef40000000000000, rule => qr"^$pred?LDL$memCache$memType $r0, $addr;"o,           } ],
+    STL    => [ { type => $gmemT, code => 0xef50000000000000, rule => qr"^$pred?STL$memCache$memType $addr, $r0;"o,           } ],
+    LDC    => [ { type => $gmemT, code => 0xef90000000000000, rule => qr"^$pred?LDC$memCache$memType $r0, $ldc;"o,            } ],
+    # Note for ATOM(S).CAS operations the last register needs to be in sequence with the second to last (as it's not encoded).
+    ATOM   => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o,   } ],
+    ATOMS  => [ { type => $smemT, code => 0xec00000000000000, rule => qr"^$pred?ATOMS$atom $r0, $addr2, $r20(?:, $r39a)?;"o,  } ],
+    RED    => [ { type => $gmemT, code => 0xebf8000000000000, rule => qr"^$pred?RED$atom $addr2, $r0;"o,                      } ],
+    CCTL   => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o,  } ], #TODO NOTE(review): code is the same value as FCHK's - confirm
+    CCTLL  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO NOTE(review): code is the same value as FCHK's - confirm
+    CCTLT  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLT[^;]*;"o, } ], #TODO NOTE(review): code is the same value as FCHK's - confirm
+
+    #Surface Memory Instructions (haven't gotten to these yet..)
+    SUATOM => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUATOM[^;]*;"o, } ], #TODO
+    SULD   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o,   } ], #TODO
+    SURED  => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SURED[^;]*;"o,  } ], #TODO
+    SUST   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUST[^;]*;"o,   } ], #TODO
+
+    #Control Instructions
+    BRA    => [
+                { type => $x32T, code => 0xe24000000000000f, rule => qr"^$pred?BRA(?<U>\.U)? $i20w24;"o,         },
+                { type => $x32T, code => 0xe240000000000002, rule => qr"^$pred?BRA(?<U>\.U)? CC\.EQ, $i20w24;"o, },
+              ],
+    BRX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o,                      } ], #TODO
+    JMP    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMP[^;]*;"o,                      } ], #TODO
+    JMX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o,                      } ], #TODO
+    SSY    => [ { type => $x32T, code => 0xe290000000000000, rule => qr"^$noPred?SSY $i20w24;"o,                 } ],
+    SYNC   => [ { type => $x32T, code => 0xf0f800000000000f, rule => qr"^$pred?SYNC;"o,                          } ],
+    CAL    => [ { type => $x32T, code => 0xe260000000000040, rule => qr"^$noPred?CAL $i20w24;"o,                 } ],
+    JCAL   => [ { type => $x32T, code => 0xe220000000000040, rule => qr"^$noPred?JCAL $i20w24;"o,                } ],
+    PRET   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o,                     } ], #TODO
+    RET    => [ { type => $x32T, code => 0xe32000000000000f, rule => qr"^$pred?RET;"o,                           } ],
+    BRK    => [ { type => $x32T, code => 0xe34000000000000f, rule => qr"^$pred?BRK;"o,                           } ],
+    PBK    => [ { type => $x32T, code => 0xe2a0000000000000, rule => qr"^$noPred?PBK $i20w24;"o,                 } ],
+    CONT   => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o,                          } ],
+    PCNT   => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o,                } ],
+    EXIT   => [ { type => $x32T, code => 0xe30000000000000f, rule => qr"^$pred?EXIT;"o,                          } ],
+    PEXIT  => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PEXIT[^;]*;"o,                    } ], #TODO
+    BPT    => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o,           } ],
+
+    #Miscellaneous Instructions
+    NOP    => [ { type => $x32T,  code => 0x50b0000000000f00, rule => qr"^$pred?NOP;"o,                                     } ],
+    CS2R   => [ { type => $x32T,  code => 0x50c8000000000000, rule => qr"^$pred?CS2R $r0, $sr;"o,                           } ],
+    S2R    => [ { type => $s2rT,  code => 0xf0c8000000000000, rule => qr"^$pred?S2R $r0, $sr;"o,                            } ],
+    B2R    => [ { type => $x32T,  code => 0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o,                                 } ],
+    BAR    => [ { type => $gmemT, code => 0xf0a8000000000000, rule => qr"^$pred?BAR$bar;"o,                                 } ],
+    DEPBAR => [
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, },
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o,              },
+              ],
+    MEMBAR => [ { type => $x32T,  code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o,                             } ],
+    VOTE   => [ { type => $voteT, code => 0x50d8000000000000, rule => qr"^$pred?VOTE$vote (?:$r0, |(?<nor0>))$p45, $p39;"o, } ],
+    R2B    => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?R2B[^;]*;"o,                                } ], #TODO
+
+    #Video Instructions... Need to finish
+    VADD   => [   { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMAD   => [
+                  { type => $x32T,  code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, },
+                  { type => $shftT, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, },
+              ],
+    VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 NOTE(review): trailing value matches VADD's code, not this entry's - confirm
+    VMNMX    => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 NOTE(review): trailing value matches VADD's code, not this entry's - confirm
+
+    VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000 NOTE(review): trailing value matches VADD's code, not this entry's - confirm
+);
+
+# Create map of capture groups to op code flags that need to be added (or removed)
+my @flags = grep /\S/, split "\n", q{;
+
+BFE, BFI, FLO, IADD, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD
+0x0100000000000000 neg
+
+FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP
+0x0100000000000000 neg
+
+PSET, PSETP
+0x0000000000008000 p12not
+0x0000000100000000 p29not
+
+FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE
+0x0000040000000000 p39not
+
+IADD, IADD3, XMAD, LEA, IMNMX
+0x0000800000000000 CC
+
+IADD32I
+0x0010000000000000 CC
+
+LEA
+0x0000000000000000 X
+
+SHF
+0x0004000000000000 W
+0x0001000000000000 HI
+
+SHF: type
+0x0000004000000000 U64
+0x0000006000000000 S64
+
+SHR, IMNMX, ISETP, ISET, ICMP, BFE
+0x0001000000000000 U32
+
+SHL
+0x0000008000000000 W
+
+SHFL
+0x0000000010000000 i20w8
+0x0000000020000000 i34w13
+
+SHFL: mode
+0x0000000000000000 IDX
+0x0000000040000000 UP
+0x0000000080000000 DOWN
+0x00000000c0000000 BFLY
+
+IMNMX: mode
+0x0000080000000000 XLO
+0x0000180000000000 XHI
+
+ISETP, ISET, ICMP: cmp
+0x0002000000000000 LT
+0x0004000000000000 EQ
+0x0006000000000000 LE
+0x0008000000000000 GT
+0x000a000000000000 NE
+0x000c000000000000 GE
+
+ISETP, ISET, PSETP, PSET: bool
+0x0000000000000000 AND
+0x0000200000000000 OR
+0x0000400000000000 XOR
+
+PSETP, PSET: bool2
+0x0000000000000000 AND
+0x0000000001000000 OR
+0x0000000002000000 XOR
+
+ISETP, ISET
+0x0000080000000000 X
+
+LOP: bool
+0x0000000000000000 AND
+0x0000020000000000 OR
+0x0000040000000000 XOR
+0x0000060000000000 PASS_B
+
+LOP:
+0x0000010000000000 INV
+
+LOP: z
+0x0000200000000000 Z
+0x0000300000000000 NZ
+
+LOP
+0x0007000000000000 noz
+
+LOP32I: bool
+0x0000000000000000 AND
+0x0020000000000000 OR
+0x0040000000000000 XOR
+
+PRMT: mode
+0x0001000000000000 F4E
+0x0002000000000000 B4E
+0x0003000000000000 RC8
+0x0004000000000000 ECL
+0x0005000000000000 ECR
+0x0006000000000000 RC16
+
+XMAD: type1
+0x0000000000000000 U16
+0x0001000000000000 S16
+
+XMAD: type2
+0x0000000000000000 U16
+0x0002000000000000 S16
+
+XMAD: mode
+0x0000002000000000 MRG
+0x0000001000000000 PSL
+0x0008000000000000 CHI
+0x0004000000000000 CLO
+0x000c000000000000 CSFU
+
+XMAD: modec
+0x0004000000000000 CLO
+0x0008000000000000 CHI
+0x000c000000000000 CSFU
+0x0040000000000000 X
+0x0080000000000000 PSL
+0x0100000000000000 MRG
+
+XMAD
+0x0010000000000000 CBCC
+
+XMAD: r8part
+0x0000000000000000 H0
+0x0020000000000000 H1
+
+XMAD: r20part
+0x0000000000000000 H0
+0x0000000800000000 H1
+
+XMAD: r20partx
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+XMAD: r39part
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part
+0x0000000000000000 B0
+0x0000001000000000 B1
+0x0000002000000000 B2
+0x0000003000000000 B3
+0x0000001000000000 H1
+0x0000000000000000 H0
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part
+0x0000000000000000 B0
+0x0000000010000000 B1
+0x0000000020000000 B2
+0x0000000030000000 B3
+0x0000000010000000 H1
+0x0000000000000000 H0
+
+VMAD
+0x0040000000000000 r8neg
+0x0020000000000000 r39neg
+0x0008000000000000 SHR_7
+0x0010000000000000 SHR_15
+0x0060000000000000 PO
+0x0080000000000000 SAT
+
+VMNMX
+0x0100000000000000 MX
+
+VADD, VABSDIFF, VMNMX
+0x0080000000000000 SAT
+0x0040000000000000 UD
+0x0040000000000000 SD
+
+VSET: cmp
+0x0040000000000000 LT
+0x0080000000000000 EQ
+0x00c0000000000000 LE
+0x0100000000000000 GT
+0x0140000000000000 NE
+0x0180000000000000 GE
+
+VADD, VSET: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VABSDIFF: mode
+0x0003000000000000 ACC
+0x000b000000000000 MIN
+0x0013000000000000 MAX
+0x0023000000000000 MRG_16H
+0x002b000000000000 MRG_16L
+0x0033000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x003b000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMNMX: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1
+0x0000000000000000 U
+0x0001000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign2
+0x0000000000000000 U
+0x0002000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size1
+0x0000000000000000 8
+0x0000004000000000 16
+0x0000006000000000 32
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size2
+0x0000000000000000 8
+0x0000000040000000 16
+0x0000000060000000 32
+
+IADD3: type
+0x0001000000000000 X
+0x0000002000000000 RS
+0x0000004000000000 LS
+
+IADD3: r8part
+0x0000000000000000 H0
+0x0000001000000000 H1
+
+IADD3: r20part
+0x0000000080000000 H0
+
+IADD3: r39part
+0x0000000200000000 H0
+
+IADD3
+0x0008000000000000 r8neg
+0x0004000000000000 r20neg
+0x0002000000000000 r39neg
+
+IADD
+0x0000080000000000 X
+0x0004000000000000 SAT
+
+IADD, ISCADD
+0x0002000000000000 r8neg
+0x0001000000000000 r20neg
+
+IADD32I
+0x0100000000000000 r8neg
+0x0020000000000000 X
+
+DEPBAR: SB
+0x0000000000000000 SB0
+0x0000000004000000 SB1
+0x0000000008000000 SB2
+0x000000000c000000 SB3
+0x0000000010000000 SB4
+0x0000000014000000 SB5
+
+DEPBAR: cmp
+0x0000000020000000 LE
+
+DEPBAR
+0x0000000000000001 db0
+0x0000000000000002 db1
+0x0000000000000004 db2
+0x0000000000000008 db3
+0x0000000000000010 db4
+0x0000000000000020 db5
+
+F2F, F2I, I2F, I2I: destWidth
+0x0000000000000000 8
+0x0000000000000100 16
+0x0000000000000200 32
+0x0000000000000300 64
+
+F2F, F2I, I2F, I2I: srcWidth
+0x0000000000000000 8
+0x0000000000000400 16
+0x0000000000000800 32
+0x0000000000000c00 64
+
+F2F, F2I, I2F, I2I: destSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000001000 S
+
+F2F, F2I, I2F, I2I: srcSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000002000 S
+
+F2I, I2F, I2I: r20part
+0x0000000000000000 H0
+0x0000040000000000 H1
+0x0000000000000000 B0
+0x0000020000000000 B1
+0x0000040000000000 B2
+0x0000060000000000 B3
+
+F2F: r20part
+0x0000000000000000 H0
+0x0000020000000000 H1
+
+F2F: round
+0x0000040000000000 ROUND
+0x0000048000000000 FLOOR
+0x0000050000000000 CEIL
+0x0000058000000000 TRUNC
+
+F2I: round
+0x0000000000000000 ROUND
+0x0000008000000000 FLOOR
+0x0000010000000000 CEIL
+0x0000018000000000 TRUNC
+
+HADD2, HMUL2: r8part
+0x0001000000000000 H0_H0
+0x0000000000000000 H1_H1
+
+HFMA2: r20part
+0x0000000020000000 H0_H0
+0x0000000030000000 H1_H1
+
+FADD, DADD, FMUL, DMUL, F2F, I2F: rnd
+0x0000000000000000 RN
+0x0000008000000000 RM
+0x0000010000000000 RP
+0x0000018000000000 RZ
+
+DFMA: rnd
+0x0000000000000000 RN
+0x0004000000000000 RM
+0x0008000000000000 RP
+0x000c000000000000 RZ
+
+FFMA: rnd
+0x0000000000000000 RN
+0x0008000000000000 RM
+0x0010000000000000 RP
+0x0018000000000000 RZ
+
+FFMA
+0x0020000000000000 FTZ
+
+F2F, F2I, FADD, FMUL, FMNMX
+0x0000100000000000 FTZ
+
+FADD32I
+0x0080000000000000 FTZ
+
+FMUL32I
+0x0020000000000000 FTZ
+
+FSET
+0x0080000000000000 FTZ
+
+FSETP, FCMP
+0x0000800000000000 FTZ
+
+HADD2, HMUL2
+0x0000008000000000 FTZ
+
+HFMA2
+0x0000002000000000 FTZ
+
+FADD, FFMA, FMUL, F2F, I2I
+0x0004000000000000 SAT
+
+FADD, DADD, FMNMX, DMNMX, MUFU
+0x0001000000000000 r8neg
+
+FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I
+0x0000200000000000 r20neg
+
+FMUL, DMUL, FFMA, DFMA
+0x0001000000000000 r20neg
+
+FFMA, DFMA
+0x0002000000000000 r39neg
+
+FADD, DADD, FMNMX, DMNMX
+0x0000400000000000 r8abs
+
+FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I
+0x0002000000000000 r20abs
+
+FSETP, DSETP, FSET, DSET
+0x0000080000000000 r8neg
+0x0000000000000040 r20neg
+0x0000000000000080 r8abs
+0x0000100000000000 r20abs
+
+RRO: func
+0x0000000000000000 SINCOS
+0x0000008000000000 EX2
+
+MUFU: func
+0x0000000000000000 COS
+0x0000000000100000 SIN
+0x0000000000200000 EX2
+0x0000000000300000 LG2
+0x0000000000400000 RCP
+0x0000000000500000 RSQ
+0x0000000000600000 RCP64H
+0x0000000000700000 RSQ64H
+
+FSETP, DSETP, FSET, DSET, FCMP: cmp
+0x0001000000000000 .LT
+0x0002000000000000 .EQ
+0x0003000000000000 .LE
+0x0004000000000000 .GT
+0x0004000000000000
+0x0005000000000000 .NE
+0x0006000000000000 .GE
+0x0007000000000000 .NUM
+0x0008000000000000 .NAN
+0x0009000000000000 .LTU
+0x000a000000000000 .EQU
+0x000b000000000000 .LEU
+0x000c000000000000 .GTU
+0x000d000000000000 .NEU
+0x000e000000000000 .GEU
+
+FSETP, DSETP, FSET, DSET: bool
+0x0000000000000000 AND
+0x0000200000000000 OR
+0x0000400000000000 XOR
+
+HSETP2: cmp
+0x0000002800000000 .NE
+
+HSETP2: bool
+0x0000000000000000 AND
+
+S2R: sr
+0x0000000000000000 LANEID
+0x0000000000200000 VIRTCFG
+0x0000000000300000 VIRTID
+0x0000000002100000 TID.X
+0x0000000002200000 TID.Y
+0x0000000002300000 TID.Z
+0x0000000002500000 CTAID.X
+0x0000000002600000 CTAID.Y
+0x0000000002700000 CTAID.Z
+0x0000000003800000 EQMASK
+0x0000000003900000 LTMASK
+0x0000000003a00000 LEMASK
+0x0000000003b00000 GTMASK
+0x0000000003c00000 GEMASK
+
+CS2R: sr
+0x0000000005000000 CLOCKLO
+0x0000000005100000 CLOCKHI
+0x0000000005200000 GLOBALTIMERLO
+0x0000000005300000 GLOBALTIMERHI
+
+B2R
+0x0000e00000000000 nop45
+
+BAR
+0x0000100000000000 i8w4
+0x0000080000000000 nor20
+0x0000038000000000 nop39
+
+BAR: mode
+0x0000000000000000 SYNC
+0x0000000100000000 ARV
+0x0000000200000000 RED
+
+BAR: red
+0x0000000000000000 POPC
+0x0000000800000000 AND
+0x0000001000000000 OR
+
+MEMBAR: mode
+0x0000000000000000 CTA
+0x0000000000000100 GL
+0x0000000000000200 SYS
+
+VOTE: mode
+0x0000000000000000 ALL
+0x0001000000000000 ANY
+0x0002000000000000 EQ
+
+VOTE
+0x00000000000000ff nor0
+
+BRA
+0x0000000000000080 U
+
+TLDS: chnls
+0x0010000000000000 RGBA
+
+TLDS
+0x0002000000000000 NODEP
+
+LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS
+0x000000000000ff00 nor8
+
+LD, ST: type
+0x0000000000000000 .U8
+0x0020000000000000 .S8
+0x0040000000000000 .U16
+0x0060000000000000 .S16
+0x0080000000000000
+0x0080000000000000 .32
+0x00a0000000000000 .64
+0x00c0000000000000 .128
+
+LD, ST: cache
+0x0100000000000000 CG
+0x0200000000000000 CS
+0x0300000000000000 CV
+0x0300000000000000 WT
+
+LDG, STG, LDS, STS, LDL, STL, LDC: type
+0x0000000000000000 .U8
+0x0001000000000000 .S8
+0x0002000000000000 .U16
+0x0003000000000000 .S16
+0x0004000000000000
+0x0004000000000000 .32
+0x0005000000000000 .64
+0x0006000000000000 .128
+
+LDG, STG: cache
+0x0000400000000000 CG
+0x0000800000000000 CI
+0x0000800000000000 CS
+0x0000c00000000000 CV
+0x0000c00000000000 WT
+
+LDL: cache
+0x0000200000000000 CI
+
+LDC: cache
+0x0000100000000000 IL
+
+LDG, STG, LDS, STS, LDL, STL, LDC
+0x0000200000000000 E
+
+LDS
+0x0000100000000000 U
+
+RED: type
+0x0000000000000000
+0x0000000000100000 .S32
+0x0000000000200000 .U64
+0x0000000000300000 .F32.FTZ.RN
+0x0000000000400000 .F16x2.FTZ.RN
+0x0000000000500000 .S64
+
+RED: mode
+0x0000000000000000 ADD
+0x0000000000800000 MIN
+0x0000000001000000 MAX
+0x0000000001800000 INC
+0x0000000002000000 DEC
+0x0000000002800000 AND
+0x0000000003000000 OR
+0x0000000003800000 XOR
+
+ATOM: type
+0x0000000000000000
+0x0002000000000000 .S32
+0x0004000000000000 .U64
+0x0006000000000000 .F32.FTZ.RN
+0x0008000000000000 .F16x2.FTZ.RN
+0x000a000000000000 .S64
+0x0002000000000000 .64
+
+ATOM, RED
+0x0001000000000000 E
+
+ATOM: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x03f0000000000000 CAS
+
+ATOMS: type
+0x0000000000000000
+0x0000000010000000 .S32
+0x0000000020000000 .U64
+0x0000000030000000 .S64
+0x0010000000000000 .64
+
+ATOMS: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x0240000000000000 CAS
+};
+
+# The existence of a capture group can map directly to an op code adjustment, or...
+# the value of a named capture group can select the op code adjustment from among several options.
+our %flags;          # op => { flag => bits }   or   op => { group => { option => bits } }
+my (@ops, $flag);    # ops and (optional) group name taken from the most recent header line
+foreach my $line (@flags)
+{
+    if ($line =~ m'^(0x[0-9a-z]+)\s*(.*)')   # value line: "0x<bits> <name>" (the name may be empty)
+    {
+        my $val = hex($1);
+        # named rules (op: name): the option name selects one value inside the group
+        if ($flag)
+            { $flags{$_}{$flag}{$2} = $val foreach @ops; }
+        # simple existence check rules: the flag name maps straight to its bits
+        else
+            { $flags{$_}{$2}        = $val foreach @ops; }
+    }
+    else   # header line: "OP1, OP2[: group]" sets the ops (and group) for the value lines that follow
+    {
+        my ($ops, $name) = split ':\s*', $line;
+        @ops = split ',\s*', $ops;
+        $flag = $name;
+    }
+}
+
+sub parseInstruct
+{
+    # Match $text against the grammar's rule; return a copy of the named captures, or nothing.
+    my ($text, $gram) = @_;
+    $text =~ $gram->{rule} or return;
+    return +{ %+ };
+}
+
+# For immediate or constant operand forms of an op, bits 56-63 of the base code get transformed (see genCode).
+my %immedOps = map { $_ => 1 } qw(i20 f20 d20);   # capture names that are treated as immediate operands
+my %immedCodes =   # top byte of the base code => value XORed into that byte when an immediate is present
+(
+    0x5c => 0x64,
+    0x5b => 0x6d,
+    0x59 => 0x6b,
+    0x58 => 0x68,
+);
+my %constCodes =   # constant-operand capture name => value XORed into bits 56-63
+(
+    c20 => 0x10,
+    c39 => 0x08,
+);
+my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4);   # reuse capture name => bit in the reuse mask
+
+# Fold the reuse bit of every truthy reuseN capture into one mask; nothing else is examined
+sub genReuseCode
+{
+    my ($capData, $mask) = (shift, 0);
+    for my $slot (keys %reuseCodes)
+        { $mask |= $reuseCodes{$slot} if $capData->{$slot}; }
+    return $mask;
+}
+
+# Generate an op code from regex capture data.
+# If you pass in a test array ref it will be populated with the capture groups that were acted upon.
+sub genCode
+{
+    my ($op, $grammar, $capData, $test) = @_;
+    # Returns ($code, $reuse). NOTE: the predicate and reuse keys are deleted from the caller's $capData.
+    my $flags     = $flags{$op};
+    my $code      = $grammar->{code};
+    my $reuse     = 0;
+    my $immedCode = $immedCodes{$code >> 56};   # undef when the base code's top byte has no entry in %immedCodes
+
+    #print map "$_: $capData->{$_}\n", keys %capData if $op eq 'I2I';
+
+    # process the instruction predicate (if valid for this instruction)
+    if (exists $capData->{noPred})
+    {
+        delete $capData->{noPred};
+        push @$test, 'noPred' if $test;
+    }
+    else
+    {
+        my $p = defined($capData->{predNum}) ? $capData->{predNum} : 7;   # 7 is used when no predicate was captured
+        push @$test, 'predNum' if $test;
+        if (exists $capData->{predNot})
+        {
+            $p |= 8;   # bit 3 of the field marks a negated predicate
+            push @$test, 'predNot' if $test;
+        }
+        $code ^= $p << 16;   # the predicate field is XORed in at bit 16
+        delete @{$capData}{qw(predNum predNot)};
+
+    }
+    # process the register reuse flags (collected into $reuse, not into $code)
+    foreach my $rcode (qw(reuse1 reuse2 reuse3))
+    {
+        if (delete $capData->{$rcode})
+        {
+            $reuse |= $reuseCodes{$rcode};
+            push @$test, $rcode if $test;
+        }
+    }
+    # every capture that is left is an operand, a flag, or both
+    foreach my $capture (keys %$capData)
+    {
+        # change the base code for immediate versions of the op
+        if (exists $immedOps{$capture})
+            { $code ^= $immedCode << 56; }
+        # change the base code for constant versions of the op
+        elsif (exists $constCodes{$capture})
+            { $code ^= $constCodes{$capture} << 56; }
+
+        # if capture group is an operand then process and add that data to code
+        if (exists $operands{$capture})
+        {
+            # don't process the r20 that comes with the r39s20 capture
+            unless ($capture eq 'r20' && exists $capData->{r39s20})
+            {
+                $code ^= $operands{$capture}->($capData->{$capture});
+                push @$test, $capture if $test;
+            }
+        }
+
+        # Add matching flags (an operand might also add/remove a flag)
+        if (exists $flags->{$capture})
+        {
+            # a named multivalue flag: the captured text selects the value
+            if (ref $flags->{$capture})
+            {
+                $code ^= $flags->{$capture}{$capData->{$capture}};
+                push @$test, "$capture:$capData->{$capture}" if $test;
+            }
+            # a simple exists flag
+            else
+            {
+                $code ^= $flags->{$capture};
+                push @$test, $capture if $test;
+            }
+        }
+        elsif (!exists $operands{$capture} && !$test)
+        {
+            # Every capture group should be acted upon.  Missing one is a bug.
+            warn "UNUSED: $op: $capture: $capData->{$capture}\n";
+            warn Dumper($flags);
+        }
+    }
+
+    return $code, $reuse;
+}
+
+
+my $CtrlRe = qr'(?<ctrl>[0-9a-fA-F\-]{2}:[1-6\-]:[1-6\-]:[\-yY]:[0-9a-fA-F])';   # control prefix "wait:read:write:yield:stall" (decoded by readCtrl)
+my $PredRe = qr'(?<pred>@!?(?<predReg>P\d)\s+)';   # predicate "@P0 " or "@!P0 ", trailing whitespace included
+my $InstRe = qr"$PredRe?(?<op>\w+)(?<rest>[^;]*;)"o;   # optional predicate, op name, then everything up to and including ';'
+my $CommRe = qr'(?<comment>.*)';   # whatever follows the instruction on the same line
+
+sub processAsmLine
+{
+    # Parse one "<ctrl> [@pred] OP operands; comment" source line into a hash ref.
+    # A line that does not start with a control prefix yields undef.
+    my ($text, $lineNum) = @_;
+    return undef unless $text =~ m"^$CtrlRe(?<space>\s+)$InstRe$CommRe"o;
+
+    my %cap = %+;
+    return {
+        lineNum => $lineNum,
+        pred    => $cap{pred},
+        predReg => $cap{predReg},
+        space   => $cap{space},
+        op      => $cap{op},
+        comment => $cap{comment},
+        inst    => normalizeSpacing($cap{pred} . $cap{op} . $cap{rest}),
+        ctrl    => readCtrl($cap{ctrl}, $text),
+    };
+}
+
+sub processSassLine
+{
+    # Parse one "/*addr*/ [@pred] OP operands; /* 0xcode */" listing line; undef when it does not match.
+    my ($text) = @_;
+    return undef unless $text =~ m"^\s+/\*(?<num>[0-9a-f]+)\*/\s+$InstRe\s+/\* (?<code>0x[0-9a-f]+)"o;
+
+    my %cap  = %+;
+    my $body = $cap{op} . $cap{rest};
+    return {
+        num     => hex($cap{num}),
+        pred    => $cap{pred},
+        op      => $cap{op},
+        ins     => normalizeSpacing($body),
+        inst    => normalizeSpacing($cap{pred} . $body),
+        code    => hex($cap{code}),
+    };
+}
+
+sub processSassCtrlLine
+{
+    # Decode one "/* 0x... */" control word line.  Three control codes are appended to
+    # @$ctrl and three reuse codes to @$ruse (each only when an array ref was passed).
+    # Returns 1 when the line matched, 0 otherwise.
+    my ($line, $ctrl, $ruse) = @_;
+
+    my ($word) = $line =~ m'^\s+\/\* (0x[0-9a-f]+)' or return 0;
+    my $code = hex($word);
+
+    # The word holds three 21-bit groups at bit offsets 0, 21 and 42:
+    # the low 17 bits of a group are the control code, the next 4 bits the reuse code.
+    if (ref $ctrl)
+        { push @$ctrl, map { ($code >> $_) & 0x1ffff } 0, 21, 42; }
+
+    if (ref $ruse)
+        { push @$ruse, map { ($code >> $_) & 0xf } 17, 38, 59; }
+
+    # the line was a control word
+    return 1;
+}
+
+sub replaceXMADs
+{
+    my $file = shift;
+    # Expands the XMAD.LO / .LO2 / .LO2C macro lines found in the text $file and returns the new text.
+# XMAD.LO d, a, b, c, x;     (x receives the XMAD.MRG intermediate) expands to:
+# ----------------------
+# XMAD.MRG x, a, b.H1, RZ;
+# XMAD d, a, b, c;
+# XMAD.PSL.CBCC d, a.H1, x.H1, d;
+# ---------------------- and XMAD.LO2 d, a, 0xffff, c; expands to:
+# XMAD d, a, 0xffff, c;
+# XMAD.PSL d, a.H1, 0xffff, d;
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD\.LO\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<x>\w+)\s*;$CommRe/
+
+        die "XMAD.LO: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s
+%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s;
+%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;',
+                @+{qw(ctrl space pred d a b c x comment)}
+    /egmos;
+    # XMAD[.mod].LO2 d, a, b, c;   =>   XMAD[.mod] d, a, b, c;   XMAD[.mod].PSL d, a.H1, b, d;
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>-?$immed|\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/
+
+        die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+    # XMAD[.mod].LO2C d, a, b, c;   =>   XMAD[.mod] d, a, b, c;   XMAD[.mod].PSL d, a, b.H1, d;
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<c>\w+)\s*;$CommRe/
+
+        die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    #TODO: add more XMAD macros
+    return $file;
+}
+# collapse tabs and runs of whitespace into single spaces so our regexes can stay simpler
+sub normalizeSpacing
+{
+    my $text = shift;
+    $text =~ tr/\t/ /;
+    $text =~ s/\s\s+/ /g;
+    return $text;
+}
+
+
+# Render a binary control code in the easier to read "wait:read:write:yield:stall" notation.
+sub printCtrl
+{
+    my $code = shift;
+    # bit fields: stall 0-3, yield 4, write dependency barrier 5-7, read dependency barrier 8-10, wait mask 11-16
+    my $stall    =  $code        & 0xf;
+    my $yieldBit = ($code >> 4)  & 0x1;
+    my $writeBar = ($code >> 5)  & 0x7;
+    my $readBar  = ($code >> 8)  & 0x7;
+    my $waitMask = ($code >> 11) & 0x3f;
+    # a barrier value of 7 prints as '-', any other value prints 1-based; a zero wait mask prints as '--'
+    my $yieldTxt = $yieldBit ? '-' : 'Y';
+    my $writeTxt = $writeBar == 7 ? '-' : $writeBar + 1;
+    my $readTxt  = $readBar  == 7 ? '-' : $readBar  + 1;
+    my $waitTxt  = $waitMask ? sprintf('%02x', $waitMask) : '--';
+
+    return sprintf '%s:%s:%s:%s:%x', $waitTxt, $readTxt, $writeTxt, $yieldTxt, $stall;
+}
+sub readCtrl
+{
+    # Pack the textual "wait:read:write:yield:stall" notation into the binary control code.
+    # $context is only used to build the error message.
+    my ($ctrl, $context) = @_;
+    my ($waitTxt, $readTxt, $writeTxt, $yieldTxt, $stallTxt) = split ':', $ctrl;
+
+    # '-' / '--' placeholders become 7 (barriers) or 0 (wait mask); barriers are 1-based in text
+    my $waitMask = $waitTxt  eq '--' ? 0 : hex $waitTxt;
+    my $readBar  = $readTxt  eq '-'  ? 7 : $readTxt  - 1;
+    my $writeBar = $writeTxt eq '-'  ? 7 : $writeTxt - 1;
+    my $yieldBit = lc($yieldTxt) eq 'y' ? 0 : 1;
+    my $stall    = hex $stallTxt;
+
+    # the wait mask has to fit in six bits
+    if ($waitMask != ($waitMask & 0x3f))
+        { die sprintf('wait dep out of range(0x00-0x3f): %x at %s',   $waitMask, $context); }
+
+    return ($waitMask << 11) | ($readBar << 8) | ($writeBar << 5) | ($yieldBit << 4) | $stall;
+}
+
+sub getRegNum
+{
+    my ($regMap, $regName) = @_;   # yields the mapped value, or $regName itself when unmapped or mapped to a reference
+    return $regName unless exists $regMap->{$regName};
+    return ref($regMap->{$regName}) ? $regName : $regMap->{$regName};
+}
+
+sub getVecRegisters
+{
+    # List the registers covered by the r0 operand: one register normally, two when the
+    # type is .64 or i31w4 is 0x3, four when the type is .128 or i31w4 is 0xf.
+    my ($vectors, $capData) = @_;
+    my $regName = $capData->{r0} or return;
+    return if $regName eq 'RZ';
+
+    my $width = ($capData->{type} eq '.64'  || $capData->{i31w4} eq '0x3') ? 2
+              : ($capData->{type} eq '.128' || $capData->{i31w4} eq '0xf') ? 4
+              :                                                              1;
+    return $regName if $width == 1;
+
+    # numbered register Rn: the vector is simply Rn .. Rn+width-1
+    if ($regName =~ m'^R(\d+)$')
+        { return map "R$_", ($1 .. $1 + $width - 1); }
+
+    # named register: it has to be a declared vector of the right size
+    if ($width == 2)
+    {
+        confess "$regName not a 64bit vector register" unless exists $vectors->{$regName};
+        return @{$vectors->{$regName}}[0,1];
+    }
+
+    confess "$regName not a 128bit vector register" unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4;
+    return @{$vectors->{$regName}};
+}
+
+sub getAddrVecRegisters
+{
+    # List the registers of the r8 operand: a 64bit register pair when the E flag was captured.
+    my ($vectors, $capData) = @_;
+    my $regName = $capData->{r8} or return;
+    return if $regName eq 'RZ';
+
+    return $regName unless exists $capData->{E};
+
+    if ($regName =~ m'^R(\d+)$')
+        { return map "R$_", ($1 .. $1+1); }
+
+    unless (exists $vectors->{$regName})
+    {
+        print Dumper($vectors);
+        confess "$regName not a 64bit vector register";
+    }
+    return @{$vectors->{$regName}}[0,1];
+}
+
+__END__
+
+
+
diff --git a/Assembler/PascalAs/blib/lib/PascalAs/.exists b/Assembler/PascalAs/blib/lib/PascalAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/blib/lib/PascalAs/Cubin.pm b/Assembler/PascalAs/blib/lib/PascalAs/Cubin.pm
new file mode 100644
index 0000000..10bf9a8
--- /dev/null
+++ b/Assembler/PascalAs/blib/lib/PascalAs/Cubin.pm
@@ -0,0 +1,686 @@
+package PascalAs::Cubin;
+
+use strict;
+use Data::Dumper;
+
+my @Elf32_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    L   entry
+    L   phOffset
+    L   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+my @Elf64_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    Q   entry
+    Q   phOffset
+    Q   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);
+my @Elf32_PrgHdr = qw(
+    L   type
+    L   offset
+    L   vaddr
+    L   paddr
+    L   fileSize
+    L   memSize
+    L   flags
+    L   align
+);
+my @Elf64_PrgHdr = qw(
+    L   type
+    L   flags
+    Q   offset
+    Q   vaddr
+    Q   paddr
+    Q   fileSize
+    Q   memSize
+    Q   align
+);
+my @Elf32_SecHdr = qw(
+    L   name
+    L   type
+    L   flags
+    L   addr
+    L   offset
+    L   size
+    L   link
+    L   info
+    L   align
+    L   entSize
+);
+my @Elf64_SecHdr = qw(
+    L   name
+    L   type
+    Q   flags
+    Q   addr
+    Q   offset
+    Q   size
+    L   link
+    L   info
+    Q   align
+    Q   entSize
+);
+my @Elf32_SymEnt = qw(
+    L   name
+    L   value
+    L   size
+    C   info
+    C   other
+    S   shIndx
+);
+my @Elf64_SymEnt = qw(
+    L   name
+    C   info
+    C   other
+    S   shIndx
+    Q   value
+    Q   size
+);
+my @symBind = qw(LOCAL GLOBAL WEAK);
+
+# Split the Elf Header defs into pack/unpack template strings (T) and the corresponding hash key columns (C).
+my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC);
+# Index 1 holds the 32-bit layout, index 2 the 64-bit one; entries of <= 3 chars are pack codes, longer ones field names.
+$elfHdrT[1] = join '', grep { length($_) <= 3} @Elf32_Hdr;
+$prgHdrT[1] = join '', grep { length($_) <= 3} @Elf32_PrgHdr;
+$secHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SecHdr;
+$symHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SymEnt;
+
+$elfHdrT[2] = join '', grep { length($_) <= 3} @Elf64_Hdr;
+$prgHdrT[2] = join '', grep { length($_) <= 3} @Elf64_PrgHdr;
+$secHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SecHdr;
+$symHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SymEnt;
+
+$elfHdrC[1] = [ grep { length($_) > 3} @Elf32_Hdr    ];
+$prgHdrC[1] = [ grep { length($_) > 3} @Elf32_PrgHdr ];
+$secHdrC[1] = [ grep { length($_) > 3} @Elf32_SecHdr ];
+$symHdrC[1] = [ grep { length($_) > 3} @Elf32_SymEnt ];
+
+$elfHdrC[2] = [ grep { length($_) > 3} @Elf64_Hdr    ];
+$prgHdrC[2] = [ grep { length($_) > 3} @Elf64_PrgHdr ];
+$secHdrC[2] = [ grep { length($_) > 3} @Elf64_SecHdr ];
+$symHdrC[2] = [ grep { length($_) > 3} @Elf64_SymEnt ];
+
+# Load a cubin ELF file and index its headers, sections, symbols and per-kernel metadata.
+sub new
+{
+    my ($package, $file) = @_;
+    # Returns the blessed cubin object; dies if $file cannot be opened for reading.
+    my $cubin = bless { fileName => $file }, $package;
+
+    open my $fh, '<', $file or die "$file: $!";   # 3-arg open: the name is never parsed as a mode or a pipe
+    binmode($fh);
+
+    # Read in assuming 32 bit header
+    my $data;
+    read $fh, $data, 0x34;
+    my $elfHdr = $cubin->{elfHdr} = {};
+    @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data;
+
+    # 1: 32bit, 2: 64bit
+    my $class = $elfHdr->{fileClass};
+
+    # re-read in with 64 bit header if needed
+    if ($class == 2)
+    {
+        seek $fh, 0, 0;
+        read $fh, $data, 0x46;
+        @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data;
+
+        $cubin->{Class} = 64;
+    }
+    else
+    {
+        $cubin->{Class} = 32;
+    }
+
+    # verify sm_60 cubin
+    #$cubin->{Arch} = $elfHdr->{flags} & 0xFF;
+    #die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50;
+
+    $cubin->{Arch} = "60";   # hard-coded rather than read from the header, so the check below can never fire
+    die "Cubin not in sm_60. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} != 60;
+    $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32;
+
+    # Read in Program Headers
+    seek $fh, $elfHdr->{phOffset}, 0;
+    foreach (1 .. $elfHdr->{phNum})
+    {
+        read $fh, $data, $elfHdr->{phEntSize};
+
+        my %prgHdr = (Indx => $_ - 1);
+        @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data;
+        push @{$cubin->{prgHdrs}}, \%prgHdr;
+    }
+
+    # Read in Section Headers
+    seek $fh, $elfHdr->{shOffset}, 0;
+    foreach (1 .. $elfHdr->{shNum})
+    {
+        read $fh, $data, $elfHdr->{shEntSize};
+
+        my %secHdr = (Indx => $_ - 1);
+        @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data;
+        push @{$cubin->{secHdrs}}, \%secHdr;
+    }
+
+    # Read in Section data
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $data = '';
+        # Skip sections with no data (type NULL or NOBITS)
+        if ($secHdr->{size} && $secHdr->{type} != 8)
+        {
+            seek $fh, $secHdr->{offset}, 0;
+            read $fh, $data, $secHdr->{size};
+        }
+        # Convert string tables to maps (byte offset of each string => string)
+        if ($secHdr->{type} == 3) # STRTAB
+        {
+            my $strTab = $secHdr->{StrTab} = {};
+            my $indx   = 0;
+            foreach my $str (split "\0", $data)
+            {
+                $strTab->{$indx} = $str;
+                $indx += 1 + length($str);
+            }
+        }
+        # Read in Symbol data
+        if ($secHdr->{type} == 2) # SYMTAB
+        {
+            my $offset = 0;
+            while ($offset < $secHdr->{size})
+            {
+                my $symEnt = {};
+                @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize});
+                $offset += $secHdr->{entSize};
+
+                push @{$secHdr->{SymTab}}, $symEnt;
+            }
+        }
+        # Cache raw data (as a hex string) for further processing and writing
+        $secHdr->{Data} = unpack 'H*', $data;
+    }
+    close $fh;
+
+    # Update section headers with their names.  Map names directly to headers.
+    my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab};
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $secHdr->{Name} = $shStrTab->{$secHdr->{name}};
+        $cubin->{$secHdr->{Name}} = $secHdr;
+    }
+
+    # Update symbols with their names
+    # For the Global functions, extract kernel meta data
+    # Populate the kernel hash
+    my $strTab = $cubin->{'.strtab'}{StrTab};
+    foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}})
+    {
+        $symEnt->{Name} = $strTab->{$symEnt->{name}};
+
+        # Attach symbol to section
+        my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}];
+        $secHdr->{SymbolEnt} = $symEnt;
+
+        # Look for symbols tagged FUNC
+        if (($symEnt->{info} & 0x0f) == 0x02)
+        {
+            # Create a hash of kernels for output
+            my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr;
+
+            # Extract local/global/weak binding info
+            $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4];
+
+            # Extract the kernel instructions
+            $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ];
+
+            # Extract the max barrier resource identifier used and add 1. Should be 0-16.
+            # If a register is used as a barrier resource id, then this value is the max of 16.
+            $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20;
+
+            # Extract the number of allocated registers for this kernel.
+            $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24;
+
+            # Extract the size of shared memory this kernel uses.
+            my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"};
+            $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0;
+
+            # Attach constant0 section
+            $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"};
+
+            # Extract the kernel parameter data.
+            my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"};
+            if ($paramSec)
+            {
+                # Extract raw param data
+                my @data = unpack "L*", pack "H*", $paramSec->{Data};
+
+                $paramSec->{ParamData} = \@data;
+                $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ];
+
+                # Find the first param delimiter
+                my $idx = 0;
+                $idx++ while $idx < @data && $data[$idx] != 0x00080a04;
+
+                my $first = $data[$idx+2] & 0xFFFF;
+                #my $size  = $data[$idx+2] >> 16;
+                $idx += 4;
+
+                my @params;
+                while ($idx < @data && $data[$idx] == 0x000c1704)
+                {
+                    # Get the ordinal, offset, size and pointer alignment for each param
+                    my $ord    = $data[$idx+2] & 0xFFFF;
+                    my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16);
+                    my $psize  = $data[$idx+3] >> 18;
+                    my $align  = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0;
+                    unshift @params, "$ord:$offset:$psize:$align";
+                    $idx += 4;
+                }
+                my @staticParams = @data[0 .. ($idx-1)];
+
+                my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize);
+                while ($idx < @data)
+                {
+                    my $code = $data[$idx] & 0xffff;
+                    my $size = $data[$idx] >> 16;
+                    $idx++;
+
+                    # EIATTR_MAXREG_COUNT
+                    if ($code == 0x1b03)
+                    {
+                        $maxregCount = $size;
+                    }
+                    # EIATTR_S2RCTAID_INSTR_OFFSETS
+                    elsif ($code == 0x1d04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @ctaidOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_EXIT_INSTR_OFFSETS
+                    elsif ($code == 0x1c04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @exitOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_CTAIDZ_USED
+                    elsif ($code == 0x0401)
+                    {
+                        $ctaidzUsed = 1;
+                    }
+                    # EIATTR_REQNTID
+                    elsif ($code == 0x1004)
+                    {
+                        while ($size > 0)
+                        {
+                            push @reqntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_MAX_THREADS
+                    elsif ($code == 0x0504)
+                    {
+                        while ($size > 0)
+                        {
+                            push @maxntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_CRS_STACK_SIZE
+                    elsif ($code == 0x1e04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @stackSize, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    else
+                    {
+                        printf "Unknown Code 0x%02x (size:%d)\n", $code, $size;
+                    }
+                }
+                $kernelSec->{Params}   = \@params;
+                $kernelSec->{ParamCnt} = scalar @params;
+
+                $paramSec->{StaticParams} = \@staticParams;
+                $paramSec->{MAXREG_COUNT} = $maxregCount;
+                $paramSec->{ExitOffsets}  = \@exitOffsets;
+                $paramSec->{CTAIDOffsets} = \@ctaidOffsets;
+                $paramSec->{CTAIDZUsed}   = $ctaidzUsed;
+                $paramSec->{REQNTID}      = \@reqntid;
+                $paramSec->{MAXNTID}      = \@maxntid;
+                $paramSec->{STACKSIZE}    = \@stackSize;
+            }
+            # print Dumper($paramSec);
+            # exit();
+        }
+        # Note GLOBALs found in this cubin
+        elsif (($symEnt->{info} & 0x10) == 0x10)
+        {
+            $cubin->{Symbols}{$symEnt->{Name}} = $symEnt;
+        }
+    }
+
+    # print "phOffset: $elfHdr->{phOffset}\n";
+    # print "shOffset: $elfHdr->{shOffset}\n";
+    # foreach my $secHdr (@{$cubin->{secHdrs}})
+    # {
+    #     print "secHdr($secHdr->{Indx}): $secHdr->{offset}, $secHdr->{size}, $secHdr->{align} ($secHdr->{Name})\n";
+    # }
+    # my $p = 0;
+    # foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    # {
+    #     print "prgHdr($p): type: $prgHdr->{type}, offset: $prgHdr->{offset}, fileSize: $prgHdr->{fileSize}, memSize: $prgHdr->{memSize}, align: $prgHdr->{align}\n";
+    #     $p++;
+    # }
+    # exit();
+
+    # print Dumper($cubin->{prgHdrs});
+    # exit();
+    return $cubin;
+}
+sub class
+{
+    return $_[0]{Class};          # 32 or 64, as stored by new()
+}
+sub arch
+{
+    return $_[0]{Arch};           # architecture number stored by new()
+}
+sub address_size
+{
+    return $_[0]{AddressSize};    # 32 or 64, derived from the ELF header flags in new()
+}
+sub listKernels
+{
+    return $_[0]{Kernels};        # hash ref: kernel name => its section header
+}
+sub listSymbols
+{
+    return $_[0]{Symbols};        # hash ref: global symbol name => its symbol entry
+}
+sub getKernel
+{
+    my ($self, $name) = @_;       # look up one kernel's section header by name
+    return $self->{Kernels}->{$name};
+}
+
+sub modifyKernel
+{
+    my ($cubin, %params) = @_;
+    # Replace a kernel's instructions and rebuild its param section data from the %params keys read below.
+    my $kernelSec    = $params{Kernel};
+    my $newReg       = $params{RegCnt};
+    my $newBar       = $params{BarCnt};
+    my $exitOffsets  = $params{ExitOffsets};
+    my $ctaidOffsets = $params{CTAIDOffsets};
+    my $ctaidzUsed   = $params{CTAIDZUsed};
+    my $newData      = $params{KernelData};
+    my $newSize      = @$newData * 8;   # KernelData holds 64-bit words, so 8 bytes each
+
+    die "255 register max" if $newReg > 255;
+    die "new kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63;
+    die "16 is max barrier count" if $newBar > 16;
+
+    my $paramSec    = $kernelSec->{ParamSec};
+    my $kernelName  = $kernelSec->{SymbolEnt}{Name};
+    my $maxregCount = $paramSec->{MAXREG_COUNT};
+    my $stackSize   = $paramSec->{STACKSIZE};
+
+    # update the kernel
+    $kernelSec->{KernelData} = $newData;
+    $kernelSec->{Data}       = unpack "H*", pack "Q*", @$newData;
+
+    if ($newReg != $kernelSec->{RegCnt})
+    {
+        print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n";
+        $kernelSec->{RegCnt} = $newReg;
+        $kernelSec->{info}  &= ~0xff000000;   # register count lives in bits 24-31 of the section info field
+        $kernelSec->{info}  |= $newReg << 24;
+    }
+    if ($newBar != $kernelSec->{BarCnt})
+    {
+        print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n";
+        $kernelSec->{BarCnt} = $newBar;
+        $kernelSec->{flags} &= ~0x01f00000;   # barrier count lives in bits 20-24 of the section flags field
+        $kernelSec->{flags} |=  $newBar << 20;
+    }
+    # Rebuild the param words: the static words first, then one record per attribute.
+    my @paramData = @{$paramSec->{StaticParams}};
+
+    if (defined $maxregCount)
+    {
+        push @paramData, ($maxregCount << 16) | 0x1b03;
+    }
+
+    my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets;
+    my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}};
+
+    if ($newCTAIDs ne $oldCTAIDs)
+    {
+        print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n";
+    }
+    if (@$ctaidOffsets)
+    {
+        push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04;   # header: payload bytes (count * 4) in the upper 16 bits | attribute code
+        push @paramData, @$ctaidOffsets;
+    }
+
+    my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets;
+    my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}};
+
+    if ($newExits ne $oldExits)
+    {
+        print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n";
+    }
+    if (@$exitOffsets)
+    {
+        push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04;
+        push @paramData, @$exitOffsets;
+    }
+
+    if ($ctaidzUsed != $paramSec->{CTAIDZUsed})
+    {
+        print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n";
+    }
+    if ($ctaidzUsed)
+    {
+        push @paramData, 0x0401;
+    }
+    # the remaining attributes are copied through unchanged from what new() parsed
+    if (@{$paramSec->{REQNTID}})
+    {
+        push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004;
+        push @paramData, @{$paramSec->{REQNTID}};
+    }
+    if (@{$paramSec->{MAXNTID}})
+    {
+        push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504;
+        push @paramData, @{$paramSec->{MAXNTID}};
+    }
+
+    if (@$stackSize)
+    {
+        push @paramData, (scalar(@$stackSize) << 18) | 0x1e04;
+        push @paramData, @$stackSize;
+    }
+
+    my $newParamSize  = scalar(@paramData)*4;   # 32-bit words, so 4 bytes each
+    $paramSec->{Data} = unpack "H*", pack "L*", @paramData;
+    if ($newParamSize != $paramSec->{size})
+    {
+        print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n";
+        $cubin->updateSize($paramSec, $newParamSize);
+    }
+
+    if ($newSize != $kernelSec->{size})
+    {
+        print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n";
+        $cubin->updateSize($kernelSec, $newSize, 1);
+    }
+}
+
+# Give a section a new size and re-flow every file offset that depends on it.
+#
+#   $cubin         - cubin object; its elfHdr, secHdrs, prgHdrs and '.symtab'
+#                    entries are read and updated in place
+#   $sec           - section header hash whose size is changing
+#   $newSize       - the new size to record for $sec
+#   $updatePrgSize - when true, type-1 program headers that contain $sec get
+#                    their fileSize/memSize adjusted by the size difference
+sub updateSize
+{
+    my ($cubin, $sec, $newSize, $updatePrgSize) = @_;
+
+    my $header   = $cubin->{elfHdr};
+    my $elfClass = $header->{fileClass};
+
+    # remember how much the section grew (or shrank) before overwriting its size
+    my $growth = $newSize - $sec->{size};
+    $sec->{size} = $newSize;
+
+    # a section with a symbol entry needs the symbol table data re-serialized
+    if ($sec->{SymbolEnt})
+    {
+        $sec->{SymbolEnt}{size} = $newSize;
+
+        my $symtab = $cubin->{'.symtab'};
+        $symtab->{Data} = '';
+        for my $entry (@{$symtab->{SymTab}})
+        {
+            my $packed = pack $symHdrT[$elfClass], @{$entry}{@{$symHdrC[$elfClass]}};
+            $symtab->{Data} .= unpack "H*", $packed;
+        }
+    }
+
+    # old file offset => new file offset, consumed by the program headers below
+    my %relocated;
+    my $cursor = $header->{ehSize};
+
+    # lay the sections out again, one after another
+    for my $hdr (@{$cubin->{secHdrs}})
+    {
+        # the first header has no alignment and takes no part in the layout
+        next if $hdr->{align} == 0;
+
+        # round the cursor up to this section's alignment
+        my $over = $cursor % $hdr->{align};
+        $cursor += $hdr->{align} - $over if $over > 0;
+
+        $relocated{$hdr->{offset}} = $cursor;
+        $hdr->{offset}             = $cursor;
+
+        # NOBITS (type 8) sections occupy no bytes in the file
+        $cursor += $hdr->{size} unless $hdr->{type} == 8;
+    }
+
+    # the section header table keeps its size and is followed by the program headers
+    my $shBytes     = $header->{phOffset} - $header->{shOffset};
+    my $newPhOffset = $cursor + $shBytes;
+
+    $relocated{$header->{shOffset}} = $cursor;
+    $relocated{$header->{phOffset}} = $newPhOffset;
+
+    $header->{shOffset} = $cursor;
+    $header->{phOffset} = $newPhOffset;
+
+    # program headers are assumed to track one of the offsets recorded above
+    for my $ph (@{$cubin->{prgHdrs}})
+    {
+        $ph->{offset} = $relocated{$ph->{offset}};
+
+        # Only a type-1 program header that contains the resized section changes size.
+        # Note that this size is the kernel size plus any constant section sizes.
+        next unless $updatePrgSize && $ph->{type} == 1;
+        next unless $sec->{offset} >= $ph->{offset};
+        next unless $sec->{offset} < $ph->{offset} + $ph->{fileSize} + $growth;
+
+        $ph->{fileSize} += $growth;
+        $ph->{memSize}  += $growth;
+    }
+}
+
+# Write out the cubin after modifying it.
+#
+#   $cubin - cubin object providing elfHdr, secHdrs and prgHdrs
+#   $file  - path of the file to create or overwrite
+#
+# The file is written as: elf header, section data (padded to each section's
+# alignment), section headers, program headers.
+# Dies if the file cannot be opened or if the final close reports an error.
+sub write
+{
+    my ($cubin, $file) = @_;
+
+    # three-arg open so the path is never parsed for a mode or pipe character
+    open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
+    binmode($fh);
+
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+
+    # write elf header
+    print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};
+    my $pos = $elfHdr->{ehSize};
+
+    # write section data
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        # Skip NULL and NOBITS data sections
+        next if $secHdr->{size} == 0 || $secHdr->{type} == 8;
+
+        # Add any needed padding between sections
+        my $pad = $pos % $secHdr->{align};
+        if ($pad > 0)
+        {
+            $pad = $secHdr->{align} - $pad;
+            print $fh "\0" x $pad;
+            $pos += $pad;
+        }
+
+        # section data is held as a hex string; convert it back to bytes
+        print $fh pack 'H*', $secHdr->{Data};
+        $pos += $secHdr->{size};
+    }
+
+    # write section headers
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
+    }
+
+    #write program headers
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
+    }
+
+    # buffered write errors only surface at close, so check it
+    close $fh or die "Error: could not finish writing $file: $!";
+}
+
+__END__
+
diff --git a/Assembler/PascalAs/blib/lib/PascalAs/PascalAs.pm b/Assembler/PascalAs/blib/lib/PascalAs/PascalAs.pm
new file mode 100644
index 0000000..eefcdf6
--- /dev/null
+++ b/Assembler/PascalAs/blib/lib/PascalAs/PascalAs.pm
@@ -0,0 +1,1407 @@
+package PascalAs::PascalAs;
+
+require 5.10.0;
+
+use strict;
+use Data::Dumper;
+use PascalAs::PascalAsGrammar;
+use File::Spec;
+use Carp;
+
+our $VERSION = '1.06';
+
+# Ops whose jump target is encoded relative to the instruction (cuobjdump sass prints these targets as absolute addresses)
+my %relOffset  = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT);
+
+# Ops whose jump target is encoded as an absolute address
+my %absOffset  = map { $_ => 1 } qw(JCAL);
+
+my %jumpOp     = (%relOffset, %absOffset); # every op that takes a jump target
+
+# Ops that take r0 as an operand but do not write to it
+my %noDest     = map { $_ => 1 } qw(ST STG STS STL RED);
+
+# Reuse flag bit for each source register slot; the bits are OR'ed into a control code's reuse value
+my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4);
+
+# Preprocess and Assemble a source file
+#
+#   $file    - source text to assemble (it is run through Preprocess first)
+#   $include - include information handed on to Preprocess
+#   $doReuse - when true, register reuse flags are calculated here;
+#              when false, they are taken from the source via genReuseCode
+#   $nowarn  - when true, suppress "register used without initialization" warnings
+#              (also passed on to registerHealth)
+#
+# Returns a hash ref with the keys RegCnt, BarCnt, ExitOffsets, CTAIDOffsets,
+# CTAIDZUsed, ConflictCnt, ReuseCnt, ReuseTot, ReusePct and KernelData.
+# Dies on a badly formed line, an unknown label, an instruction that matches
+# no grammar rule, or a store op carrying a read-after-write dependency.
+sub Assemble
+{
+    my ($file, $include, $doReuse, $nowarn) = @_;
+
+    my $regMap = {};
+    $file = Preprocess($file, $include, 0, $regMap);
+    my $vectors = delete $regMap->{__vectors};
+    my $regBank = delete $regMap->{__regbank};
+
+    # initialize cubin counts
+    my $regCnt = 0;
+    my $barCnt = 0;
+
+    my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse);
+
+    # initialize the first control instruction
+    push @instructs, $ctrl = {};
+
+    foreach my $line (split "\n", $file)
+    {
+        # keep track of line nums in the physical file
+        $lineNum++;
+
+        next unless preProcessLine($line);
+
+        # match an instruction
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # Save us from crashing the display driver
+            die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n"
+                if exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0;
+
+            # track branches/jumps/calls/etc for label remapping
+            push @branches, @instructs+0 if exists $jumpOp{$inst->{op}};
+
+            # push the control code onto the control instruction
+            push @{$ctrl->{ctrl}}, $inst->{ctrl};
+
+            # now point the instruction to its associated control instruction
+            $inst->{ctrl} = $ctrl;
+
+            # add the op name and full instruction text
+            push @instructs, $inst;
+
+            # add a 4th control instruction for every 3 instructions
+            push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
+        }
+        # match a label
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            # map the label name to the index of the instruction about to be inserted
+            $labels{$1} = @instructs+0;
+        }
+        else
+        {
+            die "badly formed line at $lineNum: $line\n";
+        }
+    }
+    # add the final BRA op and align the number of instructions to a multiple of 8
+    push @{$ctrl->{ctrl}}, 0x007ff;
+    push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' };
+    while (@instructs & 7)
+    {
+        push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
+        push @{$ctrl->{ctrl}}, 0x007e0;
+        push @instructs, { op => 'NOP', inst => 'NOP;' };
+    }
+
+    # remap labels
+    foreach my $i (@branches)
+    {
+        if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1})
+            { die "instruction has invalid label: $instructs[$i]{inst}"; }
+
+        $instructs[$i]{jump} = $labels{$1};
+
+        # relative targets are measured from the instruction after the jump; each slot is 8 bytes
+        if (exists $relOffset{$instructs[$i]{op}})
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; }
+        else
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; }
+    }
+
+    # calculate optimal register reuse
+    # This effects register bank decisions so do it before analyzing register use
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            if ($doReuse)
+            {
+                # get any vector registers for r0
+                my @r0 = getVecRegisters($vectors, $capData);
+
+                # There are 2 reuse slots per register slot
+                # The reuse hash points to most recent instruction index where register was last used in this slot
+
+                # For writes to a register, clear any reuse opportunity
+                if (@r0 && !exists $noDest{$op})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        if (my $reuse = $reuse{$slot})
+                        {
+                            # if writing with a vector op, clear all linked registers
+                            delete $reuse->{$_} foreach @r0;
+                        }
+                    }
+                }
+                # clear cache if jumping elsewhere
+                %reuse = () if exists $jumpOp{$op};
+
+                # only track register reuse for instruction types this works with
+                if ($gram->{type}{reuse})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        next unless exists $capData->{$slot};
+
+                        my $r = $capData->{$slot};
+                        next if $r eq 'RZ';
+                        next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction
+
+                        my $reuse = $reuse{$slot} ||= {};
+
+                        # if this register was previously marked for potential reuse
+                        if (my $p = $reuse->{$r})
+                        {
+                            # flag the previous instruction's ctrl reuse array slot
+                            $instructs[$p]{ctrl}{reuse}[($p & 3) - 1] |= $reuseSlots{$slot};
+
+                            #print "reuse $slot $r $instructs[$p]{inst}\n";
+                        }
+                        # list full, delete the oldest
+                        elsif (keys %$reuse > 2)
+                        {
+                            my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0];
+                            delete $reuse->{$oldest};
+                        }
+                        # mark the new instruction for potential reuse
+                        $reuse->{$r} = $i;
+                    }
+                }
+            }
+            # if reuse is disabled then pull value from code.
+            elsif ($gram->{type}{reuse})
+            {
+                $ctrl->{reuse}[($i & 3) - 1] = genReuseCode($capData);
+            }
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    # Assign registers to requested banks if possible
+    foreach my $r (sort keys %$regBank)
+    {
+        my $bank  = $regBank->{$r};
+        my $avail = $regMap->{$r};
+        foreach my $pos (0 .. $#$avail)
+        {
+            if ($bank == ($avail->[$pos] & 3))
+            {
+                # assign it, while removing the assigned register from the pool
+                $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+                last;
+            }
+        }
+    }
+
+    # calculate register live times and preferred banks for non-fixed registers.
+    # LiveTime only half implemented...
+    my (%liveTime, %pairedBanks, %reuseHistory);
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData   = parseInstruct($inst, $gram) or next;
+            my $reuseType = $gram->{type}{reuse};
+
+            # liveTimes and bank conflicts with source operands
+            my (%addReuse, %delReuse);
+            foreach my $slot (qw(r8 r20 r39))
+            {
+                my $r = $capData->{$slot} or next;
+                next if $r eq 'RZ';
+
+                # a register still holding an array ref (its pool of candidates) has not been assigned yet
+                my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r};
+
+                # All registers should be written prior to being read..
+                if (my $liveTime = $liveTime{$liveR})
+                {
+                    # for each read set the current instruction index as the high value
+                    $liveTime->[$#$liveTime][1] = $i;
+                    push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                }
+                else
+                {
+                    warn "register used without initialization ($r): $inst\n" unless $nowarn;
+                    push @{$liveTime{$liveR}}, [$i,$i];
+                }
+
+                # Is this register active in the reuse cache?
+                my $slotHist  = $reuseHistory{$slot} ||= {};
+                my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0;
+
+                #print "IADD3-1: $slot:$r (!$selfReuse && $regMap->{$r})\n" if $op eq 'IADD3';
+
+                # If this is an auto reg, look at the open banks.
+                # No need to look at banks if this register is in the reuse cache.
+                if (!$selfReuse && ref $regMap->{$r})
+                {
+                    # Look at other source operands in this instruction and flag what banks are being used
+                    foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39))
+                    {
+                        my $r2 = $capData->{$slot2};
+                        next if $r2 eq 'RZ' || $r2 eq $r;
+
+                        my $slotHist2 = $reuseHistory{$slot2} ||= {};
+
+                        #print "IADD3-2: $slot:$r $slot2:$r2 (!$reuseType && !$slotHist2->{$r2})\n" if $op eq 'IADD3';
+
+                        # Dont be concerned with non-reuse type instructions or
+                        # If this operand is in the reuse cache, we don't care what bank it's on.
+                        if (!$reuseType || !exists $slotHist2->{$r2})
+                        {
+                            # if the operand is also an auto-allocated register then link them
+                            # Once we choose the bank for one we want to update that choice for the other register.
+                            if (ref $regMap->{$r2})
+                            {
+                                push @{$pairedBanks{$r}{pairs}}, $r2;
+                                $pairedBanks{$r}{banks} ||= [];
+                            }
+                            # For a fixed register, calculate the bank, flag it, and update the count of banks to avoid.
+                            else
+                            {
+                                my $bank = substr($regMap->{$r2},1) & 3;
+                                #print "IADD3-3: $r2:$bank\n" if $op eq 'IADD3';
+
+                                $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++;
+                                $pairedBanks{$r}{pairs} ||= [];
+                            }
+                            # Update the total use count for this register.
+                            # This will be the number of times the register is pulled out of the bank.
+                            $pairedBanks{$r}{useCnt}++;
+                        }
+                    }
+                }
+                # update the reuse history so we know which bank conflicts we can ignore.
+                if ($reuseType)
+                {
+                    # flag these slots for addition or removal from reuseHistory
+                    if ($ctrl->{reuse}[($i & 3) - 1] & $reuseSlots{$slot})
+                        { $addReuse{$slot} = $r; }
+                    else
+                        { $delReuse{$slot} = $r; }
+                }
+            }
+            # update reuse history after we're done with the instruction (when the flag is actually in effect).
+            # we don't want to updated it in the middle since that can interfere with the checks,
+            $reuseHistory{$_}{$addReuse{$_}} = 1    foreach keys %addReuse;
+            delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse;
+
+            # liveTimes for destination operands and vector registers
+            foreach my $r0 (getVecRegisters($vectors, $capData))
+            {
+                # fixed register mappings can have aliases so use the actual register value for those.
+                my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0};
+
+                # If not writing treat just like a read
+                if (exists $noDest{$op})
+                {
+                    if (my $liveTime = $liveTime{$liveR})
+                    {
+                        $liveTime->[$#$liveTime][1] = $i;
+                        push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                    }
+                    else
+                    {
+                        warn "register used without initialization ($r0): $inst\n" unless $nowarn;
+                        push @{$liveTime{$liveR}}, [$i,$i];
+                    }
+                }
+                # If writing, push a new bracket on this register's stack.
+                elsif (my $liveTime = $liveTime{$liveR})
+                {
+                    if ($i > $liveTime->[$#$liveTime][1])
+                    {
+                        push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                    }
+                }
+                else
+                {
+                    # Initialize the liveTime stack for this register.
+                    push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                }
+            }
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+    #print Dumper(\%liveTime); exit(1);
+
+    # assign unassigned registers
+    # sort by most restricted, then most used, then name
+    foreach my $r (sort {
+                    $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} ||
+                    $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} ||
+                    $a cmp $b
+                  } keys %pairedBanks)
+    {
+        my $banks = $pairedBanks{$r}{banks};
+        my $avail = $regMap->{$r};
+
+        #printf "%10s: (%d,%d) %d,%d,%d,%d, %s\n", $r, $pairedBanks{$r}{bnkCnt}, $pairedBanks{$r}{useCnt}, @{$banks}[0,1,2,3], join ',', @$avail;
+
+        # Pick a bank with zero or the smallest number of conflicts
+        BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..3))
+        {
+            # pick an available register that matches the requested bank
+            foreach my $pos (0 .. $#$avail)
+            {
+                if ($bank == ($avail->[$pos] & 3))
+                {
+                    # assign it, while removing the assigned register from the pool
+                    $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+
+                    # update bank info for any unassigned pair
+                    $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}};
+                    last BANK;
+                }
+            }
+        }
+    }
+    # Now assign any remaining to first available
+    foreach my $r (sort keys %$regMap)
+    {
+        if (ref($regMap->{$r}) eq 'ARRAY')
+        {
+            $regMap->{$r} = 'R' . shift @{$regMap->{$r}};
+        }
+    }
+    #print map "$regMap->{$_}: $_\n", sort { substr($regMap->{$a},1) <=> substr($regMap->{$b},1) } keys %$regMap;
+
+    # apply the register mapping and assemble the instructions to op codes
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        # save the original and replace the register names with numbers
+        $instructs[$i]{orig} = $instructs[$i]{inst};
+        $instructs[$i]{inst} =~ s/(?<!\.)\b(\w+)\b(?!\[)/ exists($regMap->{$1}) ? $regMap->{$1} : $1 /ge;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            # update the register count
+            foreach my $r (qw(r0 r8 r20 r39))
+            {
+                next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ';
+
+                # get numeric portion of regname
+                my $val = substr $capData->{$r}, 1;
+
+                my @r0 = getVecRegisters($vectors, $capData);
+                my @r8 = getAddrVecRegisters($vectors, $capData);
+
+                # smart enough to count vector registers for memory instructions.
+                # A single declaration covers both slots: declaring $regInc a second
+                # time would mask the r0 value and drop the r0 vector width.
+                my $regInc = 1;
+                if    ($r eq 'r0') { $regInc = scalar(@r0) || 1; }
+                elsif ($r eq 'r8') { $regInc = scalar(@r8) || 1; }
+
+                if ($val + $regInc > $regCnt)
+                {
+                    $regCnt = $val + $regInc;
+                    #print "$val $regCnt $regInc\n";
+                }
+            }
+            # update the barrier resource count
+            if ($op eq 'BAR')
+            {
+                if (exists $capData->{i8w4})
+                {
+                    $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt;
+                }
+                # if a barrier value is a register, assume the maximum
+                elsif (exists $capData->{r8})
+                {
+                    $barCnt = 16;
+                }
+            }
+            # Generate the op code.
+            my ($code, $reuse) = genCode($op, $gram, $capData);
+            $instructs[$i]{code} = $code;
+
+            # cache this for final pass when we want to calculate reuse stats.
+            if ($gram->{type}{reuse})
+                { $instructs[$i]{caps} = $capData; }
+            # use the parsed value of reuse for non-reuse type instructions
+            else
+                { $ctrl->{reuse}[($i & 3) - 1] = $reuse; }
+
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    # final pass to piece together control codes
+    # (%reuseHistory is deliberately a fresh, empty hash for the statistics gathered here)
+    my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed);
+    foreach my $i (0 .. $#instructs)
+    {
+        # op code
+        if ($i & 3)
+        {
+            push @codes, $instructs[$i]{code};
+
+            if ($instructs[$i]{caps})
+            {
+                # calculate stats on registers
+                registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 3) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn);
+            }
+            if ($instructs[$i]{inst} =~ m'EXIT')
+            {
+                push @exitOffsets, (scalar(@codes)-1)*8;
+            }
+            elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)')
+            {
+                push @ctaidOffsets, (scalar(@codes)-1)*8;
+                $ctaidzUsed = 1 if $1 eq 'Z';
+            }
+        }
+        # control code
+        else
+        {
+            my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)};
+            push @codes,
+                ($ctrl->[0] <<  0) | ($ctrl->[1] << 21) | ($ctrl->[2] << 42) | # ctrl codes
+                ($ruse->[0] << 17) | ($ruse->[1] << 38) | ($ruse->[2] << 59);  # reuse codes
+        }
+    }
+
+    # return the kernel data
+    return {
+        RegCnt       => $regCnt,
+        BarCnt       => $barCnt,
+        ExitOffsets  => \@exitOffsets,
+        CTAIDOffsets => \@ctaidOffsets,
+        CTAIDZUsed   => $ctaidzUsed,
+        ConflictCnt  => $reuseHistory{conflicts},
+        ReuseCnt     => $reuseHistory{reuse},
+        ReuseTot     => $reuseHistory{total},
+        ReusePct     => ($reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0),
+        KernelData   => \@codes,
+    };
+}
+
+# Useful for testing op code coverage of existing code, extracting new codes and flags
+#
+#   $fh             - input handle positioned on cuobjdump sass text
+#   $printConflicts - when true, the instruction text is passed to registerHealth
+#                     (otherwise an empty string is passed)
+#   $all            - when true, passing instructions are listed as well as failures
+#
+# Each instruction is re-encoded through the grammar and compared with the
+# op code and reuse value read from the file. A report is printed to the
+# currently selected output handle. Returns the number of failures.
+sub Test
+{
+    my ($fh, $printConflicts, $all) = @_;
+
+    my @instructs;
+    my %reuseHistory;
+    my ($pass, $fail) = (0,0);
+
+    while (my $line = <$fh>)
+    {
+        my (@ctrl, @reuse);
+
+        next unless processSassCtrlLine($line, \@ctrl, \@reuse);
+
+        # one instruction line follows the control line for every reuse value it carried
+        foreach my $fileReuse (@reuse)
+        {
+            $line = <$fh>;
+
+            my $inst = processSassLine($line) or next;
+
+            $inst->{reuse} = $fileReuse;
+            my $fileCode = $inst->{code};
+
+            if (exists $relOffset{$inst->{op}})
+            {
+                # these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump
+                $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e;
+            }
+
+            my $match = 0;
+            foreach my $gram (@{$grammar{$inst->{op}}})
+            {
+                my $capData = parseInstruct($inst->{inst}, $gram) or next;
+                my @caps;
+
+                # Run in test mode to list what capture groups were captured
+                my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps);
+
+                # Detect register bank conflicts but only for reuse type instructions.
+                # If a bank conflict is avoided by a reuse flag then ignore it.
+                registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? $inst->{inst} : '') if $gram->{type}{reuse};
+
+                $inst->{caps}      = join ', ', sort @caps;
+                $inst->{codeDiff}  = $fileCode  ^ $code;
+                $inst->{reuseDiff} = $fileReuse ^ $reuse;
+
+                # compare calculated and file values
+                if ($code == $fileCode && $reuse == $fileReuse)
+                {
+                    $inst->{grade} = 'PASS';
+                    push @instructs, $inst if $all;
+                    $pass++;
+                }
+                else
+                {
+                    $inst->{grade} = 'FAIL';
+                    push @instructs, $inst;
+                    $fail++;
+                }
+                $match = 1;
+                last;
+            }
+            # no grammar rule matched at all: report the file values as the difference
+            unless ($match)
+            {
+                $inst->{grade}     = 'FAIL';
+                $inst->{codeDiff}  = $fileCode;
+                $inst->{reuseDiff} = $fileReuse;
+                push @instructs, $inst;
+                $fail++;
+            }
+        }
+    }
+    # widest instruction text per op, used to size the report column
+    my %maxLen;
+    foreach (@instructs)
+    {
+        $maxLen{$_->{op}} = length($_->{ins}) if length($_->{ins}) > $maxLen{$_->{op}};
+    }
+    my ($lastOp, $template);
+    foreach my $inst (sort {
+        $a->{op}        cmp $b->{op}        ||
+        $a->{codeDiff}  <=> $b->{codeDiff}  ||
+        $a->{reuseDiff} <=> $b->{reuseDiff} ||
+        $a->{ins}       cmp $b->{ins}
+        } @instructs)
+    {
+        # start a new report group, with its own column header, for each op
+        if ($lastOp ne $inst->{op})
+        {
+            $lastOp   = $inst->{op};
+            $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s   %s\n";
+            printf "\n%s %-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s   %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures);
+        }
+        printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)};
+    }
+    my $reusePct = $reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0;
+
+    # a literal percent sign has to be written as %% inside a printf format
+    printf "\nRegister Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\nOp Code Coverage Totals: Pass: $pass Fail: $fail\n",
+        $reuseHistory{conflicts}, $reusePct, $reuseHistory{reuse}, $reuseHistory{total};
+
+    return $fail;
+}
+
+# Convert cuobjdump sass to the working format
+#
+#   $in     - input handle positioned on cuobjdump sass text
+#   $out    - output handle; receives a CONSTANT_MAPPING section followed by
+#             one line per instruction, with label lines where needed
+#   $params - array ref of "ord:offset:size:align" strings, one per parameter
+sub Extract
+{
+    my ($in, $out, $params) = @_;
+
+    # constant address => symbolic name, used to rewrite instruction operands
+    my %nameFor;
+
+    my %builtins =
+    (
+        blockDimX => 'c[0x0][0x8]',
+        blockDimY => 'c[0x0][0xc]',
+        blockDimZ => 'c[0x0][0x10]',
+        gridDimX  => 'c[0x0][0x14]',
+        gridDimY  => 'c[0x0][0x18]',
+        gridDimZ  => 'c[0x0][0x1c]',
+    );
+    print $out "<CONSTANT_MAPPING>\n";
+
+    for my $name (sort keys %builtins)
+    {
+        my $addr = $builtins{$name};
+        $nameFor{$addr} = $name;
+        print $out "    $name : $addr\n";
+    }
+    print $out "\n";
+
+    for my $spec (@$params)
+    {
+        my ($ord, $offset, $size) = split ':', $spec;
+
+        # a parameter no larger than 4 is mapped as a single name; its offset text is used as given
+        unless ($size > 4)
+        {
+            my $name = sprintf 'param_%d', $ord;
+            my $addr = sprintf 'c[0x0][%s]', $offset;
+            $nameFor{$addr} = $name;
+            print $out "    $name : $addr\n";
+            next;
+        }
+
+        # a larger parameter is mapped as indexed elements, stepping 4 at a time
+        for (my ($word, $byte) = (0, hex $offset); $size > 0; $size -= 4, $byte += 4, $word += 1)
+        {
+            my $name = sprintf 'param_%d[%d]', $ord, $word;
+            my $addr = sprintf 'c[0x0][0x%x]', $byte;
+            $nameFor{$addr} = $name;
+            print $out "    $name : $addr\n";
+        }
+    }
+    print $out "</CONSTANT_MAPPING>\n\n";
+
+    my (%labels, @data);
+    my $labelnum = 1;
+
+    FILE: while (my $line = <$in>)
+    {
+        my (@ctrl, @ruse);
+        processSassCtrlLine($line, \@ctrl, \@ruse) or next FILE;
+
+        # one instruction line follows the control line for every control code it carried
+        CTRL: for my $code (@ctrl)
+        {
+            $line = <$in>;
+
+            my $inst = processSassLine($line);
+            next CTRL unless $inst;
+
+            # Convert branch/jump/call addresses to labels
+            if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)')
+            {
+                my $target = hex($1);
+                my $addr   = $inst->{num};
+
+                # a BRA to itself (or to the slot just before) is the final one: stop reading
+                last FILE if $inst->{op} eq 'BRA' && ($target == $addr || $target == $addr-8);
+
+                # reuse the label already made for this target, or mint the next one
+                my $label = $labels{$target} ||= 'TARGET' . $labelnum++;
+
+                # replace address with name
+                $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/;
+            }
+            # swap known constant addresses for their symbolic names
+            $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $nameFor{$1 . $2} || $1 . $2 /eg;
+
+            $inst->{ctrl} = printCtrl($code);
+
+            push @data, $inst;
+        }
+    }
+    # second pass: the address to label mapping is complete, so labels can be emitted
+    for my $inst (@data)
+    {
+        my $addr = $inst->{num};
+        print $out "$labels{$addr}:\n" if exists $labels{$addr};
+        printf $out "%s %5s%s\n", @{$inst}{qw(ctrl pred ins)};
+    }
+}
+
+my $CommentRe  = qr'^[\t ]*<COMMENT>.*?^\s*</COMMENT>\n?'ms; # a whole <COMMENT>..</COMMENT> section
+my $IncludeRe  = qr'^[\t ]*<INCLUDE\s+file="([^"]+)"\s*/?>\n?'ms; # an INCLUDE tag; $1 = file name
+my $CodeRe     = qr'^[\t ]*<CODE(\d*)>(.*?)^\s*<\/CODE\1>\n?'ms; # $1 = optional digits (must match in the closing tag), $2 = code
+my $ConstMapRe = qr'^[\t ]*<CONSTANT_MAPPING>(.*?)^\s*</CONSTANT_MAPPING>\n?'ms; # $1 = mapping text
+my $RegMapRe   = qr'^[\t ]*<REGISTER_MAPPING>(.*?)^\s*</REGISTER_MAPPING>\n?'ms; # $1 = mapping text
+my $ScheduleRe = qr'^[\t ]*<SCHEDULE_BLOCK>(.*?)^\s*</SCHEDULE_BLOCK>\n?'ms; # $1 = block body
+my $InlineRe   = qr'\[(\+|\-)(.+?)\1\]'ms; # inline code [+ ... +] or [- ... -]; $1 = sign, $2 = code
+
+sub IncludeFile   # returns the whole content of $file; retries its base name under @$include if it can't be opened as given
+{
+    my ($file, $include) = @_;   # $include: array ref passed straight to File::Spec->catpath ahead of the file name
+    my ($vol,$dir,$name) = File::Spec->splitpath($file);
+    local $/;   # slurp mode for the read below
+    my $fh;
+    if (!open $fh, '<', $file)   # 3-arg open: the name is never interpreted as a mode or a pipe
+    {
+        open $fh, '<', File::Spec->catpath(@$include, $name) or die "Could not open file for INCLUDE: $file ($!)\n";
+    }
+    my $content = <$fh>;
+    close $fh;
+    return $content;
+}
+
+sub Preprocess   # expands includes, code and mappings in a source text and schedules its SCHEDULE_BLOCKs; returns the new text
+{
+    my ($file, $include, $debug, $regMap) = @_;   # $file is the source text itself, not a path
+
+    my $constMap = {};
+    my $removeRegMap;
+    if ($regMap)
+        { $removeRegMap = 1; }   # a caller-supplied map gets filled and the REGISTER_MAPPING section is stripped below
+    else
+        { $regMap = {}; }
+
+    # include nested files (repeated until no INCLUDE tag is left, so included text may include more files)
+    1 while $file =~ s|$IncludeRe| IncludeFile($1, $include) |eg;
+
+    # Strip out <COMMENT> sections
+    $file =~ s|$CommentRe||g;
+
+    # Execute the CODE sections (old way to run code, to be deprecated); each section is replaced by its result
+    1 while $file =~ s|$CodeRe|
+        my $out = eval "package PascalAs::PascalAs::CODE; $2";
+        $@ ? die("CODE:\n$2\n\nError: $@\n") : $out |eg;
+
+    # Execute the inline code (new way): [+ code +] is replaced by its result, [- code -] by an empty string
+    $file =~ s|$InlineRe|
+        my ($type, $code) = ($1, $2);
+        my $out = eval "package PascalAs::PascalAs::CODE; $code";
+        $@ ? die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg;
+
+    # Pull in the constMap; the section is replaced by setConstMap's (empty) return value
+    $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg;
+
+    my @newFile;
+    foreach my $line (split "\n", $file)
+    {
+        # replace mapped constant names with their values, except on lines that start with a # or // comment
+        if ($line !~ m'^\s*(?:#|//).*')
+        {
+            $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg;
+        }
+        push @newFile, $line;
+    }
+    $file = join "\n", @newFile;
+
+    # Pull in the reg map first as the Scheduler will need it to handle vector instructions
+    # Remove the regmap if we're going on to assemble
+    $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg;
+
+    # Pick out the SCHEDULE_BLOCK sections (only the captured bodies, in file order)
+    my @schedBlocks = $file =~ /$ScheduleRe/g;
+
+    # Schedule them
+    foreach my $i (0 .. $#schedBlocks)
+    {
+        # XMAD macros should only appear in SCHEDULE_BLOCKs
+        $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]);
+
+        $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug);   # block numbers are 1-based
+    }
+
+    # Replace the results: each section, tags included, becomes its scheduled text, in the same order
+    $file =~ s|$ScheduleRe| shift @schedBlocks |eg;
+
+    return $file;
+}
+
+# break the registers down into source and destination categories for the scheduler
+my %srcReg   = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X);   # operand slots the Scheduler always treats as reads
+my %destReg  = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC);        # operand slots treated as writes, unless the op is in %noDest
+my %regops   = (%srcReg, %destReg);                            # union of both; selects which captures the Scheduler inspects
+my @itypes   = qw(class lat rlat tput dual);                   # type fields copied from the matched grammar rule onto each instruction
+
+sub Scheduler   # reorders the instructions of one SCHEDULE_BLOCK by dependency and latency; returns the scheduled text
+{
+    my ($block, $blockNum, $regMap, $debug) = @_;   # $blockNum is only used in error messages
+
+    my $vectors = $regMap->{__vectors};
+    my $lineNum = 0;
+
+    my (@instructs, @comments, $ordered, $first);
+    foreach my $line (split "\n", $block)
+    {
+        # keep track of line nums in the physical file
+        $lineNum++;
+
+        unless (preProcessLine($line))
+        {
+            push @comments, $line if $line =~ m'\S';
+            next;
+        }
+
+        # match an instruction
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # if the first instruction in the block is waiting on a dep, it should go first.
+            $inst->{first}   = !$first++ && ($inst->{ctrl} & 0x1f800) ? 0 : 1;
+
+            # if the instruction has a stall of zero set, it's meant to be last (to mesh with next block)
+            #$inst->{first}   = $inst->{ctrl} & 0x0000f ? 1 : 2;
+            $inst->{exeTime} = 0;
+            $inst->{order}   = $ordered++ if $ordered;
+            push @instructs, $inst;
+        }
+        # match a label
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n";
+        }
+        # open an ORDERED block
+        elsif ($line =~ m'^<ORDERED>')
+        {
+            die "you cannot use nested <ORDERED> tags" if $ordered;
+            $ordered = 1;
+        }
+        # close an ORDERED block
+        elsif ($line =~ m'^</ORDERED>')
+        {
+            die "missing opening <ORDERED> for closing </ORDERED> tag" if !$ordered;
+            $ordered = 0;
+        }
+        else
+        {
+            die "badly formed line at block: $blockNum line: $lineNum: $line\n";
+        }
+    }
+
+    my (%writes, %reads, @ready, @schedule, $orderedParent);
+    # match each instruction against the grammar and build the dependency graph between instructions
+    foreach my $instruct (@instructs)
+    {
+        my $match = 0;
+        foreach my $gram (@{$grammar{$instruct->{op}}})
+        {
+            my $capData = parseInstruct($instruct->{inst}, $gram) or next;
+            my (@dest, @src);
+
+            # copy over instruction types for easier access
+            @{$instruct}{@itypes} = @{$gram->{type}}{@itypes};
+
+            # A predicate prefix is treated as a source reg
+            push @src, $instruct->{predReg} if $instruct->{pred};
+
+            # Handle P2R and R2P specially
+            if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7})
+            {
+                my $list = $instruct->{op} eq 'R2P' ? \@dest : \@src;
+                my $mask = hex($capData->{i20w7});
+                foreach my $p (0..6)
+                {
+                    if ($mask & (1 << $p))
+                    {
+                        push @$list, "P$p";
+                    }
+                    # make this instruction dependent on any predicates it's not setting
+                    # this is to prevent a race condition for any predicate sets that are pending
+                    elsif ($instruct->{op} eq 'R2P')
+                    {
+                        push @src, "P$p";
+                    }
+                }
+                # These instructions can't be dual issued
+                $instruct->{nodual} = 1;
+            }
+
+            # Populate our register source and destination lists, skipping any zero or true values
+            foreach my $operand (grep { exists $regops{$_} } sort keys %$capData)
+            {
+                # figure out which list to populate
+                my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src;
+
+                # Filter out RZ and PT
+                my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT';
+
+                if ($capData->{$operand} ne $badVal)
+                {
+                    # add the value to list with the correct prefix
+                    push @$list,
+                        $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) :
+                        $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) :
+                        $operand eq 'CC' ? 'CC' :
+                        $operand eq 'X'  ? 'CC' :
+                        getRegNum($regMap, $capData->{$operand});
+                }
+            }
+            $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39});
+
+            # Find Read-After-Write dependencies
+            foreach my $src (grep { exists $writes{$_} } @src)
+            {
+                # Memory operations get delayed access to registers but not to the predicate
+                my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat};
+
+                # the parent should be the most recently added dest op to the stack
+                foreach my $parent (@{$writes{$src}})
+                {
+                    # add this instruction as a child of the parent
+                    # set the edge to the total latency of reg source availability
+                    #print "R $parent->{inst}\n\t\t$instruct->{inst}\n";
+                    my $latency = $src =~ m'^P\d' ? 13 : $parent->{lat};
+                    push @{$parent->{children}}, [$instruct, $latency - $regLatency];
+                    $instruct->{parents}++;
+
+                    # if the destination was conditionally executed, we also need to keep going back till it wasn't
+                    last unless $parent->{pred};
+                }
+            }
+
+            # Find Write-After-Read dependencies
+            foreach my $dest (grep { exists $reads{$_} } @dest)
+            {
+                # Flag this instruction as dependent to any previous read
+                foreach my $reader (@{$reads{$dest}})
+                {
+                    # no need to stall for these types of dependencies
+                    #print "W $reader->{inst} \t\t\t $instruct->{inst}\n";
+                    push @{$reader->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                # Once dependence is marked we can clear out the read list (unless this write was conditional).
+                # The assumption here is that you would never want to write out a register without
+                # subsequently reading it in some way prior to writing it again.
+                delete $reads{$dest} unless $instruct->{pred};
+            }
+
+            # Enforce instruction ordering where requested
+            if ($instruct->{order})
+            {
+                if ($orderedParent)
+                {
+                    push @{$orderedParent->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                $orderedParent = $instruct;
+            }
+            elsif ($orderedParent)
+                {  $orderedParent = 0; }
+
+            # For a dest reg, push it onto the write stack
+            unshift @{$writes{$_}}, $instruct foreach @dest;
+
+            # For a src reg, push it into the read list
+            push @{$reads{$_}}, $instruct foreach @src;
+
+            # if this instruction has no dependencies it's ready to go
+            push @ready, $instruct if !exists $instruct->{parents};
+
+            $match = 1;
+            last;
+        }
+        die "Unable to recognize instruction at block: $blockNum line: $instruct->{lineNum}: $instruct->{inst}\n" unless $match; # the instruction's own line; $lineNum is stale here
+    }
+    %writes = ();
+    %reads  = ();
+
+    if (@ready)
+    {
+        # update dependent counts for sorting heuristic
+        my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" };
+
+        countUniqueDescendants($readyParent, {});
+        updateDepCounts($readyParent, {});
+
+        # sort the initial ready list
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    # Process the ready list, adding new instructions to the list as we go.
+    my $clock = 0;
+    while (my $instruct = shift @ready)
+    {
+        my $stall = $instruct->{stall};
+
+        # apply the stall to the previous instruction
+        if (@schedule && $stall < 16)
+        {
+            my $prev = $schedule[$#schedule];
+
+            # if stall is greater than 4 then also yield
+            # the yield flag is required to get stall counts 12-15 working correctly.
+            $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0;
+            $prev->{ctrl} |= $stall;
+            $clock += $stall;
+        }
+        # For stalls bigger than 15 we assume the user is managing it with a barrier
+        else
+        {
+            $instruct->{ctrl} &= 0x1fff0;
+            $instruct->{ctrl} |= 1;
+            $clock += 1;
+        }
+        print "$clock: $instruct->{inst}\n" if $debug;
+
+        # add a new instruction to the schedule
+        push @schedule, $instruct;
+
+        # update each child with a new earliest execution time
+        if (my $children = $instruct->{children})
+        {
+            foreach (@$children)
+            {
+                my ($child, $latency) = @$_;
+
+                # update the earliest clock value this child can safely execute
+                my $earliest = $clock + $latency;
+                $child->{exeTime} = $earliest if $child->{exeTime} < $earliest;
+
+                print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug;
+
+                # decrement parent count and add to ready queue if none remaining.
+                push @ready, $child if --$child->{parents} < 1;
+            }
+            delete $instruct->{children};
+        }
+
+        # update stall and mix values in the ready queue on each iteration
+        foreach my $ready (@ready)
+        {
+            # calculate how many instructions this would cause the just added instruction to stall.
+            $stall = $ready->{exeTime} - $clock;
+            $stall = 1 if $stall < 1;
+
+            # if using the same compute resource as the prior instruction then limit the throughput
+            if ($ready->{class} eq $instruct->{class})
+            {
+                $stall = $ready->{tput} if $stall < $ready->{tput};
+            }
+            # dual issue with a simple instruction (tput <= 2)
+            # can't dual issue two instructions that both load a constant
+            elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 && !$instruct->{nodual} &&
+                   $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const}))
+            {
+                $stall = 0;
+            }
+            $ready->{stall} = $stall;
+
+            # add an instruction class mixing heuristic that catches anything not handled by the stall
+            $ready->{mix} = $ready->{class} ne $instruct->{class} || 0;
+        }
+
+        # sort the ready list by stall time, mixing heuristic, dependencies and line number
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $a->{stall}   <=> $b->{stall}  ||
+            $b->{mix}     <=> $a->{mix}    ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "\tf,ext,stl,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    my $out;
+    #$out .= "$_\n" foreach @comments;
+    $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule;
+    return $out;
+}
+
+sub setConstMap   # parses the "name : value" lines of a CONSTANT_MAPPING section into the $constMap hash ref
+{
+    my ($constMap, $constMapText) = @_;
+
+    foreach my $line (split "\n", $constMapText)
+    {
+        # strip leading space
+        $line =~ s|^\s+||;
+        # strip comments (everything from the first # or // to the end of the line)
+        $line =~ s{(?:#|//).*}{};
+        # strip trailing space
+        $line =~ s|\s+$||;
+        # skip blank lines
+        next unless $line =~ m'\S';
+
+        my ($name, $value) = split '\s*:\s*', $line;   # any field after a second ':' is dropped
+
+        $constMap->{$name} = $value;   # a repeated name overwrites the earlier value
+    }
+    return;   # no value, so the s///e in Preprocess replaces the section with an empty string
+}
+
+sub setRegisterMap   # parses REGISTER_MAPPING text into $regMap (name => 'Rn', or a list of candidates for '~' lines)
+{
+    my ($regMap, $regmapText) = @_;
+
+    my $vectors = $regMap->{__vectors} ||= {};   # base name => list of names forming a 2 or 4 register vector
+    my $regBank = $regMap->{__regbank} ||= {};   # name => requested bank (0-3), only recorded for '~' lines
+    my %aliases;
+
+    foreach my $line (split "\n", $regmapText)
+    {
+        # strip leading space
+        $line =~ s|^\s+||;
+        # strip comments
+        $line =~ s{(?:#|//).*}{};
+        # strip trailing space
+        $line =~ s|\s+$||;
+        # skip blank lines
+        next unless $line =~ m'\S';
+
+        my $auto  = $line =~ /~/;   # "nums ~ names": every name gets the whole number list as candidates
+        my $share = $line =~ /=/;   # "num = names": every name maps to the same single register
+
+        my ($regNums, $regNames) = split '\s*[:~=]\s*', $line;
+
+        my (@numList, @nameList, %vecAliases);
+        foreach my $num (split '\s*,\s*', $regNums)
+        {
+            my ($start, $stop) = split '\s*\-\s*', $num;
+            die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop;
+            push @numList, ($start .. $stop||$start);
+        }
+        foreach my $fullName (split '\s*,\s*', $regNames)
+        {
+            if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$')
+            {
+                my ($name1, $name2, $bank) = ($1, $3, $4);
+                foreach (split '\s*\|\s*', $2)
+                {
+                    my ($start, $stop) = split '\s*\-\s*';
+                    foreach my $r (map "$name1$_$name2", $start .. $stop||$start)
+                    {
+                        # define an alias for use in vector instructions that omits the number portion
+                        $aliases{$r} = "$name1$name2" unless exists $aliases{$r};
+                        push @nameList, $r;
+                        $regBank->{$r} = $bank if $auto && defined $bank;
+                        warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank;
+                    }
+                }
+            }
+            elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$')
+            {
+                push @nameList, $1;
+                $regBank->{$1} = $2 if $auto && defined $2;
+                warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2;
+            }
+            else
+            {
+                die "Bad register name: '$fullName' at: $line\n";
+            }
+        }
+        die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList;
+        die "Missmatched register mapping at: $line\n" if $share && @numList > 1;
+
+        # detect if this list is monotonically ascending with no gaps (every adjacent pair, including the last one)
+        my $i = 0;
+        while ($i < $#numList)
+        {
+            last if $numList[$i] + 1 != $numList[$i+1];
+            $i++;
+        }
+        my $ascending = $i == $#numList;   # true only if the loop walked to the last index without finding a gap
+
+        foreach my $n (0..$#nameList)
+        {
+            die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]};
+
+            if ($auto)
+            {
+                # assign possible values to be assigned on assembly
+                $regMap->{$nameList[$n]} = \@numList;
+            }
+            elsif ($share)
+            {
+                # each name shares the same single register
+                $regMap->{$nameList[$n]} = 'R' . $numList[0];
+            }
+            else
+            {
+                $regMap->{$nameList[$n]} = 'R' . $numList[$n];
+                # flag any even register as a potential vector
+                if ($ascending && ($numList[$n] & 1) == 0)
+                {
+                    # constrain potential range to vector alignment
+                    my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3);
+                    if ($end <= $#nameList)
+                    {
+                        $vectors->{$nameList[$n]} = [ @nameList[$n .. $end] ];
+                        #setup an alias for the base name without the number
+                        if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}})
+                        {
+                            $regMap->{$aliases{$nameList[$n]}}  = $regMap->{$nameList[$n]};
+                            $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]};
+                            delete $aliases{$nameList[$n]};
+                        }
+                    }
+                }
+            }
+        }
+    }
+    #print Dumper($regMap); exit(1);
+}
+
+sub preProcessLine   # left-trims the caller's line in place; returns true if anything other than space or a comment remains
+{
+    # strip leading space (this edits $_[0], i.e. the caller's own variable)
+    $_[0] =~ s|^\s+||;
+
+    # preserve comment but check for emptiness (everything below works on a copy)
+    my $val = shift;
+
+    # strip comments
+    $val =~ s{(?:#|//).*}{};
+
+    # skip blank lines
+    return $val =~ m'\S';
+}
+
+# traverse the graph and count total descendants per node.
+# only count unique nodes (by lineNum)
+sub countUniqueDescendants   # fills $node->{deps} (keyed by lineNum); returns this node's lineNum followed by those keys
+{
+    my ($node, $edges) = @_;   # $edges: hash of "parent^child" lineNum pairs that were already walked
+
+    #warn "$node->{inst}\n";
+
+    if (my $children = $node->{children})
+    {
+        foreach my $child (grep $_->[1], @$children) # skip edges whose latency is 0 (e.g. WaR deps)
+        {
+            next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++;   # walk each edge only once
+
+            $node->{deps}{$_}++ foreach countUniqueDescendants($child->[0], $edges);
+        }
+    }
+    else
+    {
+        return $node->{lineNum};   # no children entry: the node only reports itself
+    }
+    return ($node->{lineNum}, keys %{$node->{deps}});
+}
+# convert hash to count for easier sorting.
+sub updateDepCounts   # walks the graph below $node and replaces each {deps} hash with the number of keys it holds
+{
+    my ($node, $edges) = @_;   # $edges: hash of "parent^child" lineNum pairs that were already walked
+
+    #warn "$node->{inst}\n";
+
+    if (my $children = $node->{children})
+    {
+        foreach my $child (@$children)
+        {
+            next if $edges->{"$node->{lineNum}^$child->[0]{lineNum}"}++;   # walk each edge only once
+            updateDepCounts($child->[0], $edges);
+        }
+    }
+    $node->{deps} = ref $node->{deps} ? keys %{$node->{deps}} : $node->{deps}+0;   # not a hash ref: keep the numeric value (0 if unset)
+}
+
+# Detect register bank conflicts and calculate reuse stats
+sub registerHealth   # updates the counters in $reuseHistory; returns the size of this instruction's conflict list (0 if none)
+{
+    my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_;
+
+    my (@banks, @conflicts);   # @banks: register last seen per bank id during this call
+
+    foreach my $slot (qw(r8 r20 r39))
+    {
+        my $r = $capData->{$slot} or next;
+        next if $r eq 'RZ';
+
+        my $slotHist = $reuseHistory->{$slot} ||= {};   # registers currently flagged for reuse in this slot
+
+        $reuseHistory->{total}++;
+
+        # if this register is in active reuse then ignore for bank conflict checking.
+        if (exists $slotHist->{$r})
+        {
+            $reuseHistory->{reuse}++;
+        }
+        else
+        {
+            # extract number from reg and take the modulo-4 value.  This is the bank id.
+            my $bank = substr($r,1) & 3;
+
+            # check for conflict: a different register already used the same bank in this instruction
+            if ($banks[$bank] && $banks[$bank] ne $r)
+            {
+                push @conflicts, $banks[$bank] if !@conflicts;
+                push @conflicts, $r;
+
+                $reuseHistory->{conflicts}++;
+            }
+            $banks[$bank] = $r;
+        }
+
+        # update the history: remember the register only while its slot's reuse flag is set
+        if ($reuseFlags & $reuseSlots{$slot})
+            { $slotHist->{$r} = 1; }
+        else
+            { delete $slotHist->{$r};  }
+    }
+    if ($inst && @conflicts && !$nowarn)
+    {
+        printf "CONFLICT at 0x%04x (%s): $inst\n", $instAddr, join(',', @conflicts);
+    }
+    return scalar @conflicts;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+PascalAs::PascalAs - Assembler for NVIDIA Maxwell architecture
+
+=head1 SYNOPSIS
+
+    Pascalas.pl [opts]
+
+=head1 DESCRIPTION
+
+See the documentation at: https://github.com/NervanaSystems/pascalas
+
+=head1 SEE ALSO
+
+See the documentation at: https://github.com/NervanaSystems/pascalas
+
+
+=head1 AUTHOR
+
+Scott Gray, E<lt>sgray@nervanasys.comE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Scott Gray
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+=cut
diff --git a/Assembler/PascalAs/blib/lib/PascalAs/PascalAsGrammar.pm b/Assembler/PascalAs/blib/lib/PascalAs/PascalAsGrammar.pm
new file mode 100644
index 0000000..bf25fb8
--- /dev/null
+++ b/Assembler/PascalAs/blib/lib/PascalAs/PascalAsGrammar.pm
@@ -0,0 +1,1437 @@
+package PascalAs::PascalAsGrammar;
+
+use strict;
+use Carp;
+use Exporter;
+use Data::Dumper;
+our @ISA = qw(Exporter);
+
+our @EXPORT = qw(
+    %grammar %flags
+    parseInstruct genCode genReuseCode
+    processAsmLine processSassLine processSassCtrlLine
+    replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters
+);
+
+require 5.10.0;
+
+# Helper functions for operands: each turns an operand's text into a value shifted to its bit position
+sub getI   # integer immediate: ($orig text, $pos bit offset, $mask of allowed bits); dies if the value does not fit the mask
+{
+    my ($orig, $pos, $mask) = @_;
+    my $val = $orig;
+    my $neg = $val =~ s|^\-||;
+
+    # parse out our custom index immediates for addresses
+    if ($val  =~ m'^(\d+)[xX]<([^>]+)>')
+    {
+        # allow any perl expression and multiply result by leading decimal.
+        # also allow global scalar variables in the expression.
+        my $mul = $1;
+        my $exp = $2;
+        # strip leading zeros (don't interpret numbers as octal)
+        $exp =~ s/(?<!\d)0+(?=[1-9])//g;
+        my @globals = $exp =~ m'\$\w+'g;
+        my $our = @globals ? ' our (' . join(',',@globals) . ');' : '';
+        $val = $mul * eval "package PascalAs::PascalAs::CODE;$our $exp";
+        die "Bad index expression in immediate ($orig): $@\n" if $@; # a failed eval would otherwise silently encode 0
+    }
+    # hexadecimal value
+    elsif ($val  =~ m'^0x[0-9a-zA-Z]+')
+    {
+        $val = hex($val);
+    }
+    # otherwise val is a simple decimal value that doesn't need to be modified
+
+    if ( $neg )
+    {
+        # if the mask removes the sign bit the "neg" flag adds it back on the code somewhere else
+        $val = -$val;
+        $val &= $mask;
+    }
+    if (($val & $mask) != $val)
+    {
+        die sprintf "Immediate value out of range(0x%x): 0x%x ($orig)\n", $mask, $val;
+    }
+    return $val << $pos;
+}
+sub getF   # float immediate: ($val text, $pos bit offset, $type pack code 'f' or 'd', optional $trunc = low bits to drop)
+{
+    my ($val, $pos, $type, $trunc) = @_;
+    # hexadecimal value
+    if ($val  =~ m'^0x[0-9a-zA-Z]+')
+    {
+        $val = hex($val);
+    }
+    # support infinity; the untruncated form keeps its own sign bit, like the packed values below
+    elsif ($val =~ m'INF'i)
+    {
+        $val = $trunc ? ($type eq 'f' ? 0x7f800 : 0x7ff00) : ($val =~ m'^\-' ? 0xff800000 : 0x7f800000);
+    }
+    else
+    {
+        $val = unpack(($type eq 'f' ? 'L' : 'Q'), pack $type, $val);
+
+        # strip off sign bit if truncating.  It will be added elsewhere in the code by the flag capture.
+        $val = ($val >> $trunc) & 0x7ffff if $trunc;
+    }
+    return $val << $pos;
+}
+sub getR   # register operand: "R<n>" with n < 255, or "RZ" (encoded as 0xff); returns the number shifted left by $pos
+{
+    my ($val, $pos) = @_;
+    if ($val =~ m'^R(\d+|Z)$' && $1 < 255)   # "Z" counts as 0 in the numeric comparison, so RZ passes the range check
+    {
+        $val = $1 eq 'Z' ? 0xff : $1;
+    }
+    else
+    {
+        die "Bad register name found: $val\n";
+    }
+    return $val << $pos;
+}
+sub getP   # predicate operand: "P0".."P6", or "PT" (encoded as 7); returns the number shifted left by $pos
+{
+    my ($val, $pos) = @_;
+    if ($val =~ m'^P(\d|T)$' && $1 < 7)   # "T" counts as 0 in the numeric comparison, so PT passes; P7-P9 are rejected
+    {
+        $val = $1 eq 'T' ? 7 : $1;
+    }
+    else
+    {
+        die "Bad predicate name found: $val\n";
+    }
+    return $val << $pos;
+}
+sub getC { ((hex($_[0]) >> 2) & 0x7fff) << 20 } # hex text, shifted right by 2, limited to 15 bits, placed at bit 20
+
+# Map operands into their value and position in the op code (keyed by the capture name used in the operand rules).
+my %operands =
+(
+    p0      => sub { getP($_[0], 0)  },
+    p3      => sub { getP($_[0], 3)  },
+    p12     => sub { getP($_[0], 12) },
+    p29     => sub { getP($_[0], 29) },
+    p39     => sub { getP($_[0], 39) },
+    p45     => sub { getP($_[0], 45) },
+    p48     => sub { getP($_[0], 48) },
+    p58     => sub { getP($_[0], 58) },
+    r0      => sub { getR($_[0], 0)  },
+    r8      => sub { getR($_[0], 8)  },
+    r20     => sub { getR($_[0], 20) },
+    r28     => sub { getR($_[0], 28) },
+    r39s20  => sub { getR($_[0], 39) },
+    r39     => sub { getR($_[0], 39) },
+    r39a    => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to wipe it out, register must be in sequence with r20
+    c20     => sub { getC($_[0])     },
+    c39     => sub { getC($_[0])     },
+    c34     => sub { hex($_[0]) << 34 },
+    c36     => sub { hex($_[0]) << 36 },
+    f20w32  => sub { getF($_[0], 20, 'f')        },
+    f20     => sub { getF($_[0], 20, 'f', 12)    },
+    d20     => sub { getF($_[0], 20, 'd', 44)    },
+    i8w4    => sub { getI($_[0], 8,  0xf)        },
+    i20     => sub { getI($_[0], 20, 0x7ffff)    },
+    i20w6   => sub { getI($_[0], 20, 0x3f)       },
+    i20w7   => sub { getI($_[0], 20, 0x7f)       },
+    i20w8   => sub { getI($_[0], 20, 0xff)       },
+    i20w12  => sub { getI($_[0], 20, 0xfff)      },
+    i20w24  => sub { getI($_[0], 20, 0xffffff)   },
+    i20w32  => sub { getI($_[0], 20, 0xffffffff) },
+    i31w4   => sub { getI($_[0], 31, 0xf)        },
+    i34w13  => sub { getI($_[0], 34, 0x1fff)     },
+    i36w20  => sub { getI($_[0], 36, 0xfffff)    },
+    i39w8   => sub { getI($_[0], 39, 0xff)       },
+    i28w8   => sub { getI($_[0], 28, 0xff)       },
+    i28w20  => sub { getI($_[0], 28, 0xfffff)    },
+    i48w8   => sub { getI($_[0], 48, 0xff)       },
+    i51w5   => sub { getI($_[0], 51, 0x1f)       },
+    i53w5   => sub { getI($_[0], 53, 0x1f)       },
+);
+
+# Rules for operands and their closely tied flags
+my $hex     = qr"0[xX][0-9a-fA-F]+";
+my $iAddr   = qr"\d+[xX]<[^>]+>";
+my $immed   = qr"$hex|$iAddr|\d+"o;
+my $reg     = qr"[a-zA-Z_]\w*"; # must start with letter or underscore\
+my $p       = qr"P[0-6T]";
+my $noPred  = qr"(?<noPred>)";
+my $pred    = qr"\@(?<predNot>\!)?P(?<predNum>[0-6]) ";
+my $p0      = qr"(?<p0>$p)"o;
+my $p3      = qr"(?<p3>$p)"o;  # $p captured under the name p3
+my $p12     = qr"(?<p12not>\!)?(?<p12>$p)"o;  # optional leading '!' is captured separately as p12not
+my $p29     = qr"(?<p29not>\!)?(?<p29>$p)"o;  # optional leading '!' is captured separately as p29not
+my $p39     = qr"(?<p39not>\!)?(?<p39>$p)"o;  # optional leading '!' is captured separately as p39not
+my $p45     = qr"(?<p45>$p)"o;
+my $p48     = qr"(?<p48>$p)"o;
+my $p58     = qr"(?<p58>$p)"o;
+my $r0      = qr"(?<r0>$reg)";  # $reg captured under the name r0
+my $r0cc    = qr"(?<r0>$reg)(?<CC>\.CC)?";  # r0 with an optional .CC suffix (captured as CC)
+my $r8      = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse1>\.reuse)?";  # optional '-' (r8neg), '|' (r8abs), .part suffix (r8part) and .reuse (reuse1)
+my $r20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse2>\.reuse)?";  # optional '-' (r20neg), '|' (r20abs), .part suffix (r20part) and .reuse (reuse2)
+my $r28     = qr"(?<r28>$reg)";
+my $r39s20  = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r39s20>(?<r20>$reg))\|?(?:\.(?<r39part>H0|H1))?(?<reuse2>\.reuse)?";  # the same register is captured as both r39s20 and r20; the part suffix is captured as r39part
+my $r39     = qr"(?<r39neg>\-)?(?<r39>$reg)(?:\.(?<r39part>H0|H1))?(?<reuse3>\.reuse)?";  # optional '-' (r39neg), .H0/.H1 suffix (r39part) and .reuse (reuse3)
+my $r39a    = qr"(?<r39a>(?<r39>$reg))(?<reuse3>\.reuse)?";  # the same register is captured as both r39a and r39
+my $c20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3))?"o;  # c[hex][hex] operand: first index is captured as c34, second as c20
+my $c20x    = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20partx>H0|H1|B0|B1|B2|B3))?"o;  # same as $c20 except the part suffix is captured as r20partx
+my $c20s39  = qr"(?<r39neg>\-)?c\[(?<c34>$hex)\]\s*\[(?<c39>$hex)\]"o;  # c[hex][hex] operand with the second index captured as c39
+my $f20w32  = qr"(?<f20w32>(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))";  # optionally signed $hex, inf, or decimal literal (case-insensitive)
+my $f20     = qr"(?<f20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;  # leading '-' is captured as neg, trailing .NEG as r20neg
+my $d20     = qr"(?<d20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;  # same shape as $f20, captured as d20
+my $i8w4    = qr"(?<i8w4>$immed)"o;
+my $i20     = qr"(?<i20>(?<neg>\-)?$immed)(?<r20neg>\.NEG)?"o;  # leading '-' is captured as neg, trailing .NEG as r20neg
+my $i20w6   = qr"(?<i20w6>$immed)"o;
+my $i20w7   = qr"(?<i20w7>$immed)"o;
+my $i20w8   = qr"(?<i20w8>$immed)"o;
+my $i20w12  = qr"(?<i20w12>$immed)"o;
+my $i20w24  = qr"(?<i20w24>\-?$immed)"o;  # accepts an optional leading '-'
+my $i20w32  = qr"(?<i20w32>\-?$immed)"o;  # accepts an optional leading '-'
+my $i39w8   = qr"(?<i39w8>\-?$immed)"o;  # accepts an optional leading '-'
+my $i28w8   = qr"(?<i28w8>$immed)"o;
+my $i28w20  = qr"(?<i28w20>\-?$immed)"o;  # accepts an optional leading '-'
+my $i31w4   = qr"(?<i31w4>$immed)"o;
+my $i34w13  = qr"(?<i34w13>$immed)"o;
+my $i36w20  = qr"(?<i36w20>$immed)"o;
+my $i48w8   = qr"(?<i48w8>$immed)"o;
+my $i51w5   = qr"(?<i51w5>$immed)"o;
+my $i53w5   = qr"(?<i53w5>$immed)"o;
+my $ir20    = qr"$i20|$r20"o;  # alternation, tried in this order: $i20, then $r20
+my $cr20    = qr"$c20|$r20"o;  # alternation, tried in this order: $c20, then $r20
+my $icr20   = qr"$i20|$c20|$r20"o;  # alternation, tried in this order: $i20, $c20, $r20
+my $fcr20   = qr"$f20|$c20|$r20"o;  # alternation, tried in this order: $f20, $c20, $r20
+my $cr39    = qr"$c20s39|$r39"o;  # alternation, tried in this order: $c20s39, then $r39
+my $dr20    = qr"$d20|$r20"o;  # alternation, tried in this order: $d20, then $r20
+
+# Instruction specific rules for capturing various flags (each named group becomes a key in %+ when it matches)
+my $u32   = qr"(?<U32>\.U32)?";
+my $ftz   = qr"(?<FTZ>\.FTZ)?";
+my $sat   = qr"(?<SAT>\.SAT)?";
+my $rnd   = qr"(?:\.(?<rnd>RN|RM|RP|RZ))?";
+my $round = qr"(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?";
+my $fcmp  = qr"(?<cmp>\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)";  # capture includes the leading dot; the trailing empty alternative lets the suffix be omitted
+my $icmp  = qr"\.(?<cmp>LT|EQ|LE|GT|NE|GE)";  # suffix is mandatory; capture excludes the dot
+my $bool  = qr"\.(?<bool>AND|OR|XOR|PASS_B)";
+my $bool2 = qr"\.(?<bool2>AND|OR|XOR)";
+my $func  = qr"\.(?<func>COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)";
+my $rro   = qr"\.(?<func>SINCOS|EX2)";  # shares the capture name func with $func
+my $add3  = qr"(?:\.(?<type>X|RS|LS))?";
+my $lopz  = qr"(?:\.(?<z>NZ|Z) $p48,|(?<noz>))"o;  # with .Z/.NZ a "$p48," operand must follow; otherwise the empty noz group is set
+my $X     = qr"(?<X>\.X)?";
+my $tld   = qr"(?<NODEP>NODEP\.)?(?:(?<reuse1>T)|(?<reuse2>P))";  # T is captured as reuse1, P as reuse2
+my $chnls = qr"(?<chnls>R|RGBA)";
+my $sr    = qr"SR_(?<sr>\S+)";  # everything after SR_ up to the next whitespace is captured as sr
+my $shf   = qr"(?<W>\.W)?(?:\.(?<type>U64|S64))?(?<HI>\.HI)?";
+my $xmad  = qr"(?:\.(?<type1>U16|S16))?(?:\.(?<type2>U16|S16))?(?:\.(?<mode>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?";
+my $xmadc = qr"(?:\.(?<type1>U16|S16))?(?:\.(?<type2>U16|S16))?(?:\.(?<modec>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?";  # same as $xmad except the mode is captured as modec
+my $vmad8 = qr"\.(?<sign1>[SU])(?<size1>8|16)\.(?<sign2>[SU])(?<size2>8|16)(?<PO>\.PO)?(?<SHR_7>\.SHR_7)?(?<SHR_15>\.SHR_15)?(?<SAT>\.SAT)?";
+my $vmad16= qr"\.(?<sign1>[SU])(?<size1>16)\.(?<sign2>[SU])(?<size2>16)";  # only the 16/16 size combination, with no trailing flags
+my $hilo  = qr"(?:\.(?<mode>XHI|XLO))?";
+my $vaddType = qr"(?:\.(?<UD>UD))?(?:\.(?<SD>SD))?(?:\.(?<sign1>[SU])(?<size1>8|16|32))?(?:\.(?<sign2>[SU])(?<size2>8|16|32))?";
+my $vaddMode = qr"(?:\.(?<mode>MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?";
+my $vmnmx = qr"(?:\.(?<MX>MX))?";
+my $x2x   = qr"\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)";  # destination sign/width first, then source sign/width
+my $prmt  = qr"(?:\.(?<mode>F4E|B4E|RC8|ECL|ECR|RC16))?";
+my $shfl  = qr"\.(?<mode>IDX|UP|DOWN|BFLY)";
+my $bar   = qr"\.(?<mode>SYNC|ARV|RED)(?:\.(?<red>POPC|AND|OR))? (?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?(<r20>)|(?<nor20>))(?(<red>), $p39|(?<nop39>))"o;  # empty nor20 group is set whenever r20 did not match; ", $p39" is required only when red matched, otherwise empty nop39 is set
+my $b2r   = qr"\.RESULT $r0(?:, $p45|(?<nop45>))"o;  # empty nop45 group is set when the ", $p45" operand is absent
+my $dbar  = qr"(?<SB>SB0|SB1|SB2|SB3|SB4|SB5)";
+# Matches the " {5,4,3,2,1,0}" style operand: a space, then a brace-enclosed list in which each
+# digit is optional and is captured as db5..db0 when present (digits must appear in descending order).
+# The braces are escaped: an unescaped literal "{" is deprecated in Perl 5.22+ and a fatal
+# pattern-compilation error in newer Perls.
+my $dbar2 = qr" \{(?<db5>5)?,?(?<db4>4)?,?(?<db3>3)?,?(?<db2>2)?,?(?<db1>1)?,?(?<db0>0)?\}";
+my $mbar  = qr"\.(?<mode>CTA|GL|SYS)";
+my $addr  = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i20w24)?\]"o;  # bracketed address: register captured as r8 (empty nor8 group is set when there is none), optional offset captured as i20w24
+my $addr2 = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i28w20)?\]"o;  # same as $addr except the offset is captured as i28w20
+my $ldc   = qr"c\[(?<c36>$hex)\]\s*$addr"o;  # c[hex] (captured as c36) followed by an $addr expression
+my $atom  = qr"(?<E>\.E)?(?:\.(?<mode>ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?<type>|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)";  # mode is mandatory; type capture includes the leading dot and may be empty
+my $vote  = qr"\.(?<mode>ALL|ANY|EQ)"o;
+my $memType  = qr"(?<type>\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)";  # capture includes the leading dot; the empty alternative lets the type suffix be omitted
+my $memCache = qr"(?<E>\.E)?(?<U>\.U)?(?:\.(?<cache>CG|CI|CS|CV|IL|WT))?";
+
+
+
+# Scheduling parameters attached to each grammar entry through its "type" field.
+# Every type is a hash ref with the following keys:
+# class: hardware resource that shares characteristics with types
+# lat  : pipeline depth where relevant, placeholder for memory ops
+# blat : barrier latency, typical fetch time for memory operations. Highly variable.
+# rlat : operand read latency for memory ops
+# rhold: clock cycles that a memory op typically holds onto a register before it's free to be written by another op.
+# tput : throughput, clock cycles an op takes when two ops of the same class are issued in succession.
+# dual : whether this instruction type can be dual issued
+# reuse: whether this instruction type accepts register reuse flags.
+
+# Some of these values are guesses and need to be updated from micro benchmarks.
+# We may need to split these classes up further.
+# Each row below is positional; the map turns it into a hash ref keyed by the field names above.
+my ($s2rT, $smemT, $gmemT, $x32T, $x64T, $shftT, $cmpT, $qtrT, $rroT, $voteT) = map {
+    my %type;
+    @type{qw(class lat blat rlat rhold tput dual reuse)} = @$_;
+    \%type;
+}
+(
+    # class     lat  blat  rlat  rhold  tput  dual  reuse
+    [ 's2r',    2,   25,   0,    0,     1,    0,    0 ],  # $s2rT
+    [ 'mem',    2,   30,   2,    20,    1,    1,    0 ],  # $smemT
+    [ 'mem',    2,   200,  4,    20,    1,    1,    0 ],  # $gmemT
+    [ 'x32',    6,   0,    0,    0,     1,    0,    1 ],  # $x32T
+    [ 'x64',    2,   128,  0,    0,     128,  0,    1 ],  # $x64T
+    [ 'shift',  6,   0,    0,    0,     2,    0,    1 ],  # $shftT
+    [ 'cmp',    13,  0,    0,    0,     2,    0,    1 ],  # $cmpT
+    [ 'qtr',    8,   0,    4,    0,     1,    1,    0 ],  # $qtrT
+    [ 'rro',    2,   0,    0,    0,     1,    0,    0 ],  # $rroT
+    [ 'vote',   2,   0,    0,    0,     1,    0,    0 ],  # $voteT
+);
+
+
+# Create map of op names to rules: each op maps to an array ref of { type, code, rule } alternatives
+our %grammar =
+(
+    #Floating Point Instructions
+    FADD     => [ { type => $x32T,  code => 0x5c58000000000000, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $fcr20;"o,               } ],
+    FADD32I  => [ { type => $x32T,  code => 0x0800000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FCHK     => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o,                     } ], #Partial?
+    FCMP     => [ { type => $cmpT,  code => 0x5ba0000000000000, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $fcr20, $r39;"o,            } ],
+    FFMA     => [
+                  { type => $x32T,  code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $fcr20, $r39;"o,         },
+                  { type => $x32T,  code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o,     },
+                ],
+    FMNMX    => [ { type => $shftT, code => 0x5c60000000000000, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $fcr20, $p39;"o,                } ],
+    FMUL     => [ { type => $x32T,  code => 0x5c68000000000000, rule => qr"^$pred?FMUL$ftz$rnd$sat $r0, $r8, $fcr20;"o,               } ],
+    FMUL32I  => [ { type => $x32T,  code => 0x1e00000000000000, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FSET     => [ { type => $shftT, code => 0x5800000000000000, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $fcr20, $p39;"o,       } ],
+    FSETP    => [ { type => $cmpT,  code => 0x5bb0000000000000, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ],
+    MUFU     => [ { type => $qtrT,  code => 0x5080000000000000, rule => qr"^$pred?MUFU$func $r0, $r8;"o,                              } ],
+    RRO      => [ { type => $rroT,  code => 0x5c90000000000000, rule => qr"^$pred?RRO$rro $r0, $r20;"o,                               } ],
+    DADD     => [ { type => $x64T,  code => 0x5c70000000000000, rule => qr"^$pred?DADD$rnd $r0, $r8, $dr20;"o,                        } ],
+    DFMA     => [ { type => $x64T,  code => 0x5b70000000000000, rule => qr"^$pred?DFMA$rnd $r0, $r8, $dr20, $r39;"o,                  } ],
+    DMNMX    => [ { type => $cmpT,  code => 0x5c50000000000000, rule => qr"^$pred?DMNMX $r0, $r8, $dr20, $p39;"o,                     } ],
+    DMUL     => [ { type => $x64T,  code => 0x5c80000000000000, rule => qr"^$pred?DMUL$rnd $r0, $r8, $dr20;"o,                        } ],
+    DSET     => [ { type => $cmpT,  code => 0x5900000000000000, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o,            } ],
+    DSETP    => [ { type => $cmpT,  code => 0x5b80000000000000, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o,      } ],
+    FSWZADD  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o,                                    } ], #TODO
+
+    HADD2     => [ { type => $x32T,  code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o,               } ],
+    HMUL2     => [ { type => $x32T,  code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o,               } ],
+    HFMA2     => [ { type => $x32T,  code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o,         } ],
+    HSETP2    => [ { type => $cmpT,  code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial
+
+    #Integer Instructions
+    BFE       => [ { type => $shftT,  code => 0x5c01000000000000, rule => qr"^$pred?BFE$u32 $r0, $r8, $icr20;"o,                          } ],
+    BFI       => [ { type => $shftT,  code => 0x5bf0000000000000, rule => qr"^$pred?BFI $r0, $r8, $ir20, $cr39;"o,                        } ],
+    FLO       => [ { type => $s2rT,   code => 0x5c30000000000000, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o,                              } ],
+    IADD      => [ { type => $x32T,   code => 0x5c10000000000000, rule => qr"^$pred?IADD$sat$X $r0cc, $r8, $icr20;"o,                         } ],
+    IADD32I   => [ { type => $x32T,   code => 0x1c00000000000000, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o,                         } ],
+    IADD3     => [ { type => $x32T,   code => 0x5cc0000000000000, rule => qr"^$pred?IADD3$add3 $r0cc, $r8, $icr20, $r39;"o,                 } ],
+    ICMP      => [ { type => $cmpT,   code => 0x5b41000000000000, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $icr20, $r39;"o,              } ],
+    IMNMX     => [ { type => $shftT,  code => 0x5c21000000000000, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $icr20, $p39;"o,                  } ],
+    ISET      => [ { type => $shftT,  code => 0x5b51000000000000, rule => qr"^$pred?ISET$icmp$u32$X$bool $r0, $r8, $icr20, $p39;"o,       } ],
+    ISETP     => [ { type => $cmpT,   code => 0x5b61000000000000, rule => qr"^$pred?ISETP$icmp$u32$X$bool $p3, $p0, $r8, $icr20, $p39;"o, } ],
+    ISCADD    => [ { type => $shftT,  code => 0x5c18000000000000, rule => qr"^$pred?ISCADD $r0, $r8, $icr20, $i39w8;"o,                   } ],
+    ISCADD32I => [ { type => $shftT,  code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o,               } ],
+    LEA       => [
+                   { type => $cmpT,   code => 0x5bd0000000000000, rule => qr"^$pred?LEA $p48, $r0cc, $r8, $icr20;"o,                      },
+                   { type => $shftT,  code => 0x5bd7000000000000, rule => qr"^$pred?LEA $r0cc, $r8, $icr20, $i39w8;"o,                    },
+                   { type => $shftT,  code => 0x5bdf004000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $r20, $r39, $i28w8;"o,          },
+                   { type => $shftT,  code => 0x0a07000000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $c20, $r39, $i51w5;"o,          },
+                 ],
+    LOP       => [ { type => $x32T,   code => 0x5c40000000000000, rule => qr"^$pred?LOP$bool$lopz $r0, $r8, (?<INV>~)?$icr20(?<INV>\.INV)?;"o, } ],
+    LOP32I    => [ { type => $x32T,   code => 0x0400000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o,                     } ],
+    LOP3      => [
+                   { type => $x32T,   code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o,            },
+                   { type => $x32T,   code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o,            },
+                 ],
+    POPC      => [ { type => $s2rT,   code => 0x5c08000000000000, rule => qr"^$pred?POPC $r0, $r20;"o,                                    } ],
+    SHF       => [
+                   { type => $shftT,  code => 0x5bf8000000000000, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $ir20, $r39;"o,                  },
+                   { type => $shftT,  code => 0x5cf8000000000000, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $ir20, $r39;"o,                  },
+                 ],
+    SHL       => [ { type => $shftT,  code => 0x5c48000000000000, rule => qr"^$pred?SHL(?<W>\.W)? $r0, $r8, $icr20;"o,                    } ],
+    SHR       => [ { type => $shftT,  code => 0x5c29000000000000, rule => qr"^$pred?SHR$u32 $r0, $r8, $icr20;"o,                          } ],
+    XMAD      => [
+                   { type => $x32T,   code => 0x5b00000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $ir20, $r39;"o,                 },
+                   { type => $x32T,   code => 0x5900000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $r39s20, $c20s39;"o,            },
+                   { type => $x32T,   code => 0x5e00000000000000, rule => qr"^$pred?XMAD$xmadc $r0cc, $r8, $c20x, $r39;"o,                  },
+                 ],
+    # XMAD replaces these
+    IMAD      => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMAD[^;]*;"o,   } ], #TODO
+    IMADSP    => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO
+    IMUL      => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMUL[^;]*;"o,   } ], #TODO
+
+    #Conversion Instructions
+    F2F => [ { type => $qtrT,  code => 0x5ca8000000000000, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ],
+    F2I => [ { type => $qtrT,  code => 0x5cb0000000000000, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o,         } ],
+    I2F => [ { type => $qtrT,  code => 0x5cb8000000000000, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o,               } ],
+    I2I => [ { type => $qtrT,  code => 0x5ce0000000000000, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o,               } ],
+
+    #Movement Instructions
+    MOV    => [ { type => $x32T,  code => 0x5c98078000000000, rule => qr"^$pred?MOV $r0, $icr20;"o,                   } ],
+    MOV32I => [ { type => $x32T,  code => 0x010000000000f000, rule => qr"^$pred?MOV32I $r0, (?:$i20w32|$f20w32);"o,   } ],
+    PRMT   => [ { type => $x32T,  code => 0x5bc0000000000000, rule => qr"^$pred?PRMT$prmt $r0, $r8, $icr20, $cr39;"o, } ],
+    SEL    => [ { type => $x32T,  code => 0x5ca0000000000000, rule => qr"^$pred?SEL $r0, $r8, $icr20, $p39;"o,        } ],
+    SHFL   => [ { type => $smemT, code => 0xef10000000000000, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ],
+
+    #Predicate/CC Instructions
+    PSET   => [ { type => $cmpT,  code => 0x5088000000000000, rule => qr"^$pred?PSET$bool2$bool $r0, $p12, $p29, $p39;"o,       } ],
+    PSETP  => [ { type => $cmpT,  code => 0x5090000000000000, rule => qr"^$pred?PSETP$bool2$bool $p3, $p0, $p12, $p29, $p39;"o, } ],
+    CSET   => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o,  } ], #TODO
+    CSETP  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO
+    P2R    => [ { type => $x32T,  code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o,   } ],
+    R2P    => [ { type => $cmpT,  code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o,   } ],
+
+    #Texture Instructions
+    # Handle the commonly used 1D texture functions.. but save the others for later
+    TLD    => [ { type => $gmemT, code => 0xdd38000000000000, rule => qr"^$pred?TLD\.B\.LZ\.$tld $r0, $r8, $r20, $hex, \dD, $i31w4;"o, } ], #Partial
+    TLDS   => [ { type => $gmemT, code => 0xda0000000ff00000, rule => qr"^$pred?TLDS\.LZ\.$tld $r28, $r0, $r8, $i36w20, \dD, $chnls;"o,} ], #Partial
+    TEX    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o,   } ], #TODO
+    TLD4   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o,  } ], #TODO
+    TXQ    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o,   } ], #TODO
+    TEXS   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEXS[^;]*;"o,  } ], #TODO
+    TLD4S  => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4S[^;]*;"o, } ], #TODO
+
+    #Compute Load/Store Instructions
+    LD     => [ { type => $gmemT, code => 0x8000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr, $p58;"o,      } ],
+    ST     => [ { type => $gmemT, code => 0xa000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0, $p58;"o,      } ],
+    LDG    => [ { type => $gmemT, code => 0xeed0000000000000, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o,           } ],
+    STG    => [ { type => $gmemT, code => 0xeed8000000000000, rule => qr"^$pred?STG$memCache$memType $addr, $r0;"o,           } ],
+    LDS    => [ { type => $smemT, code => 0xef48000000000000, rule => qr"^$pred?LDS$memCache$memType $r0, $addr;"o,           } ],
+    STS    => [ { type => $smemT, code => 0xef58000000000000, rule => qr"^$pred?STS$memCache$memType $addr, $r0;"o,           } ],
+    LDL    => [ { type => $gmemT, code => 0xef40000000000000, rule => qr"^$pred?LDL$memCache$memType $r0, $addr;"o,           } ],
+    STL    => [ { type => $gmemT, code => 0xef50000000000000, rule => qr"^$pred?STL$memCache$memType $addr, $r0;"o,           } ],
+    LDC    => [ { type => $gmemT, code => 0xef90000000000000, rule => qr"^$pred?LDC$memCache$memType $r0, $ldc;"o,            } ],
+    # Note for ATOM(S).CAS operations the last register needs to be in sequence with the second to last (as it's not encoded).
+    ATOM   => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o,   } ],
+    ATOMS  => [ { type => $smemT, code => 0xec00000000000000, rule => qr"^$pred?ATOMS$atom $r0, $addr2, $r20(?:, $r39a)?;"o,  } ],
+    RED    => [ { type => $gmemT, code => 0xebf8000000000000, rule => qr"^$pred?RED$atom $addr2, $r0;"o,                      } ],
+    CCTL   => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o,  } ], #TODO NOTE(review): code is the same value as FCHK's; confirm
+    CCTLL  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO NOTE(review): code is the same value as FCHK's; confirm
+    CCTLT  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLT[^;]*;"o, } ], #TODO NOTE(review): code is the same value as FCHK's; confirm
+
+    #Surface Memory Instructions (haven't gotten to these yet..)
+    SUATOM => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUATOM[^;]*;"o, } ], #TODO
+    SULD   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o,   } ], #TODO
+    SURED  => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SURED[^;]*;"o,  } ], #TODO
+    SUST   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUST[^;]*;"o,   } ], #TODO
+
+    #Control Instructions
+    BRA    => [
+                { type => $x32T, code => 0xe24000000000000f, rule => qr"^$pred?BRA(?<U>\.U)? $i20w24;"o,         },
+                { type => $x32T, code => 0xe240000000000002, rule => qr"^$pred?BRA(?<U>\.U)? CC\.EQ, $i20w24;"o, },
+              ],
+    BRX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o,                      } ], #TODO
+    JMP    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMP[^;]*;"o,                      } ], #TODO
+    JMX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o,                      } ], #TODO
+    SSY    => [ { type => $x32T, code => 0xe290000000000000, rule => qr"^$noPred?SSY $i20w24;"o,                 } ],
+    SYNC   => [ { type => $x32T, code => 0xf0f800000000000f, rule => qr"^$pred?SYNC;"o,                          } ],
+    CAL    => [ { type => $x32T, code => 0xe260000000000040, rule => qr"^$noPred?CAL $i20w24;"o,                 } ],
+    JCAL   => [ { type => $x32T, code => 0xe220000000000040, rule => qr"^$noPred?JCAL $i20w24;"o,                } ],
+    PRET   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o,                     } ], #TODO
+    RET    => [ { type => $x32T, code => 0xe32000000000000f, rule => qr"^$pred?RET;"o,                           } ],
+    BRK    => [ { type => $x32T, code => 0xe34000000000000f, rule => qr"^$pred?BRK;"o,                           } ],
+    PBK    => [ { type => $x32T, code => 0xe2a0000000000000, rule => qr"^$noPred?PBK $i20w24;"o,                 } ],
+    CONT   => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o,                          } ],
+    PCNT   => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o,                } ],
+    EXIT   => [ { type => $x32T, code => 0xe30000000000000f, rule => qr"^$pred?EXIT;"o,                          } ],
+    PEXIT  => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PEXIT[^;]*;"o,                    } ], #TODO
+    BPT    => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o,           } ],
+
+    #Miscellaneous Instructions
+    NOP    => [ { type => $x32T,  code => 0x50b0000000000f00, rule => qr"^$pred?NOP;"o,                                     } ],
+    CS2R   => [ { type => $x32T,  code => 0x50c8000000000000, rule => qr"^$pred?CS2R $r0, $sr;"o,                           } ],
+    S2R    => [ { type => $s2rT,  code => 0xf0c8000000000000, rule => qr"^$pred?S2R $r0, $sr;"o,                            } ],
+    B2R    => [ { type => $x32T,  code => 0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o,                                 } ],
+    BAR    => [ { type => $gmemT, code => 0xf0a8000000000000, rule => qr"^$pred?BAR$bar;"o,                                 } ],
+    DEPBAR => [
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, },
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o,              },
+              ],
+    MEMBAR => [ { type => $x32T,  code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o,                             } ],
+    VOTE   => [ { type => $voteT, code => 0x50d8000000000000, rule => qr"^$pred?VOTE$vote (?:$r0, |(?<nor0>))$p45, $p39;"o, } ],
+    R2B    => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?R2B[^;]*;"o,                                } ], #TODO
+
+    #Video Instructions... Need to finish
+    VADD   => [   { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMAD   => [
+                  { type => $x32T,  code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, },
+                  { type => $shftT, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, },
+              ],
+    VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial (NOTE(review): 0x2044000000000000 previously quoted here is VADD's code, not this entry's)
+    VMNMX    => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial (NOTE(review): 0x2044000000000000 previously quoted here is VADD's code, not this entry's)
+
+    VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial (NOTE(review): 0x2044000000000000 previously quoted here is VADD's code, not this entry's)
+);
+
+# Create map of capture groups to op code flags that need to be added (or removed)
+my @flags = grep /\S/, split "\n", q{;
+
+BFE, BFI, FLO, IADD, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD
+0x0100000000000000 neg
+
+FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP
+0x0100000000000000 neg
+
+PSET, PSETP
+0x0000000000008000 p12not
+0x0000000100000000 p29not
+
+FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE
+0x0000040000000000 p39not
+
+IADD, IADD3, XMAD, LEA, IMNMX
+0x0000800000000000 CC
+
+IADD32I
+0x0010000000000000 CC
+
+LEA
+0x0000000000000000 X
+
+SHF
+0x0004000000000000 W
+0x0001000000000000 HI
+
+SHF: type
+0x0000004000000000 U64
+0x0000006000000000 S64
+
+SHR, IMNMX, ISETP, ISET, ICMP, BFE
+0x0001000000000000 U32
+
+SHL
+0x0000008000000000 W
+
+SHFL
+0x0000000010000000 i20w8
+0x0000000020000000 i34w13
+
+SHFL: mode
+0x0000000000000000 IDX
+0x0000000040000000 UP
+0x0000000080000000 DOWN
+0x00000000c0000000 BFLY
+
+IMNMX: mode
+0x0000080000000000 XLO
+0x0000180000000000 XHI
+
+ISETP, ISET, ICMP: cmp
+0x0002000000000000 LT
+0x0004000000000000 EQ
+0x0006000000000000 LE
+0x0008000000000000 GT
+0x000a000000000000 NE
+0x000c000000000000 GE
+
+ISETP, ISET, PSETP, PSET: bool
+0x0000000000000000 AND
+0x0000200000000000 OR
+0x0000400000000000 XOR
+
+PSETP, PSET: bool2
+0x0000000000000000 AND
+0x0000000001000000 OR
+0x0000000002000000 XOR
+
+ISETP, ISET
+0x0000080000000000 X
+
+LOP: bool
+0x0000000000000000 AND
+0x0000020000000000 OR
+0x0000040000000000 XOR
+0x0000060000000000 PASS_B
+
+LOP:
+0x0000010000000000 INV
+
+LOP: z
+0x0000200000000000 Z
+0x0000300000000000 NZ
+
+LOP
+0x0007000000000000 noz
+
+LOP32I: bool
+0x0000000000000000 AND
+0x0020000000000000 OR
+0x0040000000000000 XOR
+
+PRMT: mode
+0x0001000000000000 F4E
+0x0002000000000000 B4E
+0x0003000000000000 RC8
+0x0004000000000000 ECL
+0x0005000000000000 ECR
+0x0006000000000000 RC16
+
+XMAD: type1
+0x0000000000000000 U16
+0x0001000000000000 S16
+
+XMAD: type2
+0x0000000000000000 U16
+0x0002000000000000 S16
+
+XMAD: mode
+0x0000002000000000 MRG
+0x0000001000000000 PSL
+0x0008000000000000 CHI
+0x0004000000000000 CLO
+0x000c000000000000 CSFU
+
+XMAD: modec
+0x0004000000000000 CLO
+0x0008000000000000 CHI
+0x000c000000000000 CSFU
+0x0040000000000000 X
+0x0080000000000000 PSL
+0x0100000000000000 MRG
+
+XMAD
+0x0010000000000000 CBCC
+
+XMAD: r8part
+0x0000000000000000 H0
+0x0020000000000000 H1
+
+XMAD: r20part
+0x0000000000000000 H0
+0x0000000800000000 H1
+
+XMAD: r20partx
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+XMAD: r39part
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part
+0x0000000000000000 B0
+0x0000001000000000 B1
+0x0000002000000000 B2
+0x0000003000000000 B3
+0x0000001000000000 H1
+0x0000000000000000 H0
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part
+0x0000000000000000 B0
+0x0000000010000000 B1
+0x0000000020000000 B2
+0x0000000030000000 B3
+0x0000000010000000 H1
+0x0000000000000000 H0
+
+VMAD
+0x0040000000000000 r8neg
+0x0020000000000000 r39neg
+0x0008000000000000 SHR_7
+0x0010000000000000 SHR_15
+0x0060000000000000 PO
+0x0080000000000000 SAT
+
+VMNMX
+0x0100000000000000 MX
+
+VADD, VABSDIFF, VMNMX
+0x0080000000000000 SAT
+0x0040000000000000 UD
+0x0040000000000000 SD
+
+VSET: cmp
+0x0040000000000000 LT
+0x0080000000000000 EQ
+0x00c0000000000000 LE
+0x0100000000000000 GT
+0x0140000000000000 NE
+0x0180000000000000 GE
+
+VADD, VSET: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VABSDIFF: mode
+0x0003000000000000 ACC
+0x000b000000000000 MIN
+0x0013000000000000 MAX
+0x0023000000000000 MRG_16H
+0x002b000000000000 MRG_16L
+0x0033000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x003b000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMNMX: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1
+0x0000000000000000 U
+0x0001000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign2
+0x0000000000000000 U
+0x0002000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size1
+0x0000000000000000 8
+0x0000004000000000 16
+0x0000006000000000 32
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size2
+0x0000000000000000 8
+0x0000000040000000 16
+0x0000000060000000 32
+
+IADD3: type
+0x0001000000000000 X
+0x0000002000000000 RS
+0x0000004000000000 LS
+
+IADD3: r8part
+0x0000000000000000 H0
+0x0000001000000000 H1
+
+IADD3: r20part
+0x0000000080000000 H0
+
+IADD3: r39part
+0x0000000200000000 H0
+
+IADD3
+0x0008000000000000 r8neg
+0x0004000000000000 r20neg
+0x0002000000000000 r39neg
+
+IADD
+0x0000080000000000 X
+0x0004000000000000 SAT
+
+IADD, ISCADD
+0x0002000000000000 r8neg
+0x0001000000000000 r20neg
+
+IADD32I
+0x0100000000000000 r8neg
+0x0020000000000000 X
+
+DEPBAR: SB
+0x0000000000000000 SB0
+0x0000000004000000 SB1
+0x0000000008000000 SB2
+0x000000000c000000 SB3
+0x0000000010000000 SB4
+0x0000000014000000 SB5
+
+DEPBAR: cmp
+0x0000000020000000 LE
+
+DEPBAR
+0x0000000000000001 db0
+0x0000000000000002 db1
+0x0000000000000004 db2
+0x0000000000000008 db3
+0x0000000000000010 db4
+0x0000000000000020 db5
+
+F2F, F2I, I2F, I2I: destWidth
+0x0000000000000000 8
+0x0000000000000100 16
+0x0000000000000200 32
+0x0000000000000300 64
+
+F2F, F2I, I2F, I2I: srcWidth
+0x0000000000000000 8
+0x0000000000000400 16
+0x0000000000000800 32
+0x0000000000000c00 64
+
+F2F, F2I, I2F, I2I: destSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000001000 S
+
+F2F, F2I, I2F, I2I: srcSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000002000 S
+
+F2I, I2F, I2I: r20part
+0x0000000000000000 H0
+0x0000040000000000 H1
+0x0000000000000000 B0
+0x0000020000000000 B1
+0x0000040000000000 B2
+0x0000060000000000 B3
+
+F2F: r20part
+0x0000000000000000 H0
+0x0000020000000000 H1
+
+F2F: round
+0x0000040000000000 ROUND
+0x0000048000000000 FLOOR
+0x0000050000000000 CEIL
+0x0000058000000000 TRUNC
+
+F2I: round
+0x0000000000000000 ROUND
+0x0000008000000000 FLOOR
+0x0000010000000000 CEIL
+0x0000018000000000 TRUNC
+
+HADD2, HMUL2: r8part
+0x0001000000000000 H0_H0
+0x0000000000000000 H1_H1
+
+HFMA2: r20part
+0x0000000020000000 H0_H0
+0x0000000030000000 H1_H1
+
+FADD, DADD, FMUL, DMUL, F2F, I2F: rnd
+0x0000000000000000 RN
+0x0000008000000000 RM
+0x0000010000000000 RP
+0x0000018000000000 RZ
+
+DFMA: rnd
+0x0000000000000000 RN
+0x0004000000000000 RM
+0x0008000000000000 RP
+0x000c000000000000 RZ
+
+FFMA: rnd
+0x0000000000000000 RN
+0x0008000000000000 RM
+0x0010000000000000 RP
+0x0018000000000000 RZ
+
+FFMA
+0x0020000000000000 FTZ
+
+F2F, F2I, FADD, FMUL, FMNMX
+0x0000100000000000 FTZ
+
+FADD32I
+0x0080000000000000 FTZ
+
+FMUL32I
+0x0020000000000000 FTZ
+
+FSET
+0x0080000000000000 FTZ
+
+FSETP, FCMP
+0x0000800000000000 FTZ
+
+HADD2, HMUL2
+0x0000008000000000 FTZ
+
+HFMA2
+0x0000002000000000 FTZ
+
+FADD, FFMA, FMUL, F2F, I2I
+0x0004000000000000 SAT
+
+FADD, DADD, FMNMX, DMNMX, MUFU
+0x0001000000000000 r8neg
+
+FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I
+0x0000200000000000 r20neg
+
+FMUL, DMUL, FFMA, DFMA
+0x0001000000000000 r20neg
+
+FFMA, DFMA
+0x0002000000000000 r39neg
+
+FADD, DADD, FMNMX, DMNMX
+0x0000400000000000 r8abs
+
+FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I
+0x0002000000000000 r20abs
+
+FSETP, DSETP, FSET, DSET
+0x0000080000000000 r8neg
+0x0000000000000040 r20neg
+0x0000000000000080 r8abs
+0x0000100000000000 r20abs
+
+RRO: func
+0x0000000000000000 SINCOS
+0x0000008000000000 EX2
+
+MUFU: func
+0x0000000000000000 COS
+0x0000000000100000 SIN
+0x0000000000200000 EX2
+0x0000000000300000 LG2
+0x0000000000400000 RCP
+0x0000000000500000 RSQ
+0x0000000000600000 RCP64H
+0x0000000000700000 RSQ64H
+
+FSETP, DSETP, FSET, DSET, FCMP: cmp
+0x0001000000000000 .LT
+0x0002000000000000 .EQ
+0x0003000000000000 .LE
+0x0004000000000000 .GT
+0x0004000000000000
+0x0005000000000000 .NE
+0x0006000000000000 .GE
+0x0007000000000000 .NUM
+0x0008000000000000 .NAN
+0x0009000000000000 .LTU
+0x000a000000000000 .EQU
+0x000b000000000000 .LEU
+0x000c000000000000 .GTU
+0x000d000000000000 .NEU
+0x000e000000000000 .GEU
+
+FSETP, DSETP, FSET, DSET: bool
+0x0000000000000000 AND
+0x0000200000000000 OR
+0x0000400000000000 XOR
+
+HSETP2: cmp
+0x0000002800000000 .NE
+
+HSETP2: bool
+0x0000000000000000 AND
+
+S2R: sr
+0x0000000000000000 LANEID
+0x0000000000200000 VIRTCFG
+0x0000000000300000 VIRTID
+0x0000000002100000 TID.X
+0x0000000002200000 TID.Y
+0x0000000002300000 TID.Z
+0x0000000002500000 CTAID.X
+0x0000000002600000 CTAID.Y
+0x0000000002700000 CTAID.Z
+0x0000000003800000 EQMASK
+0x0000000003900000 LTMASK
+0x0000000003a00000 LEMASK
+0x0000000003b00000 GTMASK
+0x0000000003c00000 GEMASK
+
+CS2R: sr
+0x0000000005000000 CLOCKLO
+0x0000000005100000 CLOCKHI
+0x0000000005200000 GLOBALTIMERLO
+0x0000000005300000 GLOBALTIMERHI
+
+B2R
+0x0000e00000000000 nop45
+
+BAR
+0x0000100000000000 i8w4
+0x0000080000000000 nor20
+0x0000038000000000 nop39
+
+BAR: mode
+0x0000000000000000 SYNC
+0x0000000100000000 ARV
+0x0000000200000000 RED
+
+BAR: red
+0x0000000000000000 POPC
+0x0000000800000000 AND
+0x0000001000000000 OR
+
+MEMBAR: mode
+0x0000000000000000 CTA
+0x0000000000000100 GL
+0x0000000000000200 SYS
+
+VOTE: mode
+0x0000000000000000 ALL
+0x0001000000000000 ANY
+0x0002000000000000 EQ
+
+VOTE
+0x00000000000000ff nor0
+
+BRA
+0x0000000000000080 U
+
+TLDS: chnls
+0x0010000000000000 RGBA
+
+TLDS
+0x0002000000000000 NODEP
+
+LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS
+0x000000000000ff00 nor8
+
+LD, ST: type
+0x0000000000000000 .U8
+0x0020000000000000 .S8
+0x0040000000000000 .U16
+0x0060000000000000 .S16
+0x0080000000000000
+0x0080000000000000 .32
+0x00a0000000000000 .64
+0x00c0000000000000 .128
+
+LD, ST: cache
+0x0100000000000000 CG
+0x0200000000000000 CS
+0x0300000000000000 CV
+0x0300000000000000 WT
+
+LDG, STG, LDS, STS, LDL, STL, LDC: type
+0x0000000000000000 .U8
+0x0001000000000000 .S8
+0x0002000000000000 .U16
+0x0003000000000000 .S16
+0x0004000000000000
+0x0004000000000000 .32
+0x0005000000000000 .64
+0x0006000000000000 .128
+
+LDG, STG: cache
+0x0000400000000000 CG
+0x0000800000000000 CI
+0x0000800000000000 CS
+0x0000c00000000000 CV
+0x0000c00000000000 WT
+
+LDL: cache
+0x0000200000000000 CI
+
+LDC: cache
+0x0000100000000000 IL
+
+LDG, STG, LDS, STS, LDL, STL, LDC
+0x0000200000000000 E
+
+LDS
+0x0000100000000000 U
+
+RED: type
+0x0000000000000000
+0x0000000000100000 .S32
+0x0000000000200000 .U64
+0x0000000000300000 .F32.FTZ.RN
+0x0000000000400000 .F16x2.FTZ.RN
+0x0000000000500000 .S64
+
+RED: mode
+0x0000000000000000 ADD
+0x0000000000800000 MIN
+0x0000000001000000 MAX
+0x0000000001800000 INC
+0x0000000002000000 DEC
+0x0000000002800000 AND
+0x0000000003000000 OR
+0x0000000003800000 XOR
+
+ATOM: type
+0x0000000000000000
+0x0002000000000000 .S32
+0x0004000000000000 .U64
+0x0006000000000000 .F32.FTZ.RN
+0x0008000000000000 .F16x2.FTZ.RN
+0x000a000000000000 .S64
+0x0002000000000000 .64
+
+ATOM, RED
+0x0001000000000000 E
+
+ATOM: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x03f0000000000000 CAS
+
+ATOMS: type
+0x0000000000000000
+0x0000000010000000 .S32
+0x0000000020000000 .U64
+0x0000000030000000 .S64
+0x0010000000000000 .64
+
+ATOMS: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x0240000000000000 CAS
+};
+
+# Build the %flags lookup from the flag table text (@flags).
+# The existence of a capture group can map directly to an op code adjustment, or...
+# The named capture group value can map the op code adjustment from among several options.
+#   $flags{OP}{NAME}         = value   for header lines of the form "OP, OP, ..."
+#   $flags{OP}{GROUP}{NAME}  = value   for header lines of the form "OP, OP, ...: GROUP"
+our %flags;
+my (@ops, $flag);
+foreach my $line (@flags)
+{
+    # Anything that is not a "0x... NAME" value line is treated as a header:
+    # it (re)sets the op list and the optional group name for the lines that follow.
+    unless ($line =~ m'^(0x[0-9a-z]+)\s*(.*)')
+    {
+        my ($ops, $name) = split ':\s*', $line;
+        @ops  = split ',\s*', $ops;
+        $flag = $name;
+        next;
+    }
+
+    my ($val, $key) = (hex($1), $2);
+    foreach my $op (@ops)
+    {
+        # named rules (op: name)
+        if ($flag) { $flags{$op}{$flag}{$key} = $val; }
+        # simple existence check rules
+        else       { $flags{$op}{$key}        = $val; }
+    }
+}
+
+# Match an instruction string against a grammar entry's compiled rule.
+# Returns a hashref holding a copy of the named captures, or nothing when the rule does not match.
+sub parseInstruct
+{
+    my ($inst, $grammar) = @_;
+
+    if ($inst =~ $grammar->{rule})
+    {
+        # copy %+ so the captures outlive the next regex match
+        return { %+ };
+    }
+    return;
+}
+
+# for immediate or constant operands and a given opcode, bits 56-63 get transformed
+# (genCode XORs the values below, shifted left by 56, into the base op code)
+
+# capture group names that mark an immediate operand
+my %immedOps = map { $_ => 1 } qw(i20 f20 d20);
+# keyed by the top byte of the base op code ($code >> 56); value is the XOR adjustment for that byte
+my %immedCodes =
+(
+    0x5c => 0x64,
+    0x5b => 0x6d,
+    0x59 => 0x6b,
+    0x58 => 0x68,
+);
+# keyed by the constant-operand capture name; value is the XOR adjustment for the top byte
+my %constCodes =
+(
+    c20 => 0x10,
+    c39 => 0x08,
+);
+# one bit per reuse capture group; the bits are OR'd together into the reuse code
+my %reuseCodes = (reuse1 => 1, reuse2 => 2, reuse3 => 4);
+
+# Compute only the register reuse bits for an instruction.
+# Every reuse capture group that holds a true value contributes its bit from %reuseCodes.
+sub genReuseCode
+{
+    my $capData = shift;
+
+    my $bits = 0;
+    foreach my $slot (keys %reuseCodes)
+    {
+        next unless $capData->{$slot};
+        $bits |= $reuseCodes{$slot};
+    }
+    return $bits;
+}
+
+# Generate an op code from regex capture data
+# if you pass in a test array ref it will populate it with the matching capture groups
+#
+#   $op      - instruction name, used to look up its flag table in %flags
+#   $grammar - grammar entry; its {code} value is the base op code
+#   $capData - hashref of named captures for the instruction
+#              NOTE: this hash is modified: the predicate keys (noPred, predNum, predNot)
+#              and the reuse keys (reuse1..reuse3) are deleted as they are consumed
+#   $test    - optional array ref; receives the name of every capture that was acted upon
+#
+# Returns the list ($code, $reuse).
+sub genCode
+{
+    my ($op, $grammar, $capData, $test) = @_;
+
+    my $flags     = $flags{$op};
+    my $code      = $grammar->{code};
+    my $reuse     = 0;
+    # adjustment for the immediate form, looked up by the top byte of the base code
+    # (undefined when the op has no entry in %immedCodes)
+    my $immedCode = $immedCodes{$code >> 56};
+
+    #print map "$_: $capData->{$_}\n", keys %capData if $op eq 'I2I';
+
+    # process the instruction predicate (if valid for this instruction)
+    if (exists $capData->{noPred})
+    {
+        delete $capData->{noPred};
+        push @$test, 'noPred' if $test;
+    }
+    else
+    {
+        # predicate number defaults to 7 when none was captured; predNot sets bit 3
+        my $p = defined($capData->{predNum}) ? $capData->{predNum} : 7;
+        push @$test, 'predNum' if $test;
+        if (exists $capData->{predNot})
+        {
+            $p |= 8;
+            push @$test, 'predNot' if $test;
+        }
+        # the 4 bit predicate field starts at bit 16
+        $code ^= $p << 16;
+        delete @{$capData}{qw(predNum predNot)};
+
+    }
+    # process the register reuse flags
+    foreach my $rcode (qw(reuse1 reuse2 reuse3))
+    {
+        if (delete $capData->{$rcode})
+        {
+            $reuse |= $reuseCodes{$rcode};
+            push @$test, $rcode if $test;
+        }
+    }
+
+    # everything left in $capData is an operand and/or a flag; all adjustments are XORs,
+    # so the hash iteration order does not affect the result
+    foreach my $capture (keys %$capData)
+    {
+        # change the base code for immediate versions of the op
+        if (exists $immedOps{$capture})
+            { $code ^= $immedCode << 56; }
+        # change the base code for constant versions of the op
+        elsif (exists $constCodes{$capture})
+            { $code ^= $constCodes{$capture} << 56; }
+
+        # if capture group is an operand then process and add that data to code
+        if (exists $operands{$capture})
+        {
+            # don't process the r20 that comes with the r39s20 capture
+            unless ($capture eq 'r20' && exists $capData->{r39s20})
+            {
+                $code ^= $operands{$capture}->($capData->{$capture});
+                push @$test, $capture if $test;
+            }
+        }
+
+        # Add matching flags (an operand might also add/remove a flag)
+        if (exists $flags->{$capture})
+        {
+            # a named multivalue flag
+            if (ref $flags->{$capture})
+            {
+                $code ^= $flags->{$capture}{$capData->{$capture}};
+                push @$test, "$capture:$capData->{$capture}" if $test;
+            }
+            # a simple exists flag
+            else
+            {
+                $code ^= $flags->{$capture};
+                push @$test, $capture if $test;
+            }
+        }
+        elsif (!exists $operands{$capture} && !$test)
+        {
+            # Every capture group should be acted upon.  Missing one is a bug.
+            warn "UNUSED: $op: $capture: $capData->{$capture}\n";
+            warn Dumper($flags);
+        }
+    }
+
+    return $code, $reuse;
+}
+
+
+# Building blocks for parsing one line of assembly.
+# control field: five colon separated parts, read by readCtrl as wait:read:write:yield:stall
+my $CtrlRe = qr'(?<ctrl>[0-9a-fA-F\-]{2}:[1-6\-]:[1-6\-]:[\-yY]:[0-9a-fA-F])';
+# instruction predicate such as "@P0 " or "@!P0 " (trailing whitespace is part of the capture)
+my $PredRe = qr'(?<pred>@!?(?<predReg>P\d)\s+)';
+# optional predicate, the op name, then everything up to and including the first ';'
+my $InstRe = qr"$PredRe?(?<op>\w+)(?<rest>[^;]*;)"o;
+# whatever follows the instruction on the same line
+my $CommRe = qr'(?<comment>.*)';
+
+# Parse one assembly line of the form "<ctrl> <instruction>; <comment>".
+# Returns a hashref describing the line (with the control field packed by readCtrl),
+# or undef when the line does not have that shape.
+sub processAsmLine
+{
+    my ($line, $lineNum) = @_;
+
+    return undef unless $line =~ m"^$CtrlRe(?<space>\s+)$InstRe$CommRe"o;
+
+    # copy the named captures into a lexical hash before calling the helpers
+    my %cap = %+;
+
+    my %parsed = (lineNum => $lineNum);
+    @parsed{qw(pred predReg space op comment)} = @cap{qw(pred predReg space op comment)};
+    $parsed{inst} = normalizeSpacing($cap{pred} . $cap{op} . $cap{rest});
+    $parsed{ctrl} = readCtrl($cap{ctrl}, $line);
+
+    return \%parsed;
+}
+
+# Parse one instruction line of a sass dump: "/*addr*/ <instruction>; /* 0x<code> ...".
+# Returns a hashref with the address, predicate, op, normalized instruction text
+# (with and without the predicate) and the numeric op code, or undef when the line does not match.
+sub processSassLine
+{
+    my $line = shift;
+
+    return undef unless $line =~ m"^\s+/\*(?<num>[0-9a-f]+)\*/\s+$InstRe\s+/\* (?<code>0x[0-9a-f]+)"o;
+
+    my %cap  = %+;
+    my $body = $cap{op} . $cap{rest};
+
+    return {
+        num  => hex($cap{num}),
+        pred => $cap{pred},
+        op   => $cap{op},
+        ins  => normalizeSpacing($body),
+        inst => normalizeSpacing($cap{pred} . $body),
+        code => hex($cap{code}),
+    };
+}
+
+# Parse a control word line of a sass dump ("   /* 0x<code> ...").
+# When $ctrl is a reference, the three 17 bit control fields (bits 0, 21 and 42) are appended to it.
+# When $ruse is a reference, the three 4 bit reuse fields (bits 17, 38 and 59) are appended to it.
+# Returns 1 when the line held a control word, 0 otherwise.
+sub processSassCtrlLine
+{
+    my ($line, $ctrl, $ruse) = @_;
+
+    my ($hexWord) = $line =~ m'^\s+\/\* (0x[0-9a-f]+)'
+        or return 0;
+    my $code = hex($hexWord);
+
+    if (ref $ctrl)
+    {
+        foreach my $shift (0, 21, 42)
+        {
+            push @$ctrl, ($code >> $shift) & 0x1ffff;
+        }
+    }
+    if (ref $ruse)
+    {
+        foreach my $shift (17, 38, 59)
+        {
+            push @$ruse, ($code >> $shift) & 0xf;
+        }
+    }
+    return 1;
+}
+
+# Expand the XMAD macro instructions (XMAD.LO, XMAD.LO2, XMAD.LO2C) found in the
+# source text into sequences of plain XMAD instructions.
+# Each generated instruction repeats the control field, spacing and predicate of the
+# macro line; the original trailing comment is kept on the first generated line only.
+# Dies when the destination and the first operand of a macro are the same register.
+# Returns the modified text.
+sub replaceXMADs
+{
+    my $file = shift;
+
+# XMAD.LO d, a, b, c, x;
+# ----------------------
+# XMAD.MRG x, a, b.H1, RZ;
+# XMAD d, a, b, c;
+# XMAD.PSL.CBCC d, a.H1, x.H1, d;
+# ----------------------
+# XMAD d, a, 0xffff, c;
+# XMAD.PSL d, a.H1, 0xffff, d;
+
+    # XMAD.LO: five operands, x is used as the destination of the XMAD.MRG step
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD\.LO\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<x>\w+)\s*;$CommRe/
+
+        die "XMAD.LO: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s
+%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s;
+%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;',
+                @+{qw(ctrl space pred d a b c x comment)}
+    /egmos;
+
+    # XMAD.LO2: two instruction form, b may be an immediate; the second step uses a.H1
+    # the optional .S16 or .U16 modifier pair is carried over to both instructions
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>-?$immed|\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/
+
+        die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    # XMAD.LO2C: two instruction form, b may be a constant; the second step uses b.H1
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<c>\w+)\s*;$CommRe/
+
+        die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    #TODO: add more XMAD macros
+    return $file;
+}
+# Collapse whitespace so the instruction regexes stay simple:
+# every tab becomes a space, then each run of two or more whitespace characters becomes one space.
+sub normalizeSpacing
+{
+    my ($text) = @_;
+
+    $text =~ tr/\t/ /;
+    $text =~ s/\s\s+/ /g;
+
+    return $text;
+}
+
+
+# Render a packed control code in the textual "wait:read:write:yield:stall" notation.
+sub printCtrl
+{
+    my $code = shift;
+
+    # unpack the fields, least significant first
+    my $stall   =  $code        & 0xf;    # bits 0-3
+    my $noYield = ($code >> 4)  & 0x1;    # bit 4 (set means no yield)
+    my $write   = ($code >> 5)  & 0x7;    # bits 5-7: write dependency barrier
+    my $read    = ($code >> 8)  & 0x7;    # bits 8-10: read dependency barrier
+    my $wait    = ($code >> 11) & 0x3f;   # bits 11-16: wait on dependency barrier
+
+    # barrier value 7 means "none"; otherwise barriers are shown 1-based
+    my @fields =
+    (
+        $wait ? sprintf('%02x', $wait) : '--',
+        $read  == 7 ? '-' : $read  + 1,
+        $write == 7 ? '-' : $write + 1,
+        $noYield ? '-' : 'Y',
+        sprintf('%x', $stall),
+    );
+    return join ':', @fields;
+}
+# Pack the textual control notation "wait:read:write:yield:stall" into its numeric form.
+# $context is only used in the error message.
+# Dies when the wait mask does not fit in 6 bits.
+sub readCtrl
+{
+    my ($ctrl, $context) = @_;
+    my @part = split ':', $ctrl, 6;
+
+    # '--' / '-' mean "no barrier"; barriers are written 1-based and stored 0-based
+    my $wait    = $part[0] eq '--' ? 0 : hex $part[0];
+    my $read    = $part[1] eq '-'  ? 7 : $part[1] - 1;
+    my $write   = $part[2] eq '-'  ? 7 : $part[2] - 1;
+    # the yield bit is stored inverted: 'y' or 'Y' clears it, anything else sets it
+    my $noYield = lc($part[3]) eq 'y' ? 0 : 1;
+    my $stall   = hex $part[4];
+
+    unless ($wait == ($wait & 0x3f))
+    {
+        die sprintf('wait dep out of range(0x00-0x3f): %x at %s', $wait, $context);
+    }
+
+    return ($wait    << 11)
+         | ($read    << 8)
+         | ($write   << 5)
+         | ($noYield << 4)
+         | ($stall   << 0);
+}
+
+# Resolve a register name through the register map.
+# Names that are not in the map, or that map to a reference, are returned unchanged;
+# otherwise the mapped value is returned.
+sub getRegNum
+{
+    my ($regMap, $regName) = @_;
+
+    return $regName unless exists $regMap->{$regName};
+
+    my $mapped = $regMap->{$regName};
+    return ref($mapped) ? $regName : $mapped;
+}
+
+# List the registers covered by the r0 operand of an instruction.
+# A type of '.64' (or an i31w4 mask of '0x3') covers two registers, '.128' (or '0xf') covers four,
+# anything else covers just the named register.  Numbered registers (R<n>) are expanded
+# arithmetically; named registers are looked up in $vectors.
+# Returns nothing when there is no r0 operand or it is RZ.
+sub getVecRegisters
+{
+    my ($vectors, $capData) = @_;
+
+    my $regName = $capData->{r0};
+    return unless $regName;
+    return if $regName eq 'RZ';
+
+    # the 64 bit test is made first, exactly as the width checks are ordered
+    my $width = ($capData->{type} eq '.64'  || $capData->{i31w4} eq '0x3') ? 2
+              : ($capData->{type} eq '.128' || $capData->{i31w4} eq '0xf') ? 4
+              :                                                              1;
+    return $regName if $width == 1;
+
+    if ($regName =~ m'^R(\d+)$')
+    {
+        my $base = $1;
+        return map "R$_", ($base .. $base + $width - 1);
+    }
+
+    if ($width == 2)
+    {
+        confess "$regName not a 64bit vector register" unless exists $vectors->{$regName};
+        return @{$vectors->{$regName}}[0,1];
+    }
+
+    confess "$regName not a 128bit vector register" unless exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4;
+    return @{$vectors->{$regName}};
+}
+
+# List the registers covered by the r8 (address) operand of an instruction.
+# When the E flag was captured the address covers two registers: a numbered register R<n>
+# expands to R<n>, R<n+1>; a named register is looked up in $vectors.
+# Returns nothing when there is no r8 operand or it is RZ.
+sub getAddrVecRegisters
+{
+    my ($vectors, $capData) = @_;
+
+    my $regName = $capData->{r8};
+    return unless $regName;
+    return if $regName eq 'RZ';
+
+    # without the E flag the address is the single named register
+    return $regName unless exists $capData->{E};
+
+    if ($regName =~ m'^R(\d+)$')
+    {
+        my $base = $1;
+        return map "R$_", ($base .. $base + 1);
+    }
+
+    unless (exists $vectors->{$regName})
+    {
+        # dump the known vectors before failing
+        print Dumper($vectors);
+        confess "$regName not a 64bit vector register";
+    }
+    return @{$vectors->{$regName}}[0,1];
+}
+
+__END__
+
+
+
diff --git a/Assembler/PascalAs/blib/lib/auto/MaxAs/MaxAs/.exists b/Assembler/PascalAs/blib/lib/auto/MaxAs/MaxAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/blib/lib/auto/PascalAs/PascalAs/.exists b/Assembler/PascalAs/blib/lib/auto/PascalAs/PascalAs/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/blib/man1/.exists b/Assembler/PascalAs/blib/man1/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/blib/man3/.exists b/Assembler/PascalAs/blib/man3/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/blib/man3/MaxAs::MaxAs.3pm b/Assembler/PascalAs/blib/man3/MaxAs::MaxAs.3pm
new file mode 100644
index 0000000..9f95fff
--- /dev/null
+++ b/Assembler/PascalAs/blib/man3/MaxAs::MaxAs.3pm
@@ -0,0 +1,170 @@
+.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.13)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is turned on, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.ie \nF \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    nr % 0
+.    rr F
+.\}
+.el \{\
+.    de IX
+..
+.\}
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "MaxAs::MaxAs 3"
+.TH MaxAs::MaxAs 3 "2016-02-04" "perl v5.10.1" "User Contributed Perl Documentation"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+MaxAs::MaxAs \- Assembler for NVIDIA Maxwell architecture
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\&    maxas.pl [opts]
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+See the documentation at: https://github.com/NervanaSystems/maxas
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+See the documentation at: https://github.com/NervanaSystems/maxas
+.SH "AUTHOR"
+.IX Header "AUTHOR"
+Scott Gray, <sgray@nervanasys.com>
+.SH "COPYRIGHT AND LICENSE"
+.IX Header "COPYRIGHT AND LICENSE"
+The \s-1MIT\s0 License (\s-1MIT\s0)
+.PP
+Copyright (c) 2014 Scott Gray
+.PP
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the \*(L"Software\*(R"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+.PP
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+.PP
+\&\s-1THE\s0 \s-1SOFTWARE\s0 \s-1IS\s0 \s-1PROVIDED\s0 \*(L"\s-1AS\s0 \s-1IS\s0\*(R", \s-1WITHOUT\s0 \s-1WARRANTY\s0 \s-1OF\s0 \s-1ANY\s0 \s-1KIND\s0, \s-1EXPRESS\s0 \s-1OR\s0
+\&\s-1IMPLIED\s0, \s-1INCLUDING\s0 \s-1BUT\s0 \s-1NOT\s0 \s-1LIMITED\s0 \s-1TO\s0 \s-1THE\s0 \s-1WARRANTIES\s0 \s-1OF\s0 \s-1MERCHANTABILITY\s0,
+\&\s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0 \s-1AND\s0 \s-1NONINFRINGEMENT\s0. \s-1IN\s0 \s-1NO\s0 \s-1EVENT\s0 \s-1SHALL\s0 \s-1THE\s0
+\&\s-1AUTHORS\s0 \s-1OR\s0 \s-1COPYRIGHT\s0 \s-1HOLDERS\s0 \s-1BE\s0 \s-1LIABLE\s0 \s-1FOR\s0 \s-1ANY\s0 \s-1CLAIM\s0, \s-1DAMAGES\s0 \s-1OR\s0 \s-1OTHER\s0
+\&\s-1LIABILITY\s0, \s-1WHETHER\s0 \s-1IN\s0 \s-1AN\s0 \s-1ACTION\s0 \s-1OF\s0 \s-1CONTRACT\s0, \s-1TORT\s0 \s-1OR\s0 \s-1OTHERWISE\s0, \s-1ARISING\s0 \s-1FROM\s0,
+\&\s-1OUT\s0 \s-1OF\s0 \s-1OR\s0 \s-1IN\s0 \s-1CONNECTION\s0 \s-1WITH\s0 \s-1THE\s0 \s-1SOFTWARE\s0 \s-1OR\s0 \s-1THE\s0 \s-1USE\s0 \s-1OR\s0 \s-1OTHER\s0 \s-1DEALINGS\s0 \s-1IN\s0
+\&\s-1THE\s0 \s-1SOFTWARE\s0.
diff --git a/Assembler/PascalAs/blib/man3/PascalAs::PascalAs.3pm b/Assembler/PascalAs/blib/man3/PascalAs::PascalAs.3pm
new file mode 100644
index 0000000..22de6a2
--- /dev/null
+++ b/Assembler/PascalAs/blib/man3/PascalAs::PascalAs.3pm
@@ -0,0 +1,117 @@
+.\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.29)
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+.    ds C`
+.    ds C'
+'br\}
+.\"
+.\" Escape single quotes in literal strings from groff's Unicode transform.
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\"
+.\" If the F register is turned on, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
+..
+.nr rF 0
+.if \n(.g .if rF .nr rF 1
+.if (\n(rF:(\n(.g==0)) \{
+.    if \nF \{
+.        de IX
+.        tm Index:\\$1\t\\n%\t"\\$2"
+..
+.        if !\nF==2 \{
+.            nr % 0
+.            nr F 2
+.        \}
+.    \}
+.\}
+.rr rF
+.\" ========================================================================
+.\"
+.IX Title "PascalAs::PascalAs 3pm"
+.TH PascalAs::PascalAs 3pm "2018-11-05" "perl v5.22.1" "User Contributed Perl Documentation"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.if n .ad l
+.nh
+.SH "NAME"
+PascalAs::PascalAs \- Assembler for NVIDIA Maxwell architecture
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\&    Pascalas.pl [opts]
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+See the documentation at: https://github.com/NervanaSystems/pascalas
+.SH "SEE ALSO"
+.IX Header "SEE ALSO"
+See the documentation at: https://github.com/NervanaSystems/pascalas
+.SH "AUTHOR"
+.IX Header "AUTHOR"
+Scott Gray, <sgray@nervanasys.com>
+.SH "COPYRIGHT AND LICENSE"
+.IX Header "COPYRIGHT AND LICENSE"
+The \s-1MIT\s0 License (\s-1MIT\s0)
+.PP
+Copyright (c) 2014 Scott Gray
+.PP
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the \*(L"Software\*(R"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+.PP
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+.PP
+\&\s-1THE SOFTWARE IS PROVIDED \*(L"AS IS\*(R", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.\s0
diff --git a/Assembler/PascalAs/blib/script/.exists b/Assembler/PascalAs/blib/script/.exists
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/blib/script/maxas.pl b/Assembler/PascalAs/blib/script/maxas.pl
new file mode 100755
index 0000000..91cfa30
--- /dev/null
+++ b/Assembler/PascalAs/blib/script/maxas.pl
@@ -0,0 +1,289 @@
+#!/usr/bin/perl
+
+eval 'exec /usr/bin/perl  -S $0 ${1+"$@"}'
+    if 0; # not running under some shell
+use strict;
+use MaxAs::Cubin;
+use MaxAs::MaxAs;
+use Data::Dumper;
+use File::Spec;
+
+require 5.10.0;
+
+$Data::Dumper::Sortkeys = 1;
+
+# Command line driver.  The first argument selects the mode (list, test, extract,
+# sass, insert, pre, version); the remaining arguments are consumed from @ARGV by
+# the branch for that mode.  Any unknown mode prints the usage text and exits 1.
+my $mode = shift;
+
+# List cubin contents
+if ($mode =~ /^\-?\-l/i)
+{
+    my $cubinFile = shift or usage();
+
+    my $cubin = MaxAs::Cubin->new($cubinFile);
+
+    my $arch    = $cubin->arch;
+    my $class   = $cubin->class;
+    my $asize   = $cubin->address_size;
+    my $kernels = $cubin->listKernels;
+    my $symbols = $cubin->listSymbols;
+
+    printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $cubinFile, $arch, $class, $asize;
+
+    foreach my $ker (sort keys %$kernels)
+    {
+        printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $ker, @{$kernels->{$ker}}{qw(Linkage ParamCnt size RegCnt SharedSize BarCnt)};
+    }
+    foreach my $sym (sort keys %$symbols)
+    {
+        printf "Symbol: %s\n", $sym;
+    }
+}
+# Test that the assembler can reproduce the op codes this cubin or sass contains
+elsif ($mode =~ /^\-?\-t/i)
+{
+    # declare first, assign conditionally: "my $x = ... if COND" has undefined behaviour
+    my ($reg, $all);
+    $reg = shift if $ARGV[0] =~ /^\-?\-r/i;
+    $all = shift if $ARGV[0] =~ /^\-?\-a/i;
+    my $file = shift or usage();
+    my $fh;
+    # sass file
+    if (-T $file)
+    {
+        open $fh, '<', $file or die "$file: $!";
+    }
+    # cubin file
+    else
+    {
+        my $cubin = MaxAs::Cubin->new($file);
+        my $arch  = $cubin->arch;
+
+        # NOTE(review): $file is interpolated into a shell command line here; a file name
+        # containing shell metacharacters will be interpreted by the shell.
+        open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!";
+        my $first = <$fh>;
+        if ($first =~ /cuobjdump fatal/)
+        {
+            print $first;
+            exit(1);
+        }
+    }
+    exit(MaxAs::MaxAs::Test($fh, $reg, $all) ? 1 : 0);
+}
+# Extract an asm file containing the desired kernel
+elsif ($mode =~ /^\-?\-e/i)
+{
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    my $cubinFile = shift or usage();
+    my $asmFile   = shift;
+    my $cubin     = MaxAs::Cubin->new($cubinFile);
+    my $arch      = $cubin->arch;
+    my $kernels   = $cubin->listKernels;
+
+    #default the kernel name if not specified.
+    $kernelName ||= (sort keys %$kernels)[0];
+
+    my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName";
+
+    # NOTE(review): $kernelName and $cubinFile are interpolated into a shell command line.
+    # The error message reports the architecture that was actually used.
+    open my $in, "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile |" or die "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile: $!";
+    my $first = <$in>;
+    if ($first =~ /cuobjdump fatal/)
+    {
+        print $first;
+        exit(1);
+    }
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+
+    print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n";
+
+    print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt));
+
+    print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n";
+
+    print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params};
+
+    print $out "#\n# Instructions:\n\n";
+
+    MaxAs::MaxAs::Extract($in, $out, $kernel->{Params});
+
+    close $out if $asmFile;
+    close $in;
+}
+# Extract a kernel from a sass dump
+elsif ($mode =~ /^\-?\-s/i)
+{
+    my $sassFile  = shift or usage();
+    my $asmFile   = shift;
+
+    open my $in, '<', $sassFile or die "$sassFile: $!";
+
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+
+    MaxAs::MaxAs::Extract($in, $out, []);
+
+    close $out if $asmFile;
+    close $in;
+}
+# Insert the kernel asm back into the cubin:
+elsif ($mode =~ /^\-?\-i/i)
+{
+    my $nowarn;
+    if ($ARGV[0] =~ /^\-?\-w/i)
+    {
+        $nowarn = shift;
+    }
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    my $noReuse;
+    $noReuse = shift if $ARGV[0] =~ /^\-?\-n/i;
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        my $name = $1;
+        shift;
+        my $value = shift;
+        # set $MaxAs::MaxAs::CODE::<name> directly; building Perl source around the
+        # value and eval'ing it breaks (silently) on values containing quotes
+        no strict 'refs';
+        ${"MaxAs::MaxAs::CODE::$name"} = defined $value ? $value : '';
+    }
+
+    my $asmFile   = shift or usage();
+    my $cubinFile = shift or usage();
+    my $newCubin  = shift || $cubinFile;
+
+    # slurp the whole asm file
+    open my $fh, '<', $asmFile or die "$asmFile: $!";
+    my $file = do { local $/; <$fh> };
+    close $fh;
+
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    # extract the kernel name from the file
+    ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName;
+    die "asm file missing kernel name or is badly formatted" unless $kernelName;
+
+    my $kernel = MaxAs::MaxAs::Assemble($file, $include, !$noReuse, $nowarn);
+
+    my $cubin  = MaxAs::Cubin->new($cubinFile);
+    $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName";
+
+    $cubin->modifyKernel(%$kernel);
+
+    $cubin->write($newCubin);
+
+    # the kernel name is passed as an argument (not interpolated into the format) and
+    # the literal percent sign is written as %%
+    printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n",
+        $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)};
+
+}
+# Preprocessing:
+elsif ($mode =~ /^\-?\-p/i)
+{
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        my $name = $1;
+        shift;
+        my $value = shift;
+        # see the insert mode above: assign the package variable without a string eval
+        no strict 'refs';
+        ${"MaxAs::MaxAs::CODE::$name"} = defined $value ? $value : '';
+    }
+    my $debug;
+    $debug = shift if $ARGV[0] =~ /^\-?\-d/i;
+    my $asmFile   = shift or usage();
+    my $asmFile2  = shift;
+
+    die "source and destination probably shouldn't be the same file\n" if defined $asmFile2 && $asmFile eq $asmFile2;
+
+    # slurp the whole asm file
+    open my $fh, '<', $asmFile or die "$asmFile: $!";
+    my $file = do { local $/; <$fh> };
+    close $fh;
+
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    my $out;
+    if ($asmFile2)
+    {
+        open $out, '>', $asmFile2 or die "$asmFile2: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+    print $out MaxAs::MaxAs::Preprocess($file, $include, $debug);
+    # only close handles this script opened; leave STDOUT alone
+    close $out if $asmFile2;
+}
+# get version information
+elsif ($mode =~ /^\-?\-v/i)
+{
+    print "$MaxAs::MaxAs::VERSION\n";
+}
+else
+{
+    print "$mode\n";
+    usage();
+}
+
+exit(0);
+
+
+
+# Print the command line help to STDOUT and exit with status 1.
+# Lists every mode the script dispatches on, including the options each mode parses.
+sub usage
+{
+    print <<EOF;
+Usage:
+
+  List kernels and symbols:
+
+    maxas.pl --list|-l <cubin_file>
+
+  Test a cubin or sass file to see if the assembler can reproduce all of the contained opcodes.
+  Also useful for extending the missing grammar rules.  Defaults to only showing failures without --all.
+  With the --reg flag it will show register bank conflicts not hidden by reuse flags.
+
+    maxas.pl --test|-t [--reg|-r] [--all|-a] <cubin_file | cuobjdump_sass_file>
+
+  Extract a single kernel into an asm file from a cubin.
+  Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin.
+
+    maxas.pl --extract|-e [--kernel|-k kernel_name] <cubin_file> [asm_file]
+
+  Extract a kernel into an asm file from a cuobjdump sass dump.
+
+    maxas.pl --sass|-s <cuobjdump_sass_file> [asm_file]
+
+  Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes.
+  Include the debug flag to print out detailed scheduler info.
+  Each -D option defines a variable for the CODE sections.
+
+    maxas.pl --pre|-p [-D<name> <value>]... [--debug|-d] <asm_file> [new_asm_file]
+
+  Insert the kernel asm back into the cubin.  Overwrite existing or create new cubin.
+  Optionally you can skip register reuse flag auto insertion.  This allows you to observe
+  performance without any reuse or you can use it to set the flags manually in your sass.
+  The kernel name defaults to the one named in the asm file.
+  Each -D option defines a variable for the CODE sections.
+
+    maxas.pl --insert|-i [-w] [--kernel|-k kernel_name] [--noreuse|-n] [-D<name> <value>]... <asm_file> <cubin_file> [new_cubin_file]
+
+  Display version information and exit:
+
+    maxas.pl --version|-v
+
+EOF
+    exit(1);
+}
+
+__END__
diff --git a/Assembler/PascalAs/blib/script/pascalas.pl b/Assembler/PascalAs/blib/script/pascalas.pl
new file mode 100755
index 0000000..a0f1372
--- /dev/null
+++ b/Assembler/PascalAs/blib/script/pascalas.pl
@@ -0,0 +1,286 @@
+#!/usr/bin/perl
+use strict;
+use PascalAs::Cubin;
+use PascalAs::PascalAs;
+use Data::Dumper;
+use File::Spec;
+
+require 5.10.0;
+
+$Data::Dumper::Sortkeys = 1;
+
+my $mode = shift;
+
+# --list / -l: print the cubin header summary, then every kernel and every recorded symbol
+if ($mode =~ /^\-?\-l/i)
+{
+    my $path = shift or usage();
+
+    my $bin = PascalAs::Cubin->new($path);
+
+    # header fields are gathered in the order the summary line prints them
+    my @header     = ($bin->arch, $bin->class, $bin->address_size);
+    my $kernelInfo = $bin->listKernels;
+    my $symbolInfo = $bin->listSymbols;
+    my @kernelCols = qw(Linkage ParamCnt size RegCnt SharedSize BarCnt);
+
+    printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $path, @header;
+
+    for my $name (sort keys %$kernelInfo)
+    {
+        printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $name, @{$kernelInfo->{$name}}{@kernelCols};
+    }
+    for my $name (sort keys %$symbolInfo)
+    {
+        printf "Symbol: %s\n", $name;
+    }
+}
+# --test / -t: check that the assembler can reproduce the op codes this cubin or sass dump contains
+elsif ($mode =~ /^\-?\-t/i)
+{
+    my $reg  = (@ARGV && $ARGV[0] =~ /^\-?\-r/i) ? shift : undef;   # optional --reg flag; must come first
+    my $all  = (@ARGV && $ARGV[0] =~ /^\-?\-a/i) ? shift : undef;   # optional --all flag; only looked for after --reg
+    my $file = shift or usage();
+    my $fh;
+    # a text file is treated as a sass listing and read as-is
+    if (-T $file)
+    {
+        open $fh, '<', $file or die "$file: $!";
+    }
+    # anything else is loaded as a cubin and disassembled through cuobjdump
+    else
+    {
+        my $cubin = PascalAs::Cubin->new($file);
+        my $arch  = $cubin->arch;
+
+        open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!";
+        my $first = <$fh>;   # the first output line is consumed here, so Test() never sees it
+        if ($first =~ /cuobjdump fatal/)
+        {
+            print $first;
+            exit(1);
+        }
+    }
+    exit(PascalAs::PascalAs::Test($fh, $reg, $all) ? 1 : 0);   # exit status is 1 when Test() returns a true value
+}
+# --extract / -e: write one kernel of a cubin out as an asm file (or to STDOUT when no file is named)
+elsif ($mode =~ /^\-?\-e/i)
+{
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    my $cubinFile = shift or usage();
+    my $asmFile   = shift;
+    my $cubin     = PascalAs::Cubin->new($cubinFile);
+    my $arch      = $cubin->arch;
+    my $kernels   = $cubin->listKernels;
+
+    # default to the first kernel name in sort order if none was specified
+    $kernelName ||= (sort keys %$kernels)[0];
+
+    my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName";
+    my $dumpCmd = "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile";   # built once so the error text always names the command actually run
+    open my $in, "$dumpCmd |" or die "$dumpCmd: $!";
+    my $first = <$in>;   # first output line is consumed here and checked for a cuobjdump fatal error
+    if ($first =~ /cuobjdump fatal/)
+    {
+        print $first;
+        exit(1);
+    }
+    my $out;
+    if ($asmFile)
+    {
+        open $out, '>', $asmFile or die "$asmFile: $!";
+    }
+    else
+    {
+        $out = \*STDOUT;
+    }
+
+    print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n";
+
+    print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt));
+
+    print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n";
+
+    print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params};
+
+    print $out "#\n# Instructions:\n\n";
+
+    PascalAs::PascalAs::Extract($in, $out, $kernel->{Params});
+
+    close $out if $asmFile;   # STDOUT is left open
+    close $in;
+}
+# --sass / -s: run Extract over an existing sass dump instead of a cubin
+elsif ($mode =~ /^\-?\-s/i)
+{
+    # positional arguments: <sass_file> [asm_file]
+    my $dumpPath = shift;
+    my $destPath = shift;
+    usage() unless $dumpPath;
+
+    open my $in, $dumpPath or die "$dumpPath: $!";
+
+    # output goes to the named file when one was given, otherwise to STDOUT
+    my $out;
+    open $out, ">$destPath" or die "$destPath: $!" if $destPath;
+    $out = \*STDOUT unless $destPath;
+
+    # an empty parameter list is handed to Extract in this mode
+    PascalAs::PascalAs::Extract($in, $out, []);
+
+    # only a file opened here is closed; STDOUT is left alone
+    close $out if $destPath;
+
+    close $in;
+}
+# --insert / -i: assemble an asm file and splice the result back into a cubin (in place or as a new file)
+elsif ($mode =~ /^\-?\-i/i)
+{
+    my $nowarn;
+    if ($ARGV[0] =~ /^\-?\-w/i)
+    {
+        $nowarn = shift;   # passed through to Assemble as its fourth argument
+    }
+    my $kernelName;
+    if ($ARGV[0] =~ /^\-?\-k/i)
+    {
+        shift;
+        $kernelName = shift or usage();
+    }
+    my $noReuse   = (@ARGV && $ARGV[0] =~ /^\-?\-n/i) ? shift : undef;   # flag is consumed only when present
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift;
+        eval "package PascalAs::PascalAs::CODE; our \$$name = '$value';";   # NOTE(review): string eval of a command-line value; a value containing ' breaks or injects into this code, and eval errors are ignored
+    }
+
+    my $asmFile   = shift or usage();
+    my $cubinFile = shift or usage();
+    my $newCubin  = shift || $cubinFile;   # no third path given: the input cubin is overwritten
+
+    my $file;
+    if (open my $fh, '<', $asmFile)
+    {
+        local $/;   # slurp mode, limited to this block
+        $file = <$fh>;
+        close $fh;
+    }
+    else { die "$asmFile: $!" }
+
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    # extract the kernel name from the file unless -k already supplied one
+    ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName;
+    die "asm file missing kernel name or is badly formatted" unless $kernelName;
+
+    my $kernel = PascalAs::PascalAs::Assemble($file, $include, !$noReuse, $nowarn);
+
+    my $cubin  = PascalAs::Cubin->new($cubinFile);
+    $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName";
+
+    $cubin->modifyKernel(%$kernel);
+
+    $cubin->write($newCubin);
+
+    printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n",
+        $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)};
+
+}
+# --pre / -p: run only the preprocessor over an asm file and write the result to a file or STDOUT
+elsif ($mode =~ /^\-?\-p/i)
+{
+    while ($ARGV[0] =~ /^\-?\-D(\w+)/)
+    {
+        shift;
+        my $name  = $1;
+        my $value = shift;
+        eval "package PascalAs::PascalAs::CODE; our \$$name = '$value';";   # NOTE(review): string eval of a command-line value; a value containing ' breaks or injects into this code, and eval errors are ignored
+    }
+    my $debug     = (@ARGV && $ARGV[0] =~ /^\-?\-d/i) ? shift : undef;   # flag is consumed only when present
+    my $asmFile   = shift or usage();
+    my $asmFile2  = shift;
+
+    die "source and destination probably shouldn't be the same file\n" if $asmFile eq $asmFile2;
+
+    open my $fh, '<', $asmFile or die "$asmFile: $!";
+    local $/;   # NOTE(review): slurp mode stays in effect for the rest of this branch, including the Preprocess call below
+    my $file = <$fh>;
+    close $fh;
+
+    my ($vol,$dir) = File::Spec->splitpath($asmFile);
+    my $include = [$vol, $dir];
+
+    if ($asmFile2)
+    {
+        open $fh, '>', $asmFile2 or die "$asmFile2: $!";
+    }
+    else
+    {
+        $fh = \*STDOUT;
+    }
+    print $fh PascalAs::PascalAs::Preprocess($file, $include, $debug);
+    close $fh;
+}
+# --version / -v: print the PascalAs module's version string
+elsif ($mode =~ /^\-?\-v/i)
+{
+    print join('', $PascalAs::PascalAs::VERSION, "\n");
+}
+else    # unrecognised mode: echo it back, then hand over to usage(), which exits
+{
+    print join('', $mode, "\n");
+    usage();
+}
+
+exit(0);
+
+
+
+sub usage # Print the command-line help text, then terminate the script with exit status 1 (never returns).
+{
+    print <<EOF;
+Usage:
+
+  List kernels and symbols:
+
+    pascalas.pl --list|-l <cubin_file>
+
+  Test a cubin or sass file to see if the assembler can reproduce all of the contained opcodes.
+  Also useful for extending the missing grammar rules.  Defaults to only showing failures without --all.
+  With the --reg flag it will show register bank conflicts not hidden by reuse flags.
+
+    pascalas.pl --test|-t [--reg|-r] [--all|-a] <cubin_file | cuobjdump_sass_file>
+
+  Extract a single kernel into an asm file from a cubin.
+  Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin.
+
+    pascalas.pl --extract|-e [--kernel|-k kernel_name] <cubin_file> [asm_file]
+
+  Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes.
+  Include the debug flag to print out detailed scheduler info.
+
+    pascalas.pl --pre|-p [-D<name> <value> ...] [--debug|-d] <asm_file> [new_asm_file]
+
+  Insert the kernel asm back into the cubin.  Overwrite existing or create new cubin.
+  Optionally you can skip register reuse flag auto insertion.  This allows you to observe
+  performance without any reuse or you can use it to set the flags manually in your sass.
+
+    pascalas.pl --insert|-i [-w] [--kernel|-k kernel_name] [--noreuse|-n] [-D<name> <value> ...] <asm_file> <cubin_file> [new_cubin_file]
+
+  Display version information and exit:
+
+    pascalas.pl --version|-v
+
+EOF
+    exit(1);
+}
+
+__END__
diff --git a/Assembler/PascalAs/cpanfile b/Assembler/PascalAs/cpanfile
new file mode 100644
index 0000000..e8281c5
--- /dev/null
+++ b/Assembler/PascalAs/cpanfile
@@ -0,0 +1,4 @@
+requires 'perl', '5.10.0';
+
+requires 'Carp', '1.29';
+requires 'Data::Dumper', '2.145';
diff --git a/Assembler/PascalAs/lib/PascalAs/Cubin.pm b/Assembler/PascalAs/lib/PascalAs/Cubin.pm
new file mode 100644
index 0000000..10bf9a8
--- /dev/null
+++ b/Assembler/PascalAs/lib/PascalAs/Cubin.pm
@@ -0,0 +1,686 @@
+package PascalAs::Cubin;
+
+use strict;
+use Data::Dumper;
+
+my @Elf32_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    L   entry
+    L   phOffset
+    L   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);  # 32-bit ELF file header. Like every table here: alternating pack code and field name, split apart further down.
+my @Elf64_Hdr = qw(
+    H8  magic
+    C   fileClass
+    C   encoding
+    C   fileVersion
+    H18 padding
+    S   type
+    S   machine
+    L   version
+    Q   entry
+    Q   phOffset
+    Q   shOffset
+    L   flags
+    S   ehSize
+    S   phEntSize
+    S   phNum
+    S   shEntSize
+    S   shNum
+    S   shStrIndx
+);  # 64-bit ELF file header: same fields as the 32-bit one, with entry/phOffset/shOffset widened to Q
+my @Elf32_PrgHdr = qw(
+    L   type
+    L   offset
+    L   vaddr
+    L   paddr
+    L   fileSize
+    L   memSize
+    L   flags
+    L   align
+);  # 32-bit program header entry
+my @Elf64_PrgHdr = qw(
+    L   type
+    L   flags
+    Q   offset
+    Q   vaddr
+    Q   paddr
+    Q   fileSize
+    Q   memSize
+    Q   align
+);  # 64-bit program header entry (note that flags sits second here, not seventh)
+my @Elf32_SecHdr = qw(
+    L   name
+    L   type
+    L   flags
+    L   addr
+    L   offset
+    L   size
+    L   link
+    L   info
+    L   align
+    L   entSize
+);  # 32-bit section header entry
+my @Elf64_SecHdr = qw(
+    L   name
+    L   type
+    Q   flags
+    Q   addr
+    Q   offset
+    Q   size
+    L   link
+    L   info
+    Q   align
+    Q   entSize
+);  # 64-bit section header entry
+my @Elf32_SymEnt = qw(
+    L   name
+    L   value
+    L   size
+    C   info
+    C   other
+    S   shIndx
+);  # 32-bit symbol table entry
+my @Elf64_SymEnt = qw(
+    L   name
+    C   info
+    C   other
+    S   shIndx
+    Q   value
+    Q   size
+);  # 64-bit symbol table entry (field order differs from the 32-bit entry)
+my @symBind = qw(LOCAL GLOBAL WEAK);  # linkage names, indexed in new() by the high nibble of a symbol's info byte
+
+# Every @ElfNN_* table alternates a pack code (at most 3 characters) with a field name (longer than 3).
+my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC);
+
+# Slot 1 of each array receives the 32-bit layout and slot 2 the 64-bit one (new() indexes them by fileClass).
+foreach my $layout (
+    # [ slot, source table, template (T) array, column (C) array ]
+    [ 1, \@Elf32_Hdr,    \@elfHdrT, \@elfHdrC ],
+    [ 1, \@Elf32_PrgHdr, \@prgHdrT, \@prgHdrC ],
+    [ 1, \@Elf32_SecHdr, \@secHdrT, \@secHdrC ],
+    [ 1, \@Elf32_SymEnt, \@symHdrT, \@symHdrC ],
+    [ 2, \@Elf64_Hdr,    \@elfHdrT, \@elfHdrC ],
+    [ 2, \@Elf64_PrgHdr, \@prgHdrT, \@prgHdrC ],
+    [ 2, \@Elf64_SecHdr, \@secHdrT, \@secHdrC ],
+    [ 2, \@Elf64_SymEnt, \@symHdrT, \@symHdrC ],
+)
+{
+    my ($slot, $table, $templates, $columns) = @$layout;
+
+    # short entries concatenate into the pack/unpack template; long entries become the hash keys
+    $templates->[$slot] = join '', grep { length($_) <= 3 } @$table;
+    $columns->[$slot]   = [ grep { length($_) > 3 } @$table ];
+}
+
+# Constructor: parse the cubin ELF file $file into a blessed hash of headers, sections, kernels and symbols; dies if the file cannot be opened.
+sub new
+{
+    my ($package, $file) = @_;
+
+    my $cubin = bless { fileName => $file }, $package;
+
+    open my $fh, '<', $file or die "$file: $!";
+    binmode($fh);
+
+    # Read in assuming 32 bit header
+    my $data;
+    read $fh, $data, 0x34;   # 0x34 = 52 bytes, the packed size of the 32-bit header template
+    my $elfHdr = $cubin->{elfHdr} = {};
+    @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data;
+
+    # 1: 32bit, 2: 64bit
+    my $class = $elfHdr->{fileClass};
+
+    # re-read in with 64 bit header if needed
+    if ($class == 2)
+    {
+        seek $fh, 0, 0;
+        read $fh, $data, 0x46;   # the 64-bit template consumes only 64 of these bytes; unpack ignores the rest
+        @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data;
+
+        $cubin->{Class} = 64;
+    }
+    else
+    {
+        $cubin->{Class} = 32;
+    }
+
+    # verify sm_60 cubin
+    #$cubin->{Arch} = $elfHdr->{flags} & 0xFF;
+    #die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50;
+
+    $cubin->{Arch} = "60";   # hard-coded: the header flags are not consulted (see the disabled lines above)
+    die "Cubin not in sm_60. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} != 60;   # cannot fire while Arch is the constant above
+    $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32;
+
+    # Read in Program Headers
+    seek $fh, $elfHdr->{phOffset}, 0;
+    foreach (1 .. $elfHdr->{phNum})
+    {
+        read $fh, $data, $elfHdr->{phEntSize};
+
+        my %prgHdr = (Indx => $_ - 1);   # zero-based position in the table
+        @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data;
+        push @{$cubin->{prgHdrs}}, \%prgHdr;
+    }
+
+    # Read in Section Headers
+    seek $fh, $elfHdr->{shOffset}, 0;
+    foreach (1 .. $elfHdr->{shNum})
+    {
+        read $fh, $data, $elfHdr->{shEntSize};
+
+        my %secHdr = (Indx => $_ - 1);   # zero-based position in the table
+        @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data;
+        push @{$cubin->{secHdrs}}, \%secHdr;
+    }
+
+    # Read in Section data
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $data = '';
+        # Skip sections with no data (type NULL or NOBITS)
+        if ($secHdr->{size} && $secHdr->{type} != 8)
+        {
+            seek $fh, $secHdr->{offset}, 0;
+            read $fh, $data, $secHdr->{size};
+        }
+        # Convert string tables to maps
+        if ($secHdr->{type} == 3) # STRTAB
+        {
+            my $strTab = $secHdr->{StrTab} = {};
+            my $indx   = 0;   # byte offset of the current string inside the table; used as the map key
+            foreach my $str (split "\0", $data)
+            {
+                $strTab->{$indx} = $str;
+                $indx += 1 + length($str);   # step over the string and its NUL terminator
+            }
+        }
+        # Read in Symbol data
+        if ($secHdr->{type} == 2) # SYMTAB
+        {
+            my $offset = 0;
+            while ($offset < $secHdr->{size})
+            {
+                my $symEnt = {};
+                @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize});
+                $offset += $secHdr->{entSize};
+
+                push @{$secHdr->{SymTab}}, $symEnt;
+            }
+        }
+        # Cache raw data (as a hex string) for further processing and writing
+        $secHdr->{Data} = unpack 'H*', $data;
+    }
+    close $fh;
+
+    # Update section headers with their names.  Map names directly to headers.
+    my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab};
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        $secHdr->{Name} = $shStrTab->{$secHdr->{name}};
+        $cubin->{$secHdr->{Name}} = $secHdr;   # section names share the object's top-level hash with keys such as Class and Arch
+    }
+
+    # Update symbols with their names
+    # For the Global functions, extract kernel meta data
+    # Populate the kernel hash
+    my $strTab = $cubin->{'.strtab'}{StrTab};
+    foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}})
+    {
+        $symEnt->{Name} = $strTab->{$symEnt->{name}};
+
+        # Attach symbol to section (a later symbol for the same section replaces an earlier one)
+        my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}];
+        $secHdr->{SymbolEnt} = $symEnt;
+
+        # Look for symbols tagged FUNC
+        if (($symEnt->{info} & 0x0f) == 0x02)
+        {
+            # Create a hash of kernels for output
+            my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr;
+
+            # Extract local/global/weak binding info
+            $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4];
+
+            # Extract the kernel instructions
+            $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ];
+
+            # Extract the max barrier resource identifier used and add 1. Should be 0-16.
+            # If a register is used as a barrier resource id, then this value is the max of 16.
+            $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20;
+
+            # Extract the number of allocated registers for this kernel.
+            $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24;
+
+            # Extract the size of shared memory this kernel uses.
+            my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"};
+            $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0;
+
+            # Attach constant0 section
+            $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"};
+
+            # Extract the kernel parameter data.
+            my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"};
+            if ($paramSec)
+            {
+                # Extract raw param data
+                my @data = unpack "L*", pack "H*", $paramSec->{Data};
+
+                $paramSec->{ParamData} = \@data;
+                $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ];
+
+                # Find the first param delimiter
+                my $idx = 0;
+                $idx++ while $idx < @data && $data[$idx] != 0x00080a04;
+
+                my $first = $data[$idx+2] & 0xFFFF;   # NOTE(review): if the marker is absent, $idx is already past the end here and @staticParams below picks up undef entries - confirm this cannot happen
+                #my $size  = $data[$idx+2] >> 16;
+                $idx += 4;
+
+                my @params;
+                while ($idx < @data && $data[$idx] == 0x000c1704)
+                {
+                    # Get the ordinal, offset, size and pointer alignment for each param
+                    my $ord    = $data[$idx+2] & 0xFFFF;
+                    my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16);
+                    my $psize  = $data[$idx+3] >> 18;
+                    my $align  = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0;
+                    unshift @params, "$ord:$offset:$psize:$align";   # unshift: the list ends up in reverse of section order
+                    $idx += 4;
+                }
+                my @staticParams = @data[0 .. ($idx-1)];   # everything up to here is written back unchanged by modifyKernel
+
+                my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize);
+                while ($idx < @data)
+                {
+                    my $code = $data[$idx] & 0xffff;
+                    my $size = $data[$idx] >> 16;   # payload size in bytes for the list attributes below; the value itself for MAXREG_COUNT
+                    $idx++;
+
+                    # EIATTR_MAXREG_COUNT
+                    if ($code == 0x1b03)
+                    {
+                        $maxregCount = $size;
+                    }
+                    # EIATTR_S2RCTAID_INSTR_OFFSETS
+                    elsif ($code == 0x1d04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @ctaidOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_EXIT_INSTR_OFFSETS
+                    elsif ($code == 0x1c04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @exitOffsets, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_CTAIDZ_USED
+                    elsif ($code == 0x0401)
+                    {
+                        $ctaidzUsed = 1;
+                    }
+                    # EIATTR_REQNTID
+                    elsif ($code == 0x1004)
+                    {
+                        while ($size > 0)
+                        {
+                            push @reqntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_MAX_THREADS
+                    elsif ($code == 0x0504)
+                    {
+                        while ($size > 0)
+                        {
+                            push @maxntid, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    # EIATTR_CRS_STACK_SIZE
+                    elsif ($code == 0x1e04)
+                    {
+                        while ($size > 0)
+                        {
+                            push @stackSize, $data[$idx++];
+                            $size -= 4;
+                        }
+                    }
+                    else
+                    {
+                        printf "Unknown Code 0x%02x (size:%d)\n", $code, $size;   # NOTE(review): reported but not stored, and any payload words are not skipped
+                    }
+                }
+                $kernelSec->{Params}   = \@params;
+                $kernelSec->{ParamCnt} = scalar @params;
+
+                $paramSec->{StaticParams} = \@staticParams;
+                $paramSec->{MAXREG_COUNT} = $maxregCount;
+                $paramSec->{ExitOffsets}  = \@exitOffsets;
+                $paramSec->{CTAIDOffsets} = \@ctaidOffsets;
+                $paramSec->{CTAIDZUsed}   = $ctaidzUsed;
+                $paramSec->{REQNTID}      = \@reqntid;
+                $paramSec->{MAXNTID}      = \@maxntid;
+                $paramSec->{STACKSIZE}    = \@stackSize;
+            }
+            # print Dumper($paramSec);
+            # exit();
+        }
+        # Note GLOBALs found in this cubin
+        elsif (($symEnt->{info} & 0x10) == 0x10)
+        {
+            $cubin->{Symbols}{$symEnt->{Name}} = $symEnt;
+        }
+    }
+
+    # print "phOffset: $elfHdr->{phOffset}\n";
+    # print "shOffset: $elfHdr->{shOffset}\n";
+    # foreach my $secHdr (@{$cubin->{secHdrs}})
+    # {
+    #     print "secHdr($secHdr->{Indx}): $secHdr->{offset}, $secHdr->{size}, $secHdr->{align} ($secHdr->{Name})\n";
+    # }
+    # my $p = 0;
+    # foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    # {
+    #     print "prgHdr($p): type: $prgHdr->{type}, offset: $prgHdr->{offset}, fileSize: $prgHdr->{fileSize}, memSize: $prgHdr->{memSize}, align: $prgHdr->{align}\n";
+    #     $p++;
+    # }
+    # exit();
+
+    # print Dumper($cubin->{prgHdrs});
+    # exit();
+    return $cubin;
+}
+sub class           # ELF class of the loaded file in bits; new() stores 32 or 64 here
+{
+    return $_[0]->{Class};
+}
+sub arch            # value new() stored under Arch
+{
+    return $_[0]->{Arch};
+}
+sub address_size    # 64 or 32, as derived by new() from the ELF header flags
+{
+    return $_[0]->{AddressSize};
+}
+sub listKernels     # hash ref: kernel name => its section header hash
+{
+    return $_[0]->{Kernels};
+}
+sub listSymbols     # hash ref: symbol name => symbol entry, for the symbols new() recorded
+{
+    return $_[0]->{Symbols};
+}
+sub getKernel       # section header hash of the kernel called $name, or undef if there is none
+{
+    my ($self, $name) = @_;
+    return $self->{Kernels}{$name};
+}
+
+sub modifyKernel # Replace a kernel's code and rebuild its param-info section from %params (Kernel, RegCnt, BarCnt, ExitOffsets, CTAIDOffsets, CTAIDZUsed, KernelData); prints each change, dies on out-of-range values.
+{
+    my ($cubin, %params) = @_;
+
+    my $kernelSec    = $params{Kernel};
+    my $newReg       = $params{RegCnt};
+    my $newBar       = $params{BarCnt};
+    my $exitOffsets  = $params{ExitOffsets};
+    my $ctaidOffsets = $params{CTAIDOffsets};
+    my $ctaidzUsed   = $params{CTAIDZUsed};
+    my $newData      = $params{KernelData};
+    my $newSize      = @$newData * 8;   # KernelData holds 64-bit words, so 8 bytes per element
+
+    die "255 register max" if $newReg > 255;
+    die "new kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63;
+    die "16 is max barrier count" if $newBar > 16;
+
+    my $paramSec    = $kernelSec->{ParamSec};
+    my $kernelName  = $kernelSec->{SymbolEnt}{Name};
+    my $maxregCount = $paramSec->{MAXREG_COUNT};
+    my $stackSize   = $paramSec->{STACKSIZE};
+
+    # update the kernel code, both as a word list and as the cached hex string
+    $kernelSec->{KernelData} = $newData;
+    $kernelSec->{Data}       = unpack "H*", pack "Q*", @$newData;
+
+    if ($newReg != $kernelSec->{RegCnt})
+    {
+        print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n";
+        $kernelSec->{RegCnt} = $newReg;
+        $kernelSec->{info}  &= ~0xff000000;   # register count lives in the top byte of the section header's info field
+        $kernelSec->{info}  |= $newReg << 24;
+    }
+    if ($newBar != $kernelSec->{BarCnt})
+    {
+        print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n";
+        $kernelSec->{BarCnt} = $newBar;
+        $kernelSec->{flags} &= ~0x01f00000;   # barrier count lives in bits 20-24 of the section header's flags field
+        $kernelSec->{flags} |=  $newBar << 20;
+    }
+
+    my @paramData = @{$paramSec->{StaticParams}};   # start from the words new() kept verbatim, then append attributes in a fixed order
+
+    if (defined $maxregCount)
+    {
+        push @paramData, ($maxregCount << 16) | 0x1b03;
+    }
+
+    my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets;
+    my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}};
+
+    if ($newCTAIDs ne $oldCTAIDs)
+    {
+        print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n";
+    }
+    if (@$ctaidOffsets)
+    {
+        push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04;   # count << 18 puts the payload size in bytes (count * 4) into the upper 16 bits
+        push @paramData, @$ctaidOffsets;
+    }
+
+    my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets;
+    my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}};
+
+    if ($newExits ne $oldExits)
+    {
+        print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n";
+    }
+    if (@$exitOffsets)
+    {
+        push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04;
+        push @paramData, @$exitOffsets;
+    }
+
+    if ($ctaidzUsed != $paramSec->{CTAIDZUsed})
+    {
+        print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n";
+    }
+    if ($ctaidzUsed)
+    {
+        push @paramData, 0x0401;
+    }
+
+    if (@{$paramSec->{REQNTID}})   # REQNTID, MAXNTID and the stack size are carried over from the original cubin unchanged
+    {
+        push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004;
+        push @paramData, @{$paramSec->{REQNTID}};
+    }
+    if (@{$paramSec->{MAXNTID}})
+    {
+        push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504;
+        push @paramData, @{$paramSec->{MAXNTID}};
+    }
+
+    if (@$stackSize)
+    {
+        push @paramData, (scalar(@$stackSize) << 18) | 0x1e04;
+        push @paramData, @$stackSize;
+    }
+
+    my $newParamSize  = scalar(@paramData)*4;   # 32-bit words, 4 bytes each
+    $paramSec->{Data} = unpack "H*", pack "L*", @paramData;
+    if ($newParamSize != $paramSec->{size})
+    {
+        print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n";
+        $cubin->updateSize($paramSec, $newParamSize);
+    }
+
+    if ($newSize != $kernelSec->{size})
+    {
+        print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n";
+        $cubin->updateSize($kernelSec, $newSize, 1);   # third argument: also resize the program header covering the kernel
+    }
+}
+
+sub updateSize # Set $sec's size to $newSize and recompute the file offsets of all sections and both header tables; a true $updatePrgSize also resizes the type-1 program header whose range contains $sec.
+{
+    my ($cubin, $sec, $newSize, $updatePrgSize) = @_;
+
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+
+    # update section header
+    my $delta = $newSize - $sec->{size};
+    $sec->{size} = $newSize;
+
+    # update symtab section: re-pack every symbol entry so the section's cached data carries the new size
+    if ($sec->{SymbolEnt})
+    {
+        $sec->{SymbolEnt}{size} = $newSize;
+        my $symSection = $cubin->{'.symtab'};
+        $symSection->{Data} = '';
+        foreach my $symEnt (@{$symSection->{SymTab}})
+        {
+            $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}};
+        }
+    }
+
+    my $pos = $elfHdr->{ehSize};   # section data is laid out directly after the ELF header
+    my %sizeMap;                   # old file offset => new file offset
+
+    # update section header offsets
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        # skip first header (any header with align 0 is skipped and keeps its old offset)
+        next if $secHdr->{align} == 0;
+
+        # NOBITS data sections are size 0
+        my $size = $secHdr->{type} == 8 ? 0 : $secHdr->{size};
+
+        # Add any needed padding between sections
+        my $pad = $pos % $secHdr->{align};
+        if ($pad > 0)
+        {
+            $pos += $secHdr->{align} - $pad;
+        }
+        # map old offset to new
+        $sizeMap{$secHdr->{offset}} = $pos;
+
+        # update offset
+        $secHdr->{offset} = $pos;
+
+        # advance position by size
+        $pos += $size;
+    }
+
+    # compute total section header size (this relies on the program headers starting right after the section headers)
+    my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset};
+
+    # map old offset to new
+    $sizeMap{$elfHdr->{shOffset}} = $pos;
+    $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize;
+
+    $elfHdr->{shOffset} = $pos;
+    $elfHdr->{phOffset} = $pos + $shSize;
+
+    # update program header offsets and sizes
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        # Not sure how best to adjust these so just assume they'll track other offsets.
+        $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}};   # NOTE(review): an offset with no entry in %sizeMap becomes undef here
+
+        # If the kernel sizes changes, also update the associated ProgramHeader.
+        # Note that this size is the kernel size plus any constant section sizes.
+        if ($updatePrgSize && $prgHdr->{type} == 1 &&
+            $sec->{offset} >= $prgHdr->{offset} &&
+            $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta)
+        {
+            $prgHdr->{fileSize} += $delta;
+            $prgHdr->{memSize}  += $delta;
+        }
+    }
+}
+
+# Write the (possibly modified) cubin to $file: ELF header, padded section data, section headers, then program headers. Dies on open or close failure.
+sub write
+{
+    my ($cubin, $file) = @_;
+
+    open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
+    binmode($fh);
+
+    my $elfHdr = $cubin->{elfHdr};
+    my $class  = $elfHdr->{fileClass};
+
+    # write elf header
+    print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};
+    my $pos = $elfHdr->{ehSize};   # running file position, tracked so alignment padding can be computed
+
+    # write section data
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        # Skip NULL and NOBITS data sections
+        next if $secHdr->{size} == 0 || $secHdr->{type} == 8;
+
+        # Add any needed padding between sections (align 0 means no constraint; guarded to avoid a modulus by zero)
+        my $pad = $secHdr->{align} ? $pos % $secHdr->{align} : 0;
+        if ($pad > 0)
+        {
+            $pad = $secHdr->{align} - $pad;
+            print $fh join '', "\0" x $pad;
+            $pos += $pad;
+        }
+
+        print $fh pack 'H*', $secHdr->{Data};   # Data is cached as a hex string
+        $pos += $secHdr->{size};
+    }
+
+    # write section headers
+    foreach my $secHdr (@{$cubin->{secHdrs}})
+    {
+        print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
+    }
+
+    #write program headers
+    foreach my $prgHdr (@{$cubin->{prgHdrs}})
+    {
+        print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
+    }
+    close $fh or die "Error: could not finish writing $file: $!";   # buffered write errors only surface at close
+}
+
+__END__
+
diff --git a/Assembler/PascalAs/lib/PascalAs/PascalAs.pm b/Assembler/PascalAs/lib/PascalAs/PascalAs.pm
new file mode 100644
index 0000000..eefcdf6
--- /dev/null
+++ b/Assembler/PascalAs/lib/PascalAs/PascalAs.pm
@@ -0,0 +1,1407 @@
+package PascalAs::PascalAs;
+
+require 5.10.0;
+
+use strict;
+use Data::Dumper;
+use PascalAs::PascalAsGrammar;
+use File::Spec;
+use Carp;
+
+our $VERSION = '1.06';
+
+# these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump
+# (lookup set: op name => 1)
+my %relOffset  = map { $_ => 1 } qw(BRA SSY CAL PBK PCNT);
+
+# these ops use absolute addresses
+# (lookup set: op name => 1)
+my %absOffset  = map { $_ => 1 } qw(JCAL);
+
+# Union of the two sets above: every op whose operand is a jump target.
+my %jumpOp     = (%relOffset, %absOffset);
+
+# These instructions use r0 but do not write to r0
+my %noDest     = map { $_ => 1 } qw(ST STG STS STL RED);
+
+# Map register slots to reuse control codes
+# Each slot gets its own bit so the codes can be OR'ed together.
+my %reuseSlots = (r8 => 1, r20 => 2, r39 => 4);
+
+# Preprocess and assemble source text into kernel op codes.
+#
+# Parameters:
+#   $file    - the assembly source text (handed to Preprocess, then split into lines)
+#   $include - passed through to Preprocess for resolving INCLUDE files
+#   $doReuse - true:  work out the register reuse flags automatically
+#              false: take the reuse flags from the code (via genReuseCode)
+#   $nowarn  - true to suppress "register used without initialization" warnings
+#              (also forwarded to registerHealth)
+#
+# Returns a hash ref with the keys RegCnt, BarCnt, ExitOffsets, CTAIDOffsets,
+# CTAIDZUsed, ConflictCnt, ReuseCnt, ReuseTot, ReusePct and KernelData
+# (array ref of the control and op codes in emit order).
+#
+# Dies on a badly formed line, an unknown label, an instruction that matches no
+# grammar rule, or a store op that sets a Read-After-Write dependency.
+#
+# Layout of @instructs: every slot whose index is a multiple of 4 is a control
+# instruction that collects the control/reuse codes of the three instructions
+# that follow it.
+sub Assemble
+{
+    my ($file, $include, $doReuse, $nowarn) = @_;
+
+    my $regMap = {};
+    $file = Preprocess($file, $include, 0, $regMap);
+    my $vectors = delete $regMap->{__vectors};
+    my $regBank = delete $regMap->{__regbank};
+
+    # initialize cubin counts
+    my $regCnt = 0;
+    my $barCnt = 0;
+
+    my ($lineNum, @instructs, %labels, $ctrl, @branches, %reuse);
+
+    # initialize the first control instruction
+    push @instructs, $ctrl = {};
+
+    foreach my $line (split "\n", $file)
+    {
+        # keep track of line nums in the physical file
+        $lineNum++;
+
+        next unless preProcessLine($line);
+
+        # match an instruction
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # Save us from crashing the display driver
+            die "It is illegal to set a Read-After-Write dependency on a memory store op (store ops don't write to a register)\n$inst->{inst}\n"
+                if exists $noDest{$inst->{op}} && ($inst->{ctrl} & 0x000e0) != 0x000e0;
+
+            # track branches/jumps/calls/etc for label remapping
+            push @branches, @instructs+0 if exists $jumpOp{$inst->{op}};
+
+            # push the control code onto the control instruction
+            push @{$ctrl->{ctrl}}, $inst->{ctrl};
+
+            # now point the instruction to its associated control instruction
+            $inst->{ctrl} = $ctrl;
+
+            # add the op name and full instruction text
+            push @instructs, $inst;
+
+            # add a 4th control instruction for every 3 instructions
+            push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
+        }
+        # match a label
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            # map the label name to the index of the instruction about to be inserted
+            $labels{$1} = @instructs+0;
+        }
+        else
+        {
+            die "badly formed line at $lineNum: $line\n";
+        }
+    }
+    # add the final BRA op and align the number of instructions to a multiple of 8
+    push @{$ctrl->{ctrl}}, 0x007ff;
+    push @instructs, { op => 'BRA', inst => 'BRA 0xfffff8;' };
+    while (@instructs & 7)
+    {
+        push @instructs, $ctrl = {} if ((@instructs & 3) == 0);
+        push @{$ctrl->{ctrl}}, 0x007e0;
+        push @instructs, { op => 'NOP', inst => 'NOP;' };
+    }
+
+    # remap labels
+    foreach my $i (@branches)
+    {
+        if ($instructs[$i]{inst} !~ m'(\w+);$' || !exists $labels{$1})
+            { die "instruction has invalid label: $instructs[$i]{inst}"; }
+
+        $instructs[$i]{jump} = $labels{$1};
+
+        # relative targets are measured from the instruction after the jump; 8 bytes per slot
+        if (exists $relOffset{$instructs[$i]{op}})
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', (($labels{$1} - $i - 1) * 8) & 0xffffff/e; }
+        else
+            { $instructs[$i]{inst} =~ s/(\w+);$/sprintf '0x%06x;', ($labels{$1} * 8) & 0xffffff/e; }
+    }
+
+    # calculate optimal register reuse
+    # This affects register bank decisions so do it before analyzing register use
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            if ($doReuse)
+            {
+                # get any vector registers for r0
+                my @r0 = getVecRegisters($vectors, $capData);
+
+                # There are 2 reuse slots per register slot
+                # The reuse hash points to most recent instruction index where register was last used in this slot
+
+                # For writes to a register, clear any reuse opportunity
+                if (@r0 && !exists $noDest{$op})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        if (my $reuse = $reuse{$slot})
+                        {
+                            # if writing with a vector op, clear all linked registers
+                            delete $reuse->{$_} foreach @r0;
+                        }
+                    }
+                }
+                # clear cache if jumping elsewhere
+                %reuse = () if exists $jumpOp{$op};
+
+                # only track register reuse for instruction types this works with
+                if ($gram->{type}{reuse})
+                {
+                    foreach my $slot (keys %reuseSlots)
+                    {
+                        next unless exists $capData->{$slot};
+
+                        my $r = $capData->{$slot};
+                        next if $r eq 'RZ';
+                        next if $r eq $capData->{r0}; # dont reuse if we're writing this reg in the same instruction
+
+                        my $reuse = $reuse{$slot} ||= {};
+
+                        # if this register was previously marked for potential reuse
+                        if (my $p = $reuse->{$r})
+                        {
+                            # flag the previous instruction's ctrl reuse array slot
+                            $instructs[$p]{ctrl}{reuse}[($p & 3) - 1] |= $reuseSlots{$slot};
+
+                            #print "reuse $slot $r $instructs[$p]{inst}\n";
+                        }
+                        # list full, delete the oldest
+                        elsif (keys %$reuse > 2)
+                        {
+                            my $oldest = (sort {$reuse->{$a} <=> $reuse->{$b}} keys %$reuse)[0];
+                            delete $reuse->{$oldest};
+                        }
+                        # mark the new instruction for potential reuse
+                        $reuse->{$r} = $i;
+                    }
+                }
+            }
+            # if reuse is disabled then pull value from code.
+            elsif ($gram->{type}{reuse})
+            {
+                $ctrl->{reuse}[($i & 3) - 1] = genReuseCode($capData);
+            }
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    # Assign registers to requested banks if possible
+    foreach my $r (sort keys %$regBank)
+    {
+        my $bank  = $regBank->{$r};
+        my $avail = $regMap->{$r};
+        foreach my $pos (0 .. $#$avail)
+        {
+            # the bank of a register is its number modulo 4
+            if ($bank == ($avail->[$pos] & 3))
+            {
+                # assign it, while removing the assigned register from the pool
+                $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+                last;
+            }
+        }
+    }
+
+    # calculate register live times and preferred banks for non-fixed registers.
+    # LiveTime only half implemented...
+    my (%liveTime, %pairedBanks, %reuseHistory);
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData   = parseInstruct($inst, $gram) or next;
+            my $reuseType = $gram->{type}{reuse};
+
+            # liveTimes and bank conflicts with source operands
+            my (%addReuse, %delReuse);
+            foreach my $slot (qw(r8 r20 r39))
+            {
+                my $r = $capData->{$slot} or next;
+                next if $r eq 'RZ';
+
+                # a register whose map entry is still a ref has not been assigned a number yet
+                my $liveR = ref $regMap->{$r} ? $r : $regMap->{$r};
+
+                # All registers should be written prior to being read..
+                if (my $liveTime = $liveTime{$liveR})
+                {
+                    # for each read set the current instruction index as the high value
+                    $liveTime->[$#$liveTime][1] = $i;
+                    push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                }
+                else
+                {
+                    warn "register used without initialization ($r): $inst\n" unless $nowarn;
+                    push @{$liveTime{$liveR}}, [$i,$i];
+                }
+
+                # Is this register active in the reuse cache?
+                my $slotHist  = $reuseHistory{$slot} ||= {};
+                my $selfReuse = $reuseType ? exists $slotHist->{$r} : 0;
+
+                #print "IADD3-1: $slot:$r (!$selfReuse && $regMap->{$r})\n" if $op eq 'IADD3';
+
+                # If this is an auto reg, look at the open banks.
+                # No need to look at banks if this register is in the reuse cache.
+                if (!$selfReuse && ref $regMap->{$r})
+                {
+                    # Look at other source operands in this instruction and flag what banks are being used
+                    foreach my $slot2 (grep {$_ ne $slot && exists $capData->{$_}} qw(r8 r20 r39))
+                    {
+                        my $r2 = $capData->{$slot2};
+                        next if $r2 eq 'RZ' || $r2 eq $r;
+
+                        my $slotHist2 = $reuseHistory{$slot2} ||= {};
+
+                        #print "IADD3-2: $slot:$r $slot2:$r2 (!$reuseType && !$slotHist2->{$r2})\n" if $op eq 'IADD3';
+
+                        # Dont be concerned with non-reuse type instructions or
+                        # If this operand is in the reuse cache, we don't care what bank it's on.
+                        if (!$reuseType || !exists $slotHist2->{$r2})
+                        {
+                            # if the operand is also an auto-allocated register then link them
+                            # Once we choose the bank for one we want to update that choice for the other register.
+                            if (ref $regMap->{$r2})
+                            {
+                                push @{$pairedBanks{$r}{pairs}}, $r2;
+                                $pairedBanks{$r}{banks} ||= [];
+                            }
+                            # For a fixed register, calculate the bank, flag it, and update the count of banks to avoid.
+                            else
+                            {
+                                my $bank = substr($regMap->{$r2},1) & 3;
+                                #print "IADD3-3: $r2:$bank\n" if $op eq 'IADD3';
+
+                                $pairedBanks{$r}{bnkCnt}++ unless $pairedBanks{$r}{banks}[$bank]++;
+                                $pairedBanks{$r}{pairs} ||= [];
+                            }
+                            # Update the total use count for this register.
+                            # This will be the number of times the register is pulled out of the bank.
+                            $pairedBanks{$r}{useCnt}++;
+                        }
+                    }
+                }
+                # update the reuse history so we know which bank conflicts we can ignore.
+                if ($reuseType)
+                {
+                    # flag these slots for addition or removal from reuseHistory
+                    if ($ctrl->{reuse}[($i & 3) - 1] & $reuseSlots{$slot})
+                        { $addReuse{$slot} = $r; }
+                    else
+                        { $delReuse{$slot} = $r; }
+                }
+            }
+            # update reuse history after we're done with the instruction (when the flag is actually in effect).
+            # we don't want to update it in the middle since that can interfere with the checks,
+            $reuseHistory{$_}{$addReuse{$_}} = 1    foreach keys %addReuse;
+            delete $reuseHistory{$_}{$delReuse{$_}} foreach keys %delReuse;
+
+            # liveTimes for destination operands and vector registers
+            foreach my $r0 (getVecRegisters($vectors, $capData))
+            {
+                # fixed register mappings can have aliases so use the actual register value for those.
+                my $liveR = ref $regMap->{$r0} ? $r0 : $regMap->{$r0};
+
+                # If not writing treat just like a read
+                if (exists $noDest{$op})
+                {
+                    if (my $liveTime = $liveTime{$liveR})
+                    {
+                        $liveTime->[$#$liveTime][1] = $i;
+                        push @{$liveTime->[$#$liveTime]}, "$i $inst";
+                    }
+                    else
+                    {
+                        warn "register used without initialization ($r0): $inst\n" unless $nowarn;
+                        push @{$liveTime{$liveR}}, [$i,$i];
+                    }
+                }
+                # If writing, push a new bracket on this register's stack.
+                elsif (my $liveTime = $liveTime{$liveR})
+                {
+                    if ($i > $liveTime->[$#$liveTime][1])
+                    {
+                        push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                    }
+                }
+                else
+                {
+                    # Initialize the liveTime stack for this register.
+                    push @{$liveTime{$liveR}}, [$i,$i, "$i $inst"];
+                }
+            }
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+    #print Dumper(\%liveTime); exit(1);
+
+    # assign unassigned registers
+    # sort by most restricted, then most used, then name
+    foreach my $r (sort {
+                    $pairedBanks{$b}{bnkCnt} <=> $pairedBanks{$a}{bnkCnt} ||
+                    $pairedBanks{$b}{useCnt} <=> $pairedBanks{$a}{useCnt} ||
+                    $a cmp $b
+                  } keys %pairedBanks)
+    {
+        my $banks = $pairedBanks{$r}{banks};
+        my $avail = $regMap->{$r};
+
+        #printf "%10s: (%d,%d) %d,%d,%d,%d, %s\n", $r, $pairedBanks{$r}{bnkCnt}, $pairedBanks{$r}{useCnt}, @{$banks}[0,1,2,3], join ',', @$avail;
+
+        # Pick a bank with zero or the smallest number of conflicts
+        BANK: foreach my $bank (sort {$banks->[$a] <=> $banks->[$b] || $a <=> $b } (0..3))
+        {
+            # pick an available register that matches the requested bank
+            foreach my $pos (0 .. $#$avail)
+            {
+                if ($bank == ($avail->[$pos] & 3))
+                {
+                    # assign it, while removing the assigned register from the pool
+                    $regMap->{$r} = 'R' . splice @$avail, $pos, 1;
+
+                    # update bank info for any unassigned pair
+                    $pairedBanks{$_}{banks}[$bank]++ foreach @{$pairedBanks{$r}{pairs}};
+                    last BANK;
+                }
+            }
+        }
+    }
+    # Now assign any remaining to first available
+    foreach my $r (sort keys %$regMap)
+    {
+        if (ref($regMap->{$r}) eq 'ARRAY')
+        {
+            $regMap->{$r} = 'R' . shift @{$regMap->{$r}};
+        }
+    }
+    #print map "$regMap->{$_}: $_\n", sort { substr($regMap->{$a},1) <=> substr($regMap->{$b},1) } keys %$regMap;
+
+    # apply the register mapping and assemble the instructions to op codes
+    foreach my $i (0 .. $#instructs)
+    {
+        #skip control instructions
+        next unless $i & 3;
+
+        # save the original and replace the register names with numbers
+        $instructs[$i]{orig} = $instructs[$i]{inst};
+        $instructs[$i]{inst} =~ s/(?<!\.)\b(\w+)\b(?!\[)/ exists($regMap->{$1}) ? $regMap->{$1} : $1 /ge;
+
+        my ($op, $inst, $ctrl) = @{$instructs[$i]}{qw(op inst ctrl)};
+
+        my $match = 0;
+        foreach my $gram (@{$grammar{$op}})
+        {
+            # Apply the rule pattern
+            my $capData = parseInstruct($inst, $gram) or next;
+
+            # update the register count
+            foreach my $r (qw(r0 r8 r20 r39))
+            {
+                next unless exists($capData->{$r}) && $capData->{$r} ne 'RZ';
+
+                # get numeric portion of regname
+                my $val = substr $capData->{$r}, 1;
+
+                my @r0 = getVecRegisters($vectors, $capData);
+                my @r8 = getAddrVecRegisters($vectors, $capData);
+
+                # smart enough to count vector registers for memory instructions.
+                # This must be a single expression: two consecutive "my $regInc"
+                # statements would let the r8 rule overwrite the r0 result, so a
+                # vector destination would only ever be counted as one register.
+                my $regInc = $r eq 'r0' ? scalar(@r0) || 1
+                           : $r eq 'r8' ? scalar(@r8) || 1
+                           :              1;
+
+                if ($val + $regInc > $regCnt)
+                {
+                    $regCnt = $val + $regInc;
+                    #print "$val $regCnt $regInc\n";
+                }
+            }
+            # update the barrier resource count
+            if ($op eq 'BAR')
+            {
+                if (exists $capData->{i8w4})
+                {
+                    $barCnt = $capData->{i8w4}+1 if $capData->{i8w4}+1 > $barCnt;
+                }
+                # if a barrier value is a register, assume the maximum
+                elsif (exists $capData->{r8})
+                {
+                    $barCnt = 16;
+                }
+            }
+            # Generate the op code.
+            my ($code, $reuse) = genCode($op, $gram, $capData);
+            $instructs[$i]{code} = $code;
+
+            # cache this for final pass when we want to calculate reuse stats.
+            if ($gram->{type}{reuse})
+                { $instructs[$i]{caps} = $capData; }
+            # use the parsed value of reuse for non-reuse type instructions
+            else
+                { $ctrl->{reuse}[($i & 3) - 1] = $reuse; }
+
+
+            $match = 1;
+            last;
+        }
+        unless ($match)
+        {
+            print "$_->{rule}\n\n" foreach @{$grammar{$op}};
+            die "Unable to encode instruction: $inst\n";
+        }
+    }
+
+    # final pass to piece together control codes
+    # (%reuseHistory is deliberately declared afresh here: registerHealth gets an empty history)
+    my (@codes, %reuseHistory, @exitOffsets, @ctaidOffsets, $ctaidzUsed);
+    foreach my $i (0 .. $#instructs)
+    {
+        # op code
+        if ($i & 3)
+        {
+            push @codes, $instructs[$i]{code};
+
+            if ($instructs[$i]{caps})
+            {
+                # calculate stats on registers
+                registerHealth(\%reuseHistory, $instructs[$i]{ctrl}{reuse}[($i & 3) - 1], $instructs[$i]{caps}, $i * 8, "$instructs[$i]{inst} ($instructs[$i]{orig})", $nowarn);
+            }
+            if ($instructs[$i]{inst} =~ m'EXIT')
+            {
+                push @exitOffsets, (scalar(@codes)-1)*8;
+            }
+            elsif ($instructs[$i]{inst} =~ m'SR_CTAID\.(X|Y|Z)')
+            {
+                push @ctaidOffsets, (scalar(@codes)-1)*8;
+                $ctaidzUsed = 1 if $1 eq 'Z';
+            }
+        }
+        # control code
+        else
+        {
+            # pack the three control codes and the three reuse codes into one word
+            my ($ctrl, $ruse) = @{$instructs[$i]}{qw(ctrl reuse)};
+            push @codes,
+                ($ctrl->[0] <<  0) | ($ctrl->[1] << 21) | ($ctrl->[2] << 42) | # ctrl codes
+                ($ruse->[0] << 17) | ($ruse->[1] << 38) | ($ruse->[2] << 59);  # reuse codes
+        }
+    }
+
+    # return the kernel data
+    return {
+        RegCnt       => $regCnt,
+        BarCnt       => $barCnt,
+        ExitOffsets  => \@exitOffsets,
+        CTAIDOffsets => \@ctaidOffsets,
+        CTAIDZUsed   => $ctaidzUsed,
+        ConflictCnt  => $reuseHistory{conflicts},
+        ReuseCnt     => $reuseHistory{reuse},
+        ReuseTot     => $reuseHistory{total},
+        ReusePct     => ($reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0),
+        KernelData   => \@codes,
+    };
+}
+
+# Useful for testing op code coverage of existing code, extracting new codes and flags
+#
+# Reads sass text from $fh, re-encodes every instruction with the grammar and
+# compares the result with the op code and reuse flags found in the text.
+#
+# Parameters:
+#   $fh             - input handle, read line by line
+#   $printConflicts - true to pass the instruction text to registerHealth
+#                     (an empty string is passed otherwise)
+#   $all            - true to list passing instructions as well as failures
+#
+# Prints a report grouped by op to the currently selected output handle and
+# returns the number of failed instructions.
+sub Test
+{
+    my ($fh, $printConflicts, $all) = @_;
+
+    my @instructs;
+    my %reuseHistory;
+    my ($pass, $fail) = (0,0);
+
+    while (my $line = <$fh>)
+    {
+        my (@ctrl, @reuse);
+
+        next unless processSassCtrlLine($line, \@ctrl, \@reuse);
+
+        # one instruction line follows for each reuse code of the control line
+        foreach my $fileReuse (@reuse)
+        {
+            $line = <$fh>;
+
+            my $inst = processSassLine($line) or next;
+
+            $inst->{reuse} = $fileReuse;
+            my $fileCode = $inst->{code};
+
+            if (exists $relOffset{$inst->{op}})
+            {
+                # these ops need to be converted from absolute addresses to relative in the sass output by cuobjdump
+                $inst->{inst} =~ s/(0x[0-9a-f]+)/sprintf '0x%06x', ((hex($1) - $inst->{num} - 8) & 0xffffff)/e;
+            }
+
+            my $match = 0;
+            foreach my $gram (@{$grammar{$inst->{op}}})
+            {
+                my $capData = parseInstruct($inst->{inst}, $gram) or next;
+                my @caps;
+
+                # Run in test mode to list what capture groups were captured
+                my ($code, $reuse) = genCode($inst->{op}, $gram, $capData, \@caps);
+
+                # Detect register bank conflicts but only for reuse type instructions.
+                # If a bank conflict is avoided by a reuse flag then ignore it.
+                registerHealth(\%reuseHistory, $reuse, $capData, $inst->{num}, $printConflicts ? $inst->{inst} : '') if $gram->{type}{reuse};
+
+                $inst->{caps}      = join ', ', sort @caps;
+                $inst->{codeDiff}  = $fileCode  ^ $code;
+                $inst->{reuseDiff} = $fileReuse ^ $reuse;
+
+                # compare calculated and file values
+                if ($code == $fileCode && $reuse == $fileReuse)
+                {
+                    $inst->{grade} = 'PASS';
+                    push @instructs, $inst if $all;
+                    $pass++;
+                }
+                else
+                {
+                    $inst->{grade} = 'FAIL';
+                    push @instructs, $inst;
+                    $fail++;
+                }
+                $match = 1;
+                last;
+            }
+            # no grammar rule matched at all: report the whole code as the difference
+            unless ($match)
+            {
+                $inst->{grade}     = 'FAIL';
+                $inst->{codeDiff}  = $fileCode;
+                $inst->{reuseDiff} = $fileReuse;
+                push @instructs, $inst;
+                $fail++;
+            }
+        }
+    }
+    # widest instruction text per op, used to size the Instruction column
+    my %maxLen;
+    foreach (@instructs)
+    {
+        $maxLen{$_->{op}} = length($_->{ins}) if length($_->{ins}) > $maxLen{$_->{op}};
+    }
+    my ($lastOp, $template);
+    foreach my $inst (sort {
+        $a->{op}        cmp $b->{op}        ||
+        $a->{codeDiff}  <=> $b->{codeDiff}  ||
+        $a->{reuseDiff} <=> $b->{reuseDiff} ||
+        $a->{ins}       cmp $b->{ins}
+        } @instructs)
+    {
+        # print a fresh column header each time the op changes
+        if ($lastOp ne $inst->{op})
+        {
+            $lastOp   = $inst->{op};
+            $template = "%s 0x%016x %x 0x%016x %x %5s%-$maxLen{$lastOp}s   %s\n";
+            printf "\n%s %-18s %s %-18s %s %-5s%-$maxLen{$lastOp}s   %s\n", qw(Grad OpCode R opCodeDiff r Pred Instruction Captures);
+        }
+        printf $template, @{$inst}{qw(grade code reuse codeDiff reuseDiff pred ins caps)};
+    }
+    my $reusePct = $reuseHistory{total} ? 100 * $reuseHistory{reuse} / $reuseHistory{total} : 0;
+
+    # A literal percent sign must be written as %% in a printf format.
+    printf "\nRegister Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\nOp Code Coverage Totals: Pass: $pass Fail: $fail\n",
+        $reuseHistory{conflicts}, $reusePct, $reuseHistory{reuse}, $reuseHistory{total};
+
+    return $fail;
+}
+
+# Convert cuobjdump sass to the working format
+#
+# Parameters:
+#   $in     - input handle with the sass text
+#   $out    - output handle that receives the converted text
+#   $params - array ref of "ord:offset:size:align" strings, one per kernel parameter
+#
+# A CONSTANT_MAPPING section is written first, followed by the instructions with
+# jump addresses replaced by generated TARGETn labels and known constant
+# addresses replaced by their names.
+sub Extract
+{
+    my ($in, $out, $params) = @_;
+
+    # launch dimension constants, name => address
+    my %builtin =
+    (
+        blockDimX => 'c[0x0][0x8]',
+        blockDimY => 'c[0x0][0xc]',
+        blockDimZ => 'c[0x0][0x10]',
+        gridDimX  => 'c[0x0][0x14]',
+        gridDimY  => 'c[0x0][0x18]',
+        gridDimZ  => 'c[0x0][0x1c]',
+    );
+
+    # reverse lookup, address => name, used when rewriting the instructions
+    my %nameOf;
+
+    print $out "<CONSTANT_MAPPING>\n";
+    for my $name (sort keys %builtin)
+    {
+        my $addr = $builtin{$name};
+        print $out "    $name : $addr\n";
+        $nameOf{$addr} = $name;
+    }
+    print $out "\n";
+
+    for my $spec (@$params)
+    {
+        my ($ord, $offset, $size, $align) = split ':', $spec;
+
+        # a parameter of at most 4 bytes maps to a single name
+        if ($size <= 4)
+        {
+            my $name = sprintf 'param_%d', $ord;
+            my $addr = sprintf 'c[0x0][%s]', $offset;
+            $nameOf{$addr} = $name;
+            print $out "    $name : $addr\n";
+            next;
+        }
+
+        # anything larger is exposed as an array with one entry per 4 bytes
+        my $addrVal = hex $offset;
+        for (my $word = 0; $size > 0; $word++, $size -= 4, $addrVal += 4)
+        {
+            my $name = sprintf 'param_%d[%d]', $ord, $word;
+            my $addr = sprintf 'c[0x0][0x%x]', $addrVal;
+            $nameOf{$addr} = $name;
+            print $out "    $name : $addr\n";
+        }
+    }
+    print $out "</CONSTANT_MAPPING>\n\n";
+
+    my (%labelAt, @insts);
+    my $nextLabel = 1;
+
+    FILE: while (my $line = <$in>)
+    {
+        my (@ctrlCodes, @reuseCodes);
+        processSassCtrlLine($line, \@ctrlCodes, \@reuseCodes) or next FILE;
+
+        # each control code is followed by one instruction line
+        CTRL: for my $ctrlCode (@ctrlCodes)
+        {
+            $line = <$in>;
+
+            my $inst = processSassLine($line);
+            next CTRL unless $inst;
+
+            # Convert branch/jump/call addresses to labels
+            if (exists($jumpOp{$inst->{op}}) && $inst->{ins} =~ m'(0x[0-9a-f]+)')
+            {
+                my $target = hex($1);
+                my $self   = $inst->{num};
+
+                # the closing BRA jumps to itself (or 8 bytes before): drop it and stop reading
+                last FILE if $inst->{op} eq 'BRA' && ($target == $self || $target == $self-8);
+
+                # reuse the label of a known target, otherwise generate the next one
+                $labelAt{$target} ||= 'TARGET' . $nextLabel++;
+                my $label = $labelAt{$target};
+
+                # replace address with name
+                $inst->{ins} =~ s/(0x[0-9a-f]+)/$label/;
+            }
+            $inst->{ins} =~ s/(c\[0x0\])\s*(\[0x[0-9a-f]+\])/ $nameOf{$1 . $2} || $1 . $2 /eg;
+
+            $inst->{ctrl} = printCtrl($ctrlCode);
+
+            push @insts, $inst;
+        }
+    }
+
+    # second pass: every label is known now, so emit each one in front of its instruction
+    for my $inst (@insts)
+    {
+        my $addr = $inst->{num};
+        print $out "$labelAt{$addr}:\n" if exists $labelAt{$addr};
+        printf $out "%s %5s%s\n", @{$inst}{qw(ctrl pred ins)};
+    }
+}
+
+# Patterns for the tags understood by the preprocessor.
+# The block patterns use /m and /s: the opening tag may be indented, the closing
+# tag must be the first non-blank text on its line, and a trailing newline is consumed.
+# <COMMENT> ... </COMMENT> block
+my $CommentRe  = qr'^[\t ]*<COMMENT>.*?^\s*</COMMENT>\n?'ms;
+# <INCLUDE file="name"/> tag; capture 1 is the file name
+my $IncludeRe  = qr'^[\t ]*<INCLUDE\s+file="([^"]+)"\s*/?>\n?'ms;
+# <CODE> or <CODEn> block; the optional digits (capture 1) must repeat on the closing tag, capture 2 is the body
+my $CodeRe     = qr'^[\t ]*<CODE(\d*)>(.*?)^\s*<\/CODE\1>\n?'ms;
+# <CONSTANT_MAPPING> block; capture 1 is the body
+my $ConstMapRe = qr'^[\t ]*<CONSTANT_MAPPING>(.*?)^\s*</CONSTANT_MAPPING>\n?'ms;
+# <REGISTER_MAPPING> block; capture 1 is the body
+my $RegMapRe   = qr'^[\t ]*<REGISTER_MAPPING>(.*?)^\s*</REGISTER_MAPPING>\n?'ms;
+# <SCHEDULE_BLOCK> block; capture 1 is the body
+my $ScheduleRe = qr'^[\t ]*<SCHEDULE_BLOCK>(.*?)^\s*</SCHEDULE_BLOCK>\n?'ms;
+# inline code written as [+ ... +] or [- ... -]; capture 1 is the sign, capture 2 the code
+my $InlineRe   = qr'\[(\+|\-)(.+?)\1\]'ms;
+
+# Return the complete content of an INCLUDE file as a single string.
+#
+# Parameters:
+#   $file    - path exactly as written in the INCLUDE tag
+#   $include - array ref with the leading File::Spec->catpath arguments
+#              (volume, directory) of the fallback location
+#
+# $file is tried first as given. If that fails, the bare file name is looked up
+# in the fallback location. Dies when neither can be opened.
+sub IncludeFile
+{
+    my ($file, $include) = @_;
+
+    # only the file name part is needed, for the fallback lookup
+    my (undef, undef, $name) = File::Spec->splitpath($file);
+
+    # undefine the record separator so the read below returns the whole file
+    local $/;
+
+    # The file name comes from the source text, so always open it read-only with
+    # the 3-arg form. With the 2-arg form a name such as ">foo" or "cmd |" would
+    # be interpreted as an open mode.
+    my $fh;
+    if (!open $fh, '<', $file)
+    {
+        my $fallback = File::Spec->catpath(@$include, $name);
+        open $fh, '<', $fallback or die "Could not open file for INCLUDE: $file or $fallback ($!)\n";
+    }
+    my $content = <$fh>;
+    close $fh;
+    return $content;
+}
+
+# Run the preprocessing passes over assembly source text and return the result.
+#
+# Parameters:
+#   $file    - the source text
+#   $include - passed to IncludeFile for every INCLUDE tag
+#   $debug   - passed to Scheduler
+#   $regMap  - optional hash ref that is filled from the REGISTER_MAPPING section.
+#              When it is supplied the section is removed from the returned text,
+#              otherwise a private map is used and the section is kept.
+#
+# The passes run in this order: includes, comments, CODE sections, inline code,
+# constant mapping, register mapping, SCHEDULE_BLOCK scheduling.
+#
+# NOTE(review): CODE sections and inline code are run with string eval, so the
+# source text can execute arbitrary Perl. Only preprocess trusted input.
+sub Preprocess
+{
+    my ($file, $include, $debug, $regMap) = @_;
+
+    my $constMap = {};
+    my $removeRegMap;
+    # a caller that supplies its own map wants the section stripped from the output
+    if ($regMap)
+        { $removeRegMap = 1; }
+    else
+        { $regMap = {}; }
+
+    # include nested files
+    # (repeat until no INCLUDE tag is left, since included text may contain more)
+    1 while $file =~ s|$IncludeRe| IncludeFile($1, $include) |eg;
+
+    # Strip out comments
+    $file =~ s|$CommentRe||g;
+
+    # Execute the CODE sections (old way to run code, to be deprecated)
+    # Each section is replaced by the value of its code; an eval error is fatal.
+    1 while $file =~ s|$CodeRe|
+        my $out = eval "package PascalAs::PascalAs::CODE; $2";
+        $@ ? die("CODE:\n$2\n\nError: $@\n") : $out |eg;
+
+    # Execute the inline code (new way)
+    # The "+" form inserts the value of the code, any other form inserts nothing.
+    $file =~ s|$InlineRe|
+        my ($type, $code) = ($1, $2);
+        my $out = eval "package PascalAs::PascalAs::CODE; $code";
+        $@ ? die("CODE:\n$code\n\nError: $@\n") : $type eq "+" ? $out : "" |eg;
+
+    #Pull in the constMap
+    $file =~ s/$ConstMapRe/ setConstMap($constMap, $1) /eg;
+
+    my @newFile;
+    foreach my $line (split "\n", $file)
+    {
+        # skip comments
+        # (lines starting with # or // are passed through unchanged)
+        if ($line !~ m'^\s*(?:#|//).*')
+        {
+            # replace each word, optionally followed by [digits], that is a key of the constant map
+            $line =~ s|(\w+(?:\[\d+\])?)| exists $constMap->{$1} ? $constMap->{$1} : $1 |eg;
+        }
+        push @newFile, $line;
+    }
+    $file = join "\n", @newFile;
+
+    # Pull in the reg map first as the Scheduler will need it to handle vector instructions
+    # Remove the regmap if we're going on to assemble
+    $file =~ s/$RegMapRe/ setRegisterMap($regMap, $1); $removeRegMap ? '' : $& /eg;
+
+    # Pick out the SCHEDULE_BLOCK sections
+    my @schedBlocks = $file =~ /$ScheduleRe/g;
+
+    # Schedule them
+    foreach my $i (0 .. $#schedBlocks)
+    {
+        # XMAD macros should only appear in SCHEDULE_BLOCKs
+        $schedBlocks[$i] = replaceXMADs($schedBlocks[$i]);
+
+        # block numbers passed to the Scheduler start at 1
+        $schedBlocks[$i] = Scheduler($schedBlocks[$i], $i+1, $regMap, $debug);
+    }
+
+    # Replace the results
+    # (the blocks are consumed in the same order they were picked out above)
+    $file =~ s|$ScheduleRe| shift @schedBlocks |eg;
+
+    return $file;
+}
+
+# break the registers down into source and destination categories for the scheduler
+# (lookup sets: capture name => 1)
+my %srcReg   = map { $_ => 1 } qw(r8 r20 r39 p12 p29 p39 X);
+my %destReg  = map { $_ => 1 } qw(r0 p0 p3 p45 p48 CC);
+# every capture name that refers to a register, source or destination
+my %regops   = (%srcReg, %destReg);
+# keys of a grammar rule's type entry that the Scheduler copies onto each instruction
+my @itypes   = qw(class lat rlat tput dual);
+
+# Schedule one SCHEDULE_BLOCK: build the register dependency graph for its instructions, then emit
+# them in an order (and with stall counts) chosen by the ready list heuristics below.
+# Returns the re-ordered block text; dies on labels, bad <ORDERED> nesting or unrecognized lines.
+sub Scheduler
+{
+    my ($block, $blockNum, $regMap, $debug) = @_;
+
+    my $vectors = $regMap->{__vectors};
+    my $lineNum = 0;
+
+    my (@instructs, @comments, $ordered, $first);
+    foreach my $line (split "\n", $block)
+    {
+        # keep track of line nums in the physical file
+        $lineNum++;
+
+        unless (preProcessLine($line))
+        {
+            push @comments, $line if $line =~ m'\S';
+            next;
+        }
+
+        # match an instruction
+        if (my $inst = processAsmLine($line, $lineNum))
+        {
+            # if the first instruction in the block is waiting on a dep, it should go first.
+            $inst->{first}   = !$first++ && ($inst->{ctrl} & 0x1f800) ? 0 : 1;
+
+            # if the instruction has a stall of zero set, it's meant to be last (to mesh with next block)
+            #$inst->{first}   = $inst->{ctrl} & 0x0000f ? 1 : 2;
+            $inst->{exeTime} = 0;
+            $inst->{order}   = $ordered++ if $ordered;
+            push @instructs, $inst;
+        }
+        # match a label
+        elsif ($line =~ m'^([a-zA-Z]\w*):')
+        {
+            die "SCHEDULE_BLOCK's cannot contain labels. block: $blockNum line: $lineNum\n";
+        }
+        # open an ORDERED block
+        elsif ($line =~ m'^<ORDERED>')
+        {
+            die "you cannot use nested <ORDERED> tags" if $ordered;
+            $ordered = 1;
+        }
+        # close an ORDERED block
+        elsif ($line =~ m'^</ORDERED>')
+        {
+            die "missing opening <ORDERED> for closing </ORDERED> tag" if !$ordered;
+            $ordered = 0;
+        }
+        else
+        {
+            die "badly formed line at block: $blockNum line: $lineNum: $line\n";
+        }
+    }
+
+    my (%writes, %reads, @ready, @schedule, $orderedParent);
+    # assemble the instructions to op codes
+    foreach my $instruct (@instructs)
+    {
+        my $match = 0;
+        foreach my $gram (@{$grammar{$instruct->{op}}})
+        {
+            my $capData = parseInstruct($instruct->{inst}, $gram) or next;
+            my (@dest, @src);
+
+            # copy over instruction types for easier access
+            @{$instruct}{@itypes} = @{$gram->{type}}{@itypes};
+
+            # A predicate prefix is treated as a source reg
+            push @src, $instruct->{predReg} if $instruct->{pred};
+
+            # Handle P2R and R2P specially
+            if ($instruct->{op} =~ m'P2R|R2P' && $capData->{i20w7})
+            {
+                my $list = $instruct->{op} eq 'R2P' ? \@dest : \@src;
+                my $mask = hex($capData->{i20w7});
+                foreach my $p (0..6)
+                {
+                    if ($mask & (1 << $p))
+                    {
+                        push @$list, "P$p";
+                    }
+                    # make this instruction dependent on any predicates it's not setting
+                    # this is to prevent a race condition for any predicate sets that are pending
+                    elsif ($instruct->{op} eq 'R2P')
+                    {
+                        push @src, "P$p";
+                    }
+                }
+                # These instructions can't be dual issued
+                $instruct->{nodual} = 1;
+            }
+
+            # Populate our register source and destination lists, skipping any zero or true values
+            foreach my $operand (grep { exists $regops{$_} } sort keys %$capData)
+            {
+                # figure out which list to populate
+                my $list = exists($destReg{$operand}) && !exists($noDest{$instruct->{op}}) ? \@dest : \@src;
+
+                # Filter out RZ and PT
+                my $badVal = substr($operand,0,1) eq 'r' ? 'RZ' : 'PT';
+
+                if ($capData->{$operand} ne $badVal)
+                {
+                    # add the value to list with the correct prefix
+                    push @$list,
+                        $operand eq 'r0' ? map(getRegNum($regMap, $_), getVecRegisters($vectors, $capData)) :
+                        $operand eq 'r8' ? map(getRegNum($regMap, $_), getAddrVecRegisters($vectors, $capData)) :
+                        $operand eq 'CC' ? 'CC' :
+                        $operand eq 'X'  ? 'CC' :
+                        getRegNum($regMap, $capData->{$operand});
+                }
+            }
+            $instruct->{const} = 1 if exists($capData->{c20}) || exists($capData->{c39});
+
+            # Find Read-After-Write dependencies
+            foreach my $src (grep { exists $writes{$_} } @src)
+            {
+                # Memory operations get delayed access to registers but not to the predicate
+                my $regLatency = $src eq $instruct->{predReg} ? 0 : $instruct->{rlat};
+
+                # the parent should be the most recently added dest op to the stack
+                foreach my $parent (@{$writes{$src}})
+                {
+                    # add this instruction as a child of the parent
+                    # set the edge to the total latency of reg source availability
+                    my $latency = $src =~ m'^P\d' ? 13 : $parent->{lat};
+                    push @{$parent->{children}}, [$instruct, $latency - $regLatency];
+                    $instruct->{parents}++;
+
+                    # if the destination was conditionally executed, we also need to keep going back till it wasn't
+                    last unless $parent->{pred};
+                }
+            }
+
+            # Find Write-After-Read dependencies
+            foreach my $dest (grep { exists $reads{$_} } @dest)
+            {
+                # Flag this instruction as dependent to any previous read
+                foreach my $reader (@{$reads{$dest}})
+                {
+                    # no need to stall for these types of dependencies
+                    push @{$reader->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                # Once dependence is marked we can clear out the read list (unless this write was conditional).
+                # The assumption here is that you would never want to write out a register without
+                # subsequently reading it in some way prior to writing it again.
+                delete $reads{$dest} unless $instruct->{pred};
+            }
+
+            # Enforce instruction ordering where requested
+            if ($instruct->{order})
+            {
+                if ($orderedParent)
+                {
+                    push @{$orderedParent->{children}}, [$instruct, 0];
+                    $instruct->{parents}++;
+                }
+                $orderedParent = $instruct;
+            }
+            elsif ($orderedParent)
+                {  $orderedParent = 0; }
+
+            # For a dest reg, push it onto the write stack
+            unshift @{$writes{$_}}, $instruct foreach @dest;
+
+            # For a src reg, push it into the read list
+            push @{$reads{$_}}, $instruct foreach @src;
+
+            # if this instruction has no dependencies it's ready to go
+            push @ready, $instruct if !exists $instruct->{parents};
+
+            $match = 1;
+            last;
+        }
+        # report the instruction's own line; $lineNum is left at the last line of the block by the loop above
+        die "Unable to recognize instruction at block: $blockNum line: $instruct->{lineNum}: $instruct->{inst}\n" unless $match;
+    }
+    %writes = ();
+    %reads  = ();
+
+    if (@ready)
+    {
+        # update dependent counts for sorting heuristic
+        my $readyParent = { children => [ map { [ $_, 1 ] } @ready ], inst => "root" };
+
+        countUniqueDescendants($readyParent, {});
+        updateDepCounts($readyParent, {});
+
+        # sort the initial ready list
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "0: Initial Ready List State:\n\tf,ext,stl,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    # Process the ready list, adding new instructions to the list as we go.
+    my $clock = 0;
+    while (my $instruct = shift @ready)
+    {
+        my $stall = $instruct->{stall};
+
+        # apply the stall to the previous instruction
+        if (@schedule && $stall < 16)
+        {
+            my $prev = $schedule[$#schedule];
+
+            # if stall is greater than 4 then also yield
+            # the yield flag is required to get stall counts 12-15 working correctly.
+            $prev->{ctrl} &= $stall > 4 ? 0x1ffe0 : 0x1fff0;
+            $prev->{ctrl} |= $stall;
+            $clock += $stall;
+        }
+        # For stalls bigger than 15 we assume the user is managing it with a barrier
+        else
+        {
+            $instruct->{ctrl} &= 0x1fff0;
+            $instruct->{ctrl} |= 1;
+            $clock += 1;
+        }
+        print "$clock: $instruct->{inst}\n" if $debug;
+
+        # add a new instruction to the schedule
+        push @schedule, $instruct;
+
+        # update each child with a new earliest execution time
+        if (my $children = $instruct->{children})
+        {
+            foreach (@$children)
+            {
+                my ($child, $latency) = @$_;
+
+                # update the earliest clock value this child can safely execute
+                my $earliest = $clock + $latency;
+                $child->{exeTime} = $earliest if $child->{exeTime} < $earliest;
+
+                print "\t\t$child->{exeTime},$child->{parents} $child->{inst}\n" if $debug;
+
+                # decrement parent count and add to ready queue if none remaining.
+                push @ready, $child if --$child->{parents} < 1;
+            }
+            delete $instruct->{children};
+        }
+
+        # update stall and mix values in the ready queue on each iteration
+        foreach my $ready (@ready)
+        {
+            # calculate how many instructions this would cause the just added instruction to stall.
+            $stall = $ready->{exeTime} - $clock;
+            $stall = 1 if $stall < 1;
+
+            # if using the same compute resource as the prior instruction then limit the throughput
+            if ($ready->{class} eq $instruct->{class})
+            {
+                $stall = $ready->{tput} if $stall < $ready->{tput};
+            }
+            # dual issue with a simple instruction (tput <= 2)
+            # can't dual issue two instructions that both load a constant
+            elsif ($ready->{dual} && !$instruct->{dual} && $instruct->{tput} <= 2 && !$instruct->{nodual} &&
+                   $stall == 1 && $ready->{exeTime} <= $clock && !($ready->{const} && $instruct->{const}))
+            {
+                $stall = 0;
+            }
+            $ready->{stall} = $stall;
+
+            # add an instruction class mixing heuristic that catches anything not handled by the stall
+            $ready->{mix} = $ready->{class} ne $instruct->{class} || 0;
+        }
+
+        # sort the ready list by stall time, mixing heuristic, dependencies and line number
+        @ready = sort {
+            $a->{first}   <=> $b->{first}  ||
+            $a->{stall}   <=> $b->{stall}  ||
+            $b->{mix}     <=> $a->{mix}    ||
+            $b->{deps}    <=> $a->{deps}   ||
+            $a->{lineNum} <=> $b->{lineNum}
+            } @ready;
+
+        if ($debug)
+        {
+            print  "\tf,ext,stl,mix,dep,lin, inst\n";
+            printf "\t%d,%3s,%3s,%3s,%3s,%3s, %s\n", @{$_}{qw(first exeTime stall mix deps lineNum inst)} foreach @ready;
+        }
+    }
+
+    my $out = '';   # an empty schedule yields an empty string rather than undef
+    $out .= join('', printCtrl($_->{ctrl}), @{$_}{qw(space inst comment)}, "\n") foreach @schedule;
+    return $out;
+}
+
+# Load "name : value" pairs from a constant-map section into a hash.
+#   $constMap     - hash ref updated in place, one key per non-blank line
+#   $constMapText - section text; "#" or "//" begins a comment running to end of line
+sub setConstMap
+{
+    my ($constMap, $constMapText) = @_;
+
+    for my $entry (split "\n", $constMapText)
+    {
+        # trim the front, drop any comment, then trim the back
+        $entry =~ s|^\s+||;
+        $entry =~ s{(?:#|//).*}{};
+        $entry =~ s|\s+$||;
+        # nothing left once trimmed means a blank or comment-only line
+        next if $entry eq '';
+
+        my @fields = split '\s*:\s*', $entry;
+        $constMap->{$fields[0]} = $fields[1];
+    }
+    return;
+}
+
+# Parse the text of a register mapping section into $regMap.
+#   $regMap     - hash ref updated in place: name => 'R<n>' for fixed (":") and shared ("=") lines,
+#                 name => array ref of candidate register numbers for auto ("~") lines; the
+#                 __vectors and __regbank sub-hashes are created here if missing
+#   $regmapText - section text, one "numbers <separator> names" mapping per line
+# Dies on a bad register number, range or name, on a number/name count mismatch, or when a name
+# is defined twice.
+sub setRegisterMap
+{
+    my ($regMap, $regmapText) = @_;
+
+    my $vectors = $regMap->{__vectors} ||= {};
+    my $regBank = $regMap->{__regbank} ||= {};
+    my %aliases;
+
+    foreach my $line (split "\n", $regmapText)
+    {
+        # strip leading space
+        $line =~ s|^\s+||;
+        # strip comments
+        $line =~ s{(?:#|//).*}{};
+        # strip trailing space
+        $line =~ s|\s+$||;
+        # skip blank lines
+        next unless $line =~ m'\S';
+
+        my $auto  = $line =~ /~/;
+        my $share = $line =~ /=/;
+
+        my ($regNums, $regNames) = split '\s*[:~=]\s*', $line;
+
+        my (@numList, @nameList);
+        foreach my $num (split '\s*,\s*', $regNums)
+        {
+            my ($start, $stop) = split '\s*\-\s*', $num;
+            die "REGISTER_MAPPING Error: Bad register number or range: $num\nLine: $line\nFull Context:\n$regmapText\n" if grep m'\D', $start, $stop;
+            push @numList, ($start .. $stop||$start);
+        }
+        foreach my $fullName (split '\s*,\s*', $regNames)
+        {
+            if ($fullName =~ m'^(\w+)<((?:\d+(?:\s*\-\s*\d+)?\s*\|?\s*)+)>(\w*)(?:\[([0-3])\])?$')
+            {
+                my ($name1, $name2, $bank) = ($1, $3, $4);
+                foreach (split '\s*\|\s*', $2)
+                {
+                    my ($start, $stop) = split '\s*\-\s*';
+                    foreach my $r (map "$name1$_$name2", $start .. $stop||$start)
+                    {
+                        # define an alias for use in vector instructions that omits the number portion
+                        $aliases{$r} = "$name1$name2" unless exists $aliases{$r};
+                        push @nameList, $r;
+                        $regBank->{$r} = $bank if $auto && defined $bank;
+                        warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $bank;
+                    }
+                }
+            }
+            elsif ($fullName =~ m'^(\w+)(?:\[([0-3])\])?$')
+            {
+                push @nameList, $1;
+                $regBank->{$1} = $2 if $auto && defined $2;
+                warn "Cannot request a bank for a fixed register range: $fullName\n" if !$auto && defined $2;
+            }
+            else
+            {
+                die "Bad register name: '$fullName' at: $line\n";
+            }
+        }
+        die "Missmatched register mapping at: $line\n" if !$share && @numList < @nameList;
+        die "Missmatched register mapping at: $line\n" if $share && @numList > 1;
+
+        # detect if this list is monotonically ascending with no gaps: every adjacent pair, the last one included
+        my $ascending = @numList > 1 && !grep($numList[$_] + 1 != $numList[$_ + 1], 0 .. $#numList - 1);
+
+        foreach my $n (0..$#nameList)
+        {
+            die "register defined twice: $nameList[$n]" if exists $regMap->{$nameList[$n]};
+
+            if ($auto)
+            {
+                # assign possible values to be assigned on assembly
+                $regMap->{$nameList[$n]} = \@numList;
+            }
+            elsif ($share)
+            {
+                # each name shares the same single register
+                $regMap->{$nameList[$n]} = 'R' . $numList[0];
+            }
+            else
+            {
+                $regMap->{$nameList[$n]} = 'R' . $numList[$n];
+                # flag any even register as a potential vector
+                if ($ascending && ($numList[$n] & 1) == 0)
+                {
+                    # constrain potential range to vector alignment
+                    my $end = $n + ($numList[$n] & 2 || $n + 3 > $#nameList ? 1 : 3);
+                    if ($end <= $#nameList)
+                    {
+                        $vectors->{$nameList[$n]} = [ @nameList[$n .. $end] ];
+                        #setup an alias for the base name without the number
+                        if (exists $aliases{$nameList[$n]} && !exists $regMap->{$aliases{$nameList[$n]}})
+                        {
+                            $regMap->{$aliases{$nameList[$n]}}  = $regMap->{$nameList[$n]};
+                            $vectors->{$aliases{$nameList[$n]}} = $vectors->{$nameList[$n]};
+                            delete $aliases{$nameList[$n]};
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+# Trim a source line in place and report whether it carries any code.
+# Returns true when something other than whitespace remains once comments are ignored.
+sub preProcessLine
+{
+    # @_ aliases the caller's variable, so this trims the caller's copy of the line
+    $_[0] =~ s|^\s+||;
+
+    # look for code in a scratch copy so the caller keeps its comment text
+    my $code = $_[0];
+    $code =~ s{(?:#|//).*}{};
+
+    # anything non-blank left over is code
+    return $code =~ m'\S';
+}
+
+# Walk the dependency graph below $node and record, per node, the set of unique
+# descendants (keyed by lineNum) in $node->{deps}.
+#   $edges - hash ref of "parent^child" keys marking edges that were already walked
+# Returns the node's own lineNum followed by the lineNums of its recorded descendants.
+sub countUniqueDescendants
+{
+    my ($node, $edges) = @_;
+
+    # a node with no child list contributes only itself
+    my $links = $node->{children} or return $node->{lineNum};
+
+    foreach my $link (@$links)
+    {
+        my ($kid, $latency) = @$link;
+
+        # zero-latency (WaR) edges and edges seen before add nothing
+        next unless $latency;
+        next if $edges->{"$node->{lineNum}^$kid->{lineNum}"}++;
+
+        $node->{deps}{$_}++ foreach countUniqueDescendants($kid, $edges);
+    }
+    return ($node->{lineNum}, keys %{$node->{deps}});
+}
+# Collapse each node's {deps} set into a plain count so it can be sorted on.
+#   $node  - graph node; its children are visited first
+#   $edges - hash ref of "parent^child" keys marking edges that were already walked
+sub updateDepCounts
+{
+    my ($node, $edges) = @_;
+
+    # visit every child edge once, whatever its latency
+    foreach my $link (@{ $node->{children} || [] })
+    {
+        my $kid = $link->[0];
+        next if $edges->{"$node->{lineNum}^$kid->{lineNum}"}++;
+        updateDepCounts($kid, $edges);
+    }
+    # a node reached a second time already holds a number; a node without deps becomes 0
+    $node->{deps} = ref($node->{deps}) ? scalar(keys %{$node->{deps}}) : $node->{deps}+0;
+}
+
+# Detect register bank conflicts and calculate reuse stats
+#   $reuseHistory - hash ref of running state: per-slot reuse sets plus total/reuse/conflicts counters
+#   $reuseFlags   - bit mask tested against %reuseSlots to decide which slots keep their register
+#   $capData      - captured operands; only the r8, r20 and r39 slots are examined
+#   $instAddr, $inst, $nowarn - only used to print a CONFLICT line (when $inst is set and $nowarn is not)
+# Returns the number of entries in the conflict list (0 when there is no conflict).
+sub registerHealth
+{
+    my ($reuseHistory, $reuseFlags, $capData, $instAddr, $inst, $nowarn) = @_;
+    my (@banks, @conflicts);
+    foreach my $slot (qw(r8 r20 r39))
+    {
+        my $r = $capData->{$slot} or next;
+        next if $r eq 'RZ';
+        my $slotHist = $reuseHistory->{$slot} ||= {};
+        $reuseHistory->{total}++;
+
+        # if this register is in active reuse then ignore for bank conflict checking.
+        if (exists $slotHist->{$r})
+        {
+            $reuseHistory->{reuse}++;
+        }
+        else
+        {
+            # extract number from reg and take the modulo-4 value.  This is the bank id.
+            my $bank = substr($r,1) & 3;
+            # check for conflict
+            if ($banks[$bank] && $banks[$bank] ne $r)
+            {
+                push @conflicts, $banks[$bank] if !@conflicts;
+                push @conflicts, $r;
+                $reuseHistory->{conflicts}++;
+            }
+            $banks[$bank] = $r;
+        }
+
+        # update the history
+        if ($reuseFlags & $reuseSlots{$slot})
+            { $slotHist->{$r} = 1; }
+        else
+            { delete $slotHist->{$r};  }
+    }
+    if ($inst && @conflicts && !$nowarn)
+    {
+        # pass $inst as an argument so a '%' in the instruction text cannot act as a format directive
+        printf "CONFLICT at 0x%04x (%s): %s\n", $instAddr, join(',', @conflicts), $inst;
+    }
+    return scalar @conflicts;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+PascalAs::PascalAs - Assembler for NVIDIA Maxwell architecture
+
+=head1 SYNOPSIS
+
+    Pascalas.pl [opts]
+
+=head1 DESCRIPTION
+
+See the documentation at: https://github.com/NervanaSystems/pascalas
+
+=head1 SEE ALSO
+
+See the documentation at: https://github.com/NervanaSystems/pascalas
+
+
+=head1 AUTHOR
+
+Scott Gray, E<lt>sgray@nervanasys.comE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+The MIT License (MIT)
+
+Copyright (c) 2014 Scott Gray
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+=cut
diff --git a/Assembler/PascalAs/lib/PascalAs/PascalAsGrammar.pm b/Assembler/PascalAs/lib/PascalAs/PascalAsGrammar.pm
new file mode 100644
index 0000000..bf25fb8
--- /dev/null
+++ b/Assembler/PascalAs/lib/PascalAs/PascalAsGrammar.pm
@@ -0,0 +1,1437 @@
+package PascalAs::PascalAsGrammar;
+
+use strict;
+use Carp;
+use Exporter;
+use Data::Dumper;
+our @ISA = qw(Exporter);
+
+our @EXPORT = qw(
+    %grammar %flags
+    parseInstruct genCode genReuseCode
+    processAsmLine processSassLine processSassCtrlLine
+    replaceXMADs printCtrl readCtrl getRegNum getVecRegisters getAddrVecRegisters
+);
+
+require 5.10.0;
+
+# Helper functions for operands
+# getI: encode an integer immediate given as decimal, hex or "N x<perl expression>" text.
+#   $pos is the bit position in the op code, $mask the width of the field; dies if the value won't fit.
+sub getI
+{
+    my ($orig, $pos, $mask) = @_;
+    my $val = $orig;
+    my $neg = $val =~ s|^\-||;
+    # parse out our custom index immediates for addresses
+    if ($val  =~ m'^(\d+)[xX]<([^>]+)>')
+    {
+        # allow any perl expression and multiply result by leading decimal.
+        # also allow global scalar variables in the expression.
+        my ($mul, $exp) = ($1, $2);
+        # strip leading zeros (don't interpret numbers as octal)
+        $exp =~ s/(?<!\d)0+(?=[1-9])//g;
+        my @globals = $exp =~ m'\$\w+'g;
+        my $our = @globals ? ' our (' . join(',',@globals) . ');' : '';
+        $val = $mul * eval "package PascalAs::PascalAs::CODE;$our $exp";
+        die "Bad immediate expression ($orig): $@" if $@;   # otherwise a broken expression encodes silently as 0
+    }
+    # hexadecimal value (either prefix case, as accepted by the grammar's hex rule)
+    elsif ($val  =~ m'^0[xX][0-9a-fA-F]+')
+    {
+        $val = hex($val);
+    }
+    # otherwise val is a simple decimal value that doesn't need to be modified
+
+    if ( $neg )
+    {
+        # if the mask removes the sign bit the "neg" flag adds it back on the code somewhere else
+        $val = -$val;
+        $val &= $mask;
+    }
+    if (($val & $mask) != $val)
+    {
+        die sprintf "Immediate value out of range(0x%x): 0x%x (%s)\n", $mask, $val, $orig;   # $orig may hold '%'
+    }
+    return $val << $pos;
+}
+# getF: encode a float immediate at bit $pos; $type is the pack template ('f' or 'd'), $trunc (optional) the low bits dropped.
+sub getF
+{
+    my ($val, $pos, $type, $trunc) = @_;
+    # hexadecimal value (either prefix case, as accepted by the grammar's hex rule)
+    if ($val  =~ m'^0[xX][0-9a-fA-F]+')
+    {
+        $val = hex($val);
+    }
+    # support infinity; only a full width immediate carries its own sign bit
+    elsif ($val =~ m'INF'i)
+    {
+        $val = $trunc ? ($type eq 'f' ? 0x7f800 : 0x7ff00) : ($val =~ m'^\-' ? 0xff800000 : 0x7f800000);
+    }
+    else
+    {
+        $val = unpack(($type eq 'f' ? 'L' : 'Q'), pack $type, $val);
+        # strip off sign bit if truncating.  It will be added elsewhere in the code by the flag capture.
+        $val = ($val >> $trunc) & 0x7ffff if $trunc;
+    }
+    return $val << $pos;
+}
+# getR: encode a register operand ("R0" .. "R254", or "RZ" which encodes as 0xff) at bit $pos.
+# Dies on anything else.
+sub getR
+{
+    my ($val, $pos) = @_;
+
+    # note: the numeric test on "Z" sees 0, so RZ passes the range check as well
+    die "Bad register name found: $val\n"
+        unless $val =~ m'^R(\d+|Z)$' && $1 < 255;
+
+    my $num = $1 eq 'Z' ? 0xff : $1;
+    return $num << $pos;
+}
+# getP: encode a predicate operand ("P0" .. "P6", or "PT" which encodes as 7) at bit $pos.
+# Dies on anything else.
+sub getP
+{
+    my ($val, $pos) = @_;
+
+    # note: the numeric test on "T" sees 0, so PT passes the range check as well
+    die "Bad predicate name found: $val\n"
+        unless $val =~ m'^P(\d|T)$' && $1 < 7;
+
+    my $num = $1 eq 'T' ? 7 : $1;
+    return $num << $pos;
+}
+sub getC { my ($addr) = @_; my $field = (hex($addr) >> 2) & 0x7fff; return $field << 20; }   # hex offset / 4, masked to 15 bits, at bit 20
+
+# Map each operand capture name to a sub that encodes the operand text at its position in the op code.
+my %operands =
+(
+    p0      => sub { getP($_[0], 0)  },
+    p3      => sub { getP($_[0], 3)  },
+    p12     => sub { getP($_[0], 12) },
+    p29     => sub { getP($_[0], 29) },
+    p39     => sub { getP($_[0], 39) },
+    p45     => sub { getP($_[0], 45) },
+    p48     => sub { getP($_[0], 48) },
+    p58     => sub { getP($_[0], 58) },
+    r0      => sub { getR($_[0], 0)  },
+    r8      => sub { getR($_[0], 8)  },
+    r20     => sub { getR($_[0], 20) },
+    r28     => sub { getR($_[0], 28) },
+    r39s20  => sub { getR($_[0], 39) },
+    r39     => sub { getR($_[0], 39) },
+    r39a    => sub { getR($_[0], 39) }, # does not modify op code, xor the r39 value again to wipe it out, register must be in sequence with r20
+    c20     => sub { getC($_[0])     },
+    c39     => sub { getC($_[0])     },
+    c34     => sub { hex($_[0]) << 34 },
+    c36     => sub { hex($_[0]) << 36 },
+    f20w32  => sub { getF($_[0], 20, 'f')        },
+    f20     => sub { getF($_[0], 20, 'f', 12)    },
+    d20     => sub { getF($_[0], 20, 'd', 44)    },
+    i8w4    => sub { getI($_[0], 8,  0xf)        },
+    i20     => sub { getI($_[0], 20, 0x7ffff)    },
+    i20w6   => sub { getI($_[0], 20, 0x3f)       },
+    i20w7   => sub { getI($_[0], 20, 0x7f)       },
+    i20w8   => sub { getI($_[0], 20, 0xff)       },
+    i20w12  => sub { getI($_[0], 20, 0xfff)      },
+    i20w24  => sub { getI($_[0], 20, 0xffffff)   },
+    i20w32  => sub { getI($_[0], 20, 0xffffffff) },
+    i31w4   => sub { getI($_[0], 31, 0xf)        },
+    i34w13  => sub { getI($_[0], 34, 0x1fff)     },
+    i36w20  => sub { getI($_[0], 36, 0xfffff)    },
+    i39w8   => sub { getI($_[0], 39, 0xff)       },
+    i28w8   => sub { getI($_[0], 28, 0xff)       },
+    i28w20  => sub { getI($_[0], 28, 0xfffff)    },
+    i48w8   => sub { getI($_[0], 48, 0xff)       },
+    i51w5   => sub { getI($_[0], 51, 0x1f)       },
+    i53w5   => sub { getI($_[0], 53, 0x1f)       },
+);
+
+# Rules for operands and their closely tied flags; each (?<name>...) group names the value it captures.
+my $hex     = qr"0[xX][0-9a-fA-F]+";
+my $iAddr   = qr"\d+[xX]<[^>]+>";
+my $immed   = qr"$hex|$iAddr|\d+"o;
+my $reg     = qr"[a-zA-Z_]\w*"; # must start with letter or underscore
+my $p       = qr"P[0-6T]";
+my $noPred  = qr"(?<noPred>)";
+my $pred    = qr"\@(?<predNot>\!)?P(?<predNum>[0-6]) ";
+my $p0      = qr"(?<p0>$p)"o;
+my $p3      = qr"(?<p3>$p)"o;
+my $p12     = qr"(?<p12not>\!)?(?<p12>$p)"o;
+my $p29     = qr"(?<p29not>\!)?(?<p29>$p)"o;
+my $p39     = qr"(?<p39not>\!)?(?<p39>$p)"o;
+my $p45     = qr"(?<p45>$p)"o;
+my $p48     = qr"(?<p48>$p)"o;
+my $p58     = qr"(?<p58>$p)"o;
+my $r0      = qr"(?<r0>$reg)";
+my $r0cc    = qr"(?<r0>$reg)(?<CC>\.CC)?";
+my $r8      = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse1>\.reuse)?";
+my $r20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3|H0_H0|H1_H1))?(?<reuse2>\.reuse)?";
+my $r28     = qr"(?<r28>$reg)";
+my $r39s20  = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r39s20>(?<r20>$reg))\|?(?:\.(?<r39part>H0|H1))?(?<reuse2>\.reuse)?";
+my $r39     = qr"(?<r39neg>\-)?(?<r39>$reg)(?:\.(?<r39part>H0|H1))?(?<reuse3>\.reuse)?";
+my $r39a    = qr"(?<r39a>(?<r39>$reg))(?<reuse3>\.reuse)?";
+my $c20     = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20part>H0|H1|B0|B1|B2|B3))?"o;
+my $c20x    = qr"(?<r20neg>\-)?(?<r20abs>\|)?c\[(?<c34>$hex)\]\s*\[(?<c20>$hex)\]\|?(?:\.(?<r20partx>H0|H1|B0|B1|B2|B3))?"o;
+my $c20s39  = qr"(?<r39neg>\-)?c\[(?<c34>$hex)\]\s*\[(?<c39>$hex)\]"o;
+my $f20w32  = qr"(?<f20w32>(?:\-|\+|)(?i:$hex|inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))";
+my $f20     = qr"(?<f20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;
+my $d20     = qr"(?<d20>(?:(?<neg>\-)|\+|)(?i:inf\s*|\d+(?:\.\d+(?:e[\+\-]\d+)?)?))(?<r20neg>\.NEG)?"o;
+my $i8w4    = qr"(?<i8w4>$immed)"o;
+my $i20     = qr"(?<i20>(?<neg>\-)?$immed)(?<r20neg>\.NEG)?"o;
+my $i20w6   = qr"(?<i20w6>$immed)"o;
+my $i20w7   = qr"(?<i20w7>$immed)"o;
+my $i20w8   = qr"(?<i20w8>$immed)"o;
+my $i20w12  = qr"(?<i20w12>$immed)"o;
+my $i20w24  = qr"(?<i20w24>\-?$immed)"o;
+my $i20w32  = qr"(?<i20w32>\-?$immed)"o;
+my $i39w8   = qr"(?<i39w8>\-?$immed)"o;
+my $i28w8   = qr"(?<i28w8>$immed)"o;
+my $i28w20  = qr"(?<i28w20>\-?$immed)"o;
+my $i31w4   = qr"(?<i31w4>$immed)"o;
+my $i34w13  = qr"(?<i34w13>$immed)"o;
+my $i36w20  = qr"(?<i36w20>$immed)"o;
+my $i48w8   = qr"(?<i48w8>$immed)"o;
+my $i51w5   = qr"(?<i51w5>$immed)"o;
+my $i53w5   = qr"(?<i53w5>$immed)"o;
+my $ir20    = qr"$i20|$r20"o;
+my $cr20    = qr"$c20|$r20"o;
+my $icr20   = qr"$i20|$c20|$r20"o;
+my $fcr20   = qr"$f20|$c20|$r20"o;
+my $cr39    = qr"$c20s39|$r39"o;
+my $dr20    = qr"$d20|$r20"o;
+
+# Instruction specific rules for capturing various flags; each (?<name>...) group names the flag it captures.
+my $u32   = qr"(?<U32>\.U32)?";
+my $ftz   = qr"(?<FTZ>\.FTZ)?";
+my $sat   = qr"(?<SAT>\.SAT)?";
+my $rnd   = qr"(?:\.(?<rnd>RN|RM|RP|RZ))?";
+my $round = qr"(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?";
+my $fcmp  = qr"(?<cmp>\.LT|\.EQ|\.LE|\.GT|\.NE|\.GE|\.NUM|\.NAN|\.LTU|\.EQU|\.LEU|\.GTU|\.NEU|\.GEU|)";
+my $icmp  = qr"\.(?<cmp>LT|EQ|LE|GT|NE|GE)";
+my $bool  = qr"\.(?<bool>AND|OR|XOR|PASS_B)";
+my $bool2 = qr"\.(?<bool2>AND|OR|XOR)";
+my $func  = qr"\.(?<func>COS|SIN|EX2|LG2|RCP|RSQ|RCP64H|RSQ64H)";
+my $rro   = qr"\.(?<func>SINCOS|EX2)";
+my $add3  = qr"(?:\.(?<type>X|RS|LS))?";
+my $lopz  = qr"(?:\.(?<z>NZ|Z) $p48,|(?<noz>))"o;
+my $X     = qr"(?<X>\.X)?";
+my $tld   = qr"(?<NODEP>NODEP\.)?(?:(?<reuse1>T)|(?<reuse2>P))";
+my $chnls = qr"(?<chnls>R|RGBA)";
+my $sr    = qr"SR_(?<sr>\S+)";
+my $shf   = qr"(?<W>\.W)?(?:\.(?<type>U64|S64))?(?<HI>\.HI)?";
+my $xmad  = qr"(?:\.(?<type1>U16|S16))?(?:\.(?<type2>U16|S16))?(?:\.(?<mode>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?";
+my $xmadc = qr"(?:\.(?<type1>U16|S16))?(?:\.(?<type2>U16|S16))?(?:\.(?<modec>MRG|PSL|CHI|CLO|CSFU))?(?<CBCC>\.CBCC)?";
+my $vmad8 = qr"\.(?<sign1>[SU])(?<size1>8|16)\.(?<sign2>[SU])(?<size2>8|16)(?<PO>\.PO)?(?<SHR_7>\.SHR_7)?(?<SHR_15>\.SHR_15)?(?<SAT>\.SAT)?";
+my $vmad16= qr"\.(?<sign1>[SU])(?<size1>16)\.(?<sign2>[SU])(?<size2>16)";
+my $hilo  = qr"(?:\.(?<mode>XHI|XLO))?";
+my $vaddType = qr"(?:\.(?<UD>UD))?(?:\.(?<SD>SD))?(?:\.(?<sign1>[SU])(?<size1>8|16|32))?(?:\.(?<sign2>[SU])(?<size2>8|16|32))?";
+my $vaddMode = qr"(?:\.(?<mode>MRG_16[HL]|MRG_8B[0-3]|ACC|MIN|MAX))?";
+my $vmnmx = qr"(?:\.(?<MX>MX))?";
+my $x2x   = qr"\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)";
+my $prmt  = qr"(?:\.(?<mode>F4E|B4E|RC8|ECL|ECR|RC16))?";
+my $shfl  = qr"\.(?<mode>IDX|UP|DOWN|BFLY)";
+my $bar   = qr"\.(?<mode>SYNC|ARV|RED)(?:\.(?<red>POPC|AND|OR))? (?:$i8w4|$r8)(?:, (?:$i20w12|$r20))?(?(<r20>)|(?<nor20>))(?(<red>), $p39|(?<nop39>))"o;
+my $b2r   = qr"\.RESULT $r0(?:, $p45|(?<nop45>))"o;
+my $dbar  = qr"(?<SB>SB0|SB1|SB2|SB3|SB4|SB5)";
+my $dbar2 = qr" {(?<db5>5)?,?(?<db4>4)?,?(?<db3>3)?,?(?<db2>2)?,?(?<db1>1)?,?(?<db0>0)?}";
+my $mbar  = qr"\.(?<mode>CTA|GL|SYS)";
+my $addr  = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i20w24)?\]"o;
+my $addr2 = qr"\[(?:(?<r8>$reg)|(?<nor8>))(?:\s*\+?\s*$i28w20)?\]"o;
+my $ldc   = qr"c\[(?<c36>$hex)\]\s*$addr"o;
+my $atom  = qr"(?<E>\.E)?(?:\.(?<mode>ADD|MIN|MAX|INC|DEC|AND|OR|XOR|EXCH|CAS))(?<type>|\.S32|\.U64|\.F(?:16x2|32)\.FTZ\.RN|\.S64|\.64)";
+my $vote  = qr"\.(?<mode>ALL|ANY|EQ)"o;
+my $memType  = qr"(?<type>\.U8|\.S8|\.U16|\.S16||\.32|\.64|\.128)";
+my $memCache = qr"(?<E>\.E)?(?<U>\.U)?(?:\.(?<cache>CG|CI|CS|CV|IL|WT))?";
+
+
+
+# class: hardware resource that shares characteristics with types
+# lat  : pipeline depth where relevant, placeholder for memory ops
+# blat : barrier latency, typical fetch time for memory operations. Highly variable.
+# rlat : operand read latency for memory ops
+# rhold: clock cycles that a memory op typically holds onto a register before it's free to be written by another op.
+# tput : throughput, clock cycles an op takes when two ops of the same class are issued in succession.
+# dual : whether this instruction type can be dual issued
+# reuse: whether this instruction type accepts register reuse flags.
+
+# Some of these values are guesses and need to be updated from micro benchmarks.
+# We may need to split these classes up further.
+my $s2rT  = {class => 's2r',   lat => 2,   blat => 25,  rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+my $smemT = {class => 'mem',   lat => 2,   blat => 30,  rlat => 2, rhold => 20, tput => 1,   dual => 1, reuse => 0};
+my $gmemT = {class => 'mem',   lat => 2,   blat => 200, rlat => 4, rhold => 20, tput => 1,   dual => 1, reuse => 0};
+my $x32T  = {class => 'x32',   lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 1};
+my $x64T  = {class => 'x64',   lat => 2,   blat => 128, rlat => 0, rhold => 0,  tput => 128, dual => 0, reuse => 1};
+my $shftT = {class => 'shift', lat => 6,   blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1};
+my $cmpT  = {class => 'cmp',   lat => 13,  blat => 0,   rlat => 0, rhold => 0,  tput => 2,   dual => 0, reuse => 1};
+my $qtrT  = {class => 'qtr',   lat => 8,   blat => 0,   rlat => 4, rhold => 0,  tput => 1,   dual => 1, reuse => 0};
+my $rroT  = {class => 'rro',   lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+my $voteT = {class => 'vote',  lat => 2,   blat => 0,   rlat => 0, rhold => 0,  tput => 1,   dual => 0, reuse => 0};
+
+
+# Create map of op names to rules: each op maps to a list of forms, and each form has a type (timing record), a base opcode (code) and a rule (regex matched against the instruction text).
+our %grammar =
+(
+    #Floating Point Instructions
+    FADD     => [ { type => $x32T,  code => 0x5c58000000000000, rule => qr"^$pred?FADD$ftz$rnd$sat $r0, $r8, $fcr20;"o,               } ],
+    FADD32I  => [ { type => $x32T,  code => 0x0800000000000000, rule => qr"^$pred?FADD32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FCHK     => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?FCHK\.DIVIDE $p0, $r8, $r20;"o,                     } ], #Partial?
+    FCMP     => [ { type => $cmpT,  code => 0x5ba0000000000000, rule => qr"^$pred?FCMP$fcmp$ftz $r0, $r8, $fcr20, $r39;"o,            } ],
+    FFMA     => [
+                  { type => $x32T,  code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $fcr20, $r39;"o,         },
+                  { type => $x32T,  code => 0x5980000000000000, rule => qr"^$pred?FFMA$ftz$rnd$sat $r0, $r8, $r39s20, $c20s39;"o,     },
+                ],
+    FMNMX    => [ { type => $shftT, code => 0x5c60000000000000, rule => qr"^$pred?FMNMX$ftz $r0, $r8, $fcr20, $p39;"o,                } ],
+    FMUL     => [ { type => $x32T,  code => 0x5c68000000000000, rule => qr"^$pred?FMUL$ftz$rnd$sat $r0, $r8, $fcr20;"o,               } ],
+    FMUL32I  => [ { type => $x32T,  code => 0x1e00000000000000, rule => qr"^$pred?FMUL32I$ftz $r0, $r8, $f20w32;"o,                   } ],
+    FSET     => [ { type => $shftT, code => 0x5800000000000000, rule => qr"^$pred?FSET$fcmp$ftz$bool $r0, $r8, $fcr20, $p39;"o,       } ],
+    FSETP    => [ { type => $cmpT,  code => 0x5bb0000000000000, rule => qr"^$pred?FSETP$fcmp$ftz$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ],
+    MUFU     => [ { type => $qtrT,  code => 0x5080000000000000, rule => qr"^$pred?MUFU$func $r0, $r8;"o,                              } ],
+    RRO      => [ { type => $rroT,  code => 0x5c90000000000000, rule => qr"^$pred?RRO$rro $r0, $r20;"o,                               } ],
+    DADD     => [ { type => $x64T,  code => 0x5c70000000000000, rule => qr"^$pred?DADD$rnd $r0, $r8, $dr20;"o,                        } ],
+    DFMA     => [ { type => $x64T,  code => 0x5b70000000000000, rule => qr"^$pred?DFMA$rnd $r0, $r8, $dr20, $r39;"o,                  } ],
+    DMNMX    => [ { type => $cmpT,  code => 0x5c50000000000000, rule => qr"^$pred?DMNMX $r0, $r8, $dr20, $p39;"o,                     } ],
+    DMUL     => [ { type => $x64T,  code => 0x5c80000000000000, rule => qr"^$pred?DMUL$rnd $r0, $r8, $dr20;"o,                        } ],
+    DSET     => [ { type => $cmpT,  code => 0x5900000000000000, rule => qr"^$pred?DSET$fcmp$bool $r0, $r8, $dr20, $p39;"o,            } ],
+    DSETP    => [ { type => $cmpT,  code => 0x5b80000000000000, rule => qr"^$pred?DSETP$fcmp$bool $p3, $p0, $r8, $dr20, $p39;"o,      } ],
+    FSWZADD  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?FSWZADD[^;]*;"o,                                    } ], #TODO
+
+    HADD2     => [ { type => $x32T,  code => 0x5d10000000000000, rule => qr"^$pred?HADD2$ftz $r0, $r8, $r20;"o,               } ],
+    HMUL2     => [ { type => $x32T,  code => 0x5d08000000000000, rule => qr"^$pred?HMUL2$ftz $r0, $r8, $r20;"o,               } ],
+    HFMA2     => [ { type => $x32T,  code => 0x5d00000000000000, rule => qr"^$pred?HFMA2$ftz $r0, $r8, $r20, $r39;"o,         } ],
+    HSETP2    => [ { type => $cmpT,  code => 0x5d20000000000000, rule => qr"^$pred?HSETP2$fcmp$bool $p3, $p0, $r8, $fcr20, $p39;"o, } ], #Partial
+
+    #Integer Instructions
+    BFE       => [ { type => $shftT,  code => 0x5c01000000000000, rule => qr"^$pred?BFE$u32 $r0, $r8, $icr20;"o,                          } ],
+    BFI       => [ { type => $shftT,  code => 0x5bf0000000000000, rule => qr"^$pred?BFI $r0, $r8, $ir20, $cr39;"o,                        } ],
+    FLO       => [ { type => $s2rT,   code => 0x5c30000000000000, rule => qr"^$pred?FLO\.U32 $r0, $icr20;"o,                              } ],
+    IADD      => [ { type => $x32T,   code => 0x5c10000000000000, rule => qr"^$pred?IADD$sat$X $r0cc, $r8, $icr20;"o,                         } ],
+    IADD32I   => [ { type => $x32T,   code => 0x1c00000000000000, rule => qr"^$pred?IADD32I$X $r0cc, $r8, $i20w32;"o,                         } ],
+    IADD3     => [ { type => $x32T,   code => 0x5cc0000000000000, rule => qr"^$pred?IADD3$add3 $r0cc, $r8, $icr20, $r39;"o,                 } ],
+    ICMP      => [ { type => $cmpT,   code => 0x5b41000000000000, rule => qr"^$pred?ICMP$icmp$u32 $r0, $r8, $icr20, $r39;"o,              } ],
+    IMNMX     => [ { type => $shftT,  code => 0x5c21000000000000, rule => qr"^$pred?IMNMX$u32$hilo $r0cc, $r8, $icr20, $p39;"o,                  } ],
+    ISET      => [ { type => $shftT,  code => 0x5b51000000000000, rule => qr"^$pred?ISET$icmp$u32$X$bool $r0, $r8, $icr20, $p39;"o,       } ],
+    ISETP     => [ { type => $cmpT,   code => 0x5b61000000000000, rule => qr"^$pred?ISETP$icmp$u32$X$bool $p3, $p0, $r8, $icr20, $p39;"o, } ],
+    ISCADD    => [ { type => $shftT,  code => 0x5c18000000000000, rule => qr"^$pred?ISCADD $r0, $r8, $icr20, $i39w8;"o,                   } ],
+    ISCADD32I => [ { type => $shftT,  code => 0x1400000000000000, rule => qr"^$pred?ISCADD32I $r0, $r8, $i20w32, $i53w5;"o,               } ],
+    LEA       => [
+                   { type => $cmpT,   code => 0x5bd0000000000000, rule => qr"^$pred?LEA $p48, $r0cc, $r8, $icr20;"o,                      },
+                   { type => $shftT,  code => 0x5bd7000000000000, rule => qr"^$pred?LEA $r0cc, $r8, $icr20, $i39w8;"o,                    },
+                   { type => $shftT,  code => 0x5bdf004000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $r20, $r39, $i28w8;"o,          },
+                   { type => $shftT,  code => 0x0a07000000000000, rule => qr"^$pred?LEA\.HI$X $r0cc, $r8, $c20, $r39, $i51w5;"o,          },
+                 ],
+    LOP       => [ { type => $x32T,   code => 0x5c40000000000000, rule => qr"^$pred?LOP$bool$lopz $r0, $r8, (?<INV>~)?$icr20(?<INV>\.INV)?;"o, } ],
+    LOP32I    => [ { type => $x32T,   code => 0x0400000000000000, rule => qr"^$pred?LOP32I$bool $r0, $r8, $i20w32;"o,                     } ],
+    LOP3      => [
+                   { type => $x32T,   code => 0x5be7000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $r20, $r39, $i28w8;"o,            },
+                   { type => $x32T,   code => 0x3c00000000000000, rule => qr"^$pred?LOP3\.LUT $r0, $r8, $i20, $r39, $i48w8;"o,            },
+                 ],
+    POPC      => [ { type => $s2rT,   code => 0x5c08000000000000, rule => qr"^$pred?POPC $r0, $r20;"o,                                    } ],
+    SHF       => [
+                   { type => $shftT,  code => 0x5bf8000000000000, rule => qr"^$pred?SHF\.L$shf $r0, $r8, $ir20, $r39;"o,                  },
+                   { type => $shftT,  code => 0x5cf8000000000000, rule => qr"^$pred?SHF\.R$shf $r0, $r8, $ir20, $r39;"o,                  },
+                 ],
+    SHL       => [ { type => $shftT,  code => 0x5c48000000000000, rule => qr"^$pred?SHL(?<W>\.W)? $r0, $r8, $icr20;"o,                    } ],
+    SHR       => [ { type => $shftT,  code => 0x5c29000000000000, rule => qr"^$pred?SHR$u32 $r0, $r8, $icr20;"o,                          } ],
+    XMAD      => [
+                   { type => $x32T,   code => 0x5b00000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $ir20, $r39;"o,                 },
+                   { type => $x32T,   code => 0x5900000000000000, rule => qr"^$pred?XMAD$xmad $r0cc, $r8, $r39s20, $c20s39;"o,            },
+                   { type => $x32T,   code => 0x5e00000000000000, rule => qr"^$pred?XMAD$xmadc $r0cc, $r8, $c20x, $r39;"o,                  },
+                 ],
+    # XMAD replaces these
+    IMAD      => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMAD[^;]*;"o,   } ], #TODO
+    IMADSP    => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMADSP[^;]*;"o, } ], #TODO
+    IMUL      => [ { type => $x32T,   code => 0x0000000000000000, rule => qr"^$pred?IMUL[^;]*;"o,   } ], #TODO
+
+    #Conversion Instructions
+    F2F => [ { type => $qtrT,  code => 0x5ca8000000000000, rule => qr"^$pred?F2F$ftz$x2x$rnd$round$sat $r0, $cr20;"o, } ],
+    F2I => [ { type => $qtrT,  code => 0x5cb0000000000000, rule => qr"^$pred?F2I$ftz$x2x$round $r0, $cr20;"o,         } ],
+    I2F => [ { type => $qtrT,  code => 0x5cb8000000000000, rule => qr"^$pred?I2F$x2x$rnd $r0, $cr20;"o,               } ],
+    I2I => [ { type => $qtrT,  code => 0x5ce0000000000000, rule => qr"^$pred?I2I$x2x$sat $r0, $cr20;"o,               } ],
+
+    #Movement Instructions
+    MOV    => [ { type => $x32T,  code => 0x5c98078000000000, rule => qr"^$pred?MOV $r0, $icr20;"o,                   } ],
+    MOV32I => [ { type => $x32T,  code => 0x010000000000f000, rule => qr"^$pred?MOV32I $r0, (?:$i20w32|$f20w32);"o,   } ],
+    PRMT   => [ { type => $x32T,  code => 0x5bc0000000000000, rule => qr"^$pred?PRMT$prmt $r0, $r8, $icr20, $cr39;"o, } ],
+    SEL    => [ { type => $x32T,  code => 0x5ca0000000000000, rule => qr"^$pred?SEL $r0, $r8, $icr20, $p39;"o,        } ],
+    SHFL   => [ { type => $smemT, code => 0xef10000000000000, rule => qr"^$pred?SHFL$shfl $p48, $r0, $r8, (?:$i20w8|$r20), (?:$i34w13|$r39);"o, } ],
+
+    #Predicate/CC Instructions
+    PSET   => [ { type => $cmpT,  code => 0x5088000000000000, rule => qr"^$pred?PSET$bool2$bool $r0, $p12, $p29, $p39;"o,       } ],
+    PSETP  => [ { type => $cmpT,  code => 0x5090000000000000, rule => qr"^$pred?PSETP$bool2$bool $p3, $p0, $p12, $p29, $p39;"o, } ],
+    CSET   => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSET[^;]*;"o,  } ], #TODO
+    CSETP  => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?CSETP[^;]*;"o, } ], #TODO
+    P2R    => [ { type => $x32T,  code => 0x38e8000000000000, rule => qr"^$pred?P2R $r0, PR, $r8, $i20w7;"o,   } ],
+    R2P    => [ { type => $cmpT,  code => 0x38f0000000000000, rule => qr"^$pred?R2P PR, $r8, $i20w7;"o,   } ],
+
+    #Texture Instructions
+    # Handle the commonly used 1D texture functions.. but save the others for later
+    TLD    => [ { type => $gmemT, code => 0xdd38000000000000, rule => qr"^$pred?TLD\.B\.LZ\.$tld $r0, $r8, $r20, $hex, \dD, $i31w4;"o, } ], #Partial
+    TLDS   => [ { type => $gmemT, code => 0xda0000000ff00000, rule => qr"^$pred?TLDS\.LZ\.$tld $r28, $r0, $r8, $i36w20, \dD, $chnls;"o,} ], #Partial
+    TEX    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEX[^;]*;"o,   } ], #TODO
+    TLD4   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4[^;]*;"o,  } ], #TODO
+    TXQ    => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TXQ[^;]*;"o,   } ], #TODO
+    TEXS   => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TEXS[^;]*;"o,  } ], #TODO
+    TLD4S  => [ { type => $gmemT, code => 0x0000000000000000, rule => qr"^$pred?TLD4S[^;]*;"o, } ], #TODO
+
+    #Compute Load/Store Instructions
+    LD     => [ { type => $gmemT, code => 0x8000000000000000, rule => qr"^$pred?LD$memCache$memType $r0, $addr, $p58;"o,      } ],
+    ST     => [ { type => $gmemT, code => 0xa000000000000000, rule => qr"^$pred?ST$memCache$memType $addr, $r0, $p58;"o,      } ],
+    LDG    => [ { type => $gmemT, code => 0xeed0000000000000, rule => qr"^$pred?LDG$memCache$memType $r0, $addr;"o,           } ],
+    STG    => [ { type => $gmemT, code => 0xeed8000000000000, rule => qr"^$pred?STG$memCache$memType $addr, $r0;"o,           } ],
+    LDS    => [ { type => $smemT, code => 0xef48000000000000, rule => qr"^$pred?LDS$memCache$memType $r0, $addr;"o,           } ],
+    STS    => [ { type => $smemT, code => 0xef58000000000000, rule => qr"^$pred?STS$memCache$memType $addr, $r0;"o,           } ],
+    LDL    => [ { type => $gmemT, code => 0xef40000000000000, rule => qr"^$pred?LDL$memCache$memType $r0, $addr;"o,           } ],
+    STL    => [ { type => $gmemT, code => 0xef50000000000000, rule => qr"^$pred?STL$memCache$memType $addr, $r0;"o,           } ],
+    LDC    => [ { type => $gmemT, code => 0xef90000000000000, rule => qr"^$pred?LDC$memCache$memType $r0, $ldc;"o,            } ],
+    # Note for ATOM(S).CAS operations the last register needs to be in sequence with the second to last (as it's not encoded).
+    ATOM   => [ { type => $gmemT, code => 0xed00000000000000, rule => qr"^$pred?ATOM$atom $r0, $addr2, $r20(?:, $r39a)?;"o,   } ],
+    ATOMS  => [ { type => $smemT, code => 0xec00000000000000, rule => qr"^$pred?ATOMS$atom $r0, $addr2, $r20(?:, $r39a)?;"o,  } ],
+    RED    => [ { type => $gmemT, code => 0xebf8000000000000, rule => qr"^$pred?RED$atom $addr2, $r0;"o,                      } ],
+    CCTL   => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTL[^;]*;"o,  } ], #TODO
+    CCTLL  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLL[^;]*;"o, } ], #TODO
+    CCTLT  => [ { type => $x32T,  code => 0x5c88000000000000, rule => qr"^$pred?CCTLT[^;]*;"o, } ], #TODO
+
+    #Surface Memory Instructions (haven't gotten to these yet..)
+    SUATOM => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUATOM[^;]*;"o, } ], #TODO
+    SULD   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SULD[^;]*;"o,   } ], #TODO
+    SURED  => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SURED[^;]*;"o,  } ], #TODO
+    SUST   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?SUST[^;]*;"o,   } ], #TODO
+
+    #Control Instructions
+    BRA    => [
+                { type => $x32T, code => 0xe24000000000000f, rule => qr"^$pred?BRA(?<U>\.U)? $i20w24;"o,         },
+                { type => $x32T, code => 0xe240000000000002, rule => qr"^$pred?BRA(?<U>\.U)? CC\.EQ, $i20w24;"o, },
+              ],
+    BRX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?BRX[^;]*;"o,                      } ], #TODO
+    JMP    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMP[^;]*;"o,                      } ], #TODO
+    JMX    => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?JMX[^;]*;"o,                      } ], #TODO
+    SSY    => [ { type => $x32T, code => 0xe290000000000000, rule => qr"^$noPred?SSY $i20w24;"o,                 } ],
+    SYNC   => [ { type => $x32T, code => 0xf0f800000000000f, rule => qr"^$pred?SYNC;"o,                          } ],
+    CAL    => [ { type => $x32T, code => 0xe260000000000040, rule => qr"^$noPred?CAL $i20w24;"o,                 } ],
+    JCAL   => [ { type => $x32T, code => 0xe220000000000040, rule => qr"^$noPred?JCAL $i20w24;"o,                } ],
+    PRET   => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PRET[^;]*;"o,                     } ], #TODO
+    RET    => [ { type => $x32T, code => 0xe32000000000000f, rule => qr"^$pred?RET;"o,                           } ],
+    BRK    => [ { type => $x32T, code => 0xe34000000000000f, rule => qr"^$pred?BRK;"o,                           } ],
+    PBK    => [ { type => $x32T, code => 0xe2a0000000000000, rule => qr"^$noPred?PBK $i20w24;"o,                 } ],
+    CONT   => [ { type => $x32T, code => 0xe35000000000000f, rule => qr"^$pred?CONT;"o,                          } ],
+    PCNT   => [ { type => $x32T, code => 0xe2b0000000000000, rule => qr"^$noPred?PCNT $i20w24;"o,                } ],
+    EXIT   => [ { type => $x32T, code => 0xe30000000000000f, rule => qr"^$pred?EXIT;"o,                          } ],
+    PEXIT  => [ { type => $x32T, code => 0x0000000000000000, rule => qr"^$pred?PEXIT[^;]*;"o,                    } ], #TODO
+    BPT    => [ { type => $x32T, code => 0xe3a00000000000c0, rule => qr"^$noPred?BPT\.TRAP $i20w24;"o,           } ],
+
+    #Miscellaneous Instructions
+    NOP    => [ { type => $x32T,  code => 0x50b0000000000f00, rule => qr"^$pred?NOP;"o,                                     } ],
+    CS2R   => [ { type => $x32T,  code => 0x50c8000000000000, rule => qr"^$pred?CS2R $r0, $sr;"o,                           } ],
+    S2R    => [ { type => $s2rT,  code => 0xf0c8000000000000, rule => qr"^$pred?S2R $r0, $sr;"o,                            } ],
+    B2R    => [ { type => $x32T,  code => 0xf0b800010000ff00, rule => qr"^$pred?B2R$b2r;"o,                                 } ],
+    BAR    => [ { type => $gmemT, code => 0xf0a8000000000000, rule => qr"^$pred?BAR$bar;"o,                                 } ],
+    DEPBAR => [
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$icmp $dbar, $i20w6;"o, },
+                { type => $gmemT, code => 0xf0f0000000000000, rule => qr"^$pred?DEPBAR$dbar2;"o,              },
+              ],
+    MEMBAR => [ { type => $x32T,  code => 0xef98000000000000, rule => qr"^$pred?MEMBAR$mbar;"o,                             } ],
+    VOTE   => [ { type => $voteT, code => 0x50d8000000000000, rule => qr"^$pred?VOTE$vote (?:$r0, |(?<nor0>))$p45, $p39;"o, } ],
+    R2B    => [ { type => $x32T,  code => 0x0000000000000000, rule => qr"^$pred?R2B[^;]*;"o,                                } ], #TODO
+
+    #Video Instructions... Need to finish
+    VADD   => [   { type => $shftT, code => 0x2044000000000000, rule => qr"^$pred?VADD$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMAD   => [
+                  { type => $x32T,  code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad16 $r0, $r8, $r20, $r39;"o, },
+                  { type => $shftT, code => 0x5f04000000000000, rule => qr"^$pred?VMAD$vmad8 $r0, $r8, $r20, $r39;"o, },
+              ],
+    VABSDIFF => [ { type => $shftT, code => 0x5427000000000000, rule => qr"^$pred?VABSDIFF$vaddType$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+    VMNMX    => [ { type => $shftT, code => 0x3a44000000000000, rule => qr"^$pred?VMNMX$vaddType$vmnmx$sat$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+
+    VSET => [ { type => $shftT, code => 0x4004000000000000, rule => qr"^$pred?VSET$icmp$vaddType$vaddMode $r0, $r8, $r20, $r39;"o, } ], #Partial 0x2044000000000000
+);
+
+# Create map of capture groups to op code flags that need to be added (or removed). Line format: a header "OP1, OP2[: capture name]" followed by "0xBITS value" lines; blank lines are dropped by the grep.
+my @flags = grep /\S/, split "\n", q{;
+
+BFE, BFI, FLO, IADD, IADD3, ICMP, IMNMX, ISCADD, ISET, ISETP, LEA, LOP, LOP3, MOV, PRMT, SEL, SHF, SHL, SHR, XMAD
+0x0100000000000000 neg
+
+FADD, FCMP, FFMA, FMNMX, FMUL, FSET, FSETP, DADD, DFMA, DMNMX, DMUL, DSET, DSETP
+0x0100000000000000 neg
+
+PSET, PSETP
+0x0000000000008000 p12not
+0x0000000100000000 p29not
+
+FMNMX, FSET, FSETP, DMNMX, DSET, DSETP, IMNMX, ISET, ISETP, SEL, PSET, PSETP, BAR, VOTE
+0x0000040000000000 p39not
+
+IADD, IADD3, XMAD, LEA, IMNMX
+0x0000800000000000 CC
+
+IADD32I
+0x0010000000000000 CC
+
+LEA
+0x0000000000000000 X
+
+SHF
+0x0004000000000000 W
+0x0001000000000000 HI
+
+SHF: type
+0x0000004000000000 U64
+0x0000006000000000 S64
+
+SHR, IMNMX, ISETP, ISET, ICMP, BFE
+0x0001000000000000 U32
+
+SHL
+0x0000008000000000 W
+
+SHFL
+0x0000000010000000 i20w8
+0x0000000020000000 i34w13
+
+SHFL: mode
+0x0000000000000000 IDX
+0x0000000040000000 UP
+0x0000000080000000 DOWN
+0x00000000c0000000 BFLY
+
+IMNMX: mode
+0x0000080000000000 XLO
+0x0000180000000000 XHI
+
+ISETP, ISET, ICMP: cmp
+0x0002000000000000 LT
+0x0004000000000000 EQ
+0x0006000000000000 LE
+0x0008000000000000 GT
+0x000a000000000000 NE
+0x000c000000000000 GE
+
+ISETP, ISET, PSETP, PSET: bool
+0x0000000000000000 AND
+0x0000200000000000 OR
+0x0000400000000000 XOR
+
+PSETP, PSET: bool2
+0x0000000000000000 AND
+0x0000000001000000 OR
+0x0000000002000000 XOR
+
+ISETP, ISET
+0x0000080000000000 X
+
+LOP: bool
+0x0000000000000000 AND
+0x0000020000000000 OR
+0x0000040000000000 XOR
+0x0000060000000000 PASS_B
+
+LOP:
+0x0000010000000000 INV
+
+LOP: z
+0x0000200000000000 Z
+0x0000300000000000 NZ
+
+LOP
+0x0007000000000000 noz
+
+LOP32I: bool
+0x0000000000000000 AND
+0x0020000000000000 OR
+0x0040000000000000 XOR
+
+PRMT: mode
+0x0001000000000000 F4E
+0x0002000000000000 B4E
+0x0003000000000000 RC8
+0x0004000000000000 ECL
+0x0005000000000000 ECR
+0x0006000000000000 RC16
+
+XMAD: type1
+0x0000000000000000 U16
+0x0001000000000000 S16
+
+XMAD: type2
+0x0000000000000000 U16
+0x0002000000000000 S16
+
+XMAD: mode
+0x0000002000000000 MRG
+0x0000001000000000 PSL
+0x0008000000000000 CHI
+0x0004000000000000 CLO
+0x000c000000000000 CSFU
+
+XMAD: modec
+0x0004000000000000 CLO
+0x0008000000000000 CHI
+0x000c000000000000 CSFU
+0x0040000000000000 X
+0x0080000000000000 PSL
+0x0100000000000000 MRG
+
+XMAD
+0x0010000000000000 CBCC
+
+XMAD: r8part
+0x0000000000000000 H0
+0x0020000000000000 H1
+
+XMAD: r20part
+0x0000000000000000 H0
+0x0000000800000000 H1
+
+XMAD: r20partx
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+XMAD: r39part
+0x0000000000000000 H0
+0x0010000000000000 H1
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r8part
+0x0000000000000000 B0
+0x0000001000000000 B1
+0x0000002000000000 B2
+0x0000003000000000 B3
+0x0000001000000000 H1
+0x0000000000000000 H0
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: r20part
+0x0000000000000000 B0
+0x0000000010000000 B1
+0x0000000020000000 B2
+0x0000000030000000 B3
+0x0000000010000000 H1
+0x0000000000000000 H0
+
+VMAD
+0x0040000000000000 r8neg
+0x0020000000000000 r39neg
+0x0008000000000000 SHR_7
+0x0010000000000000 SHR_15
+0x0060000000000000 PO
+0x0080000000000000 SAT
+
+VMNMX
+0x0100000000000000 MX
+
+VADD, VABSDIFF, VMNMX
+0x0080000000000000 SAT
+0x0040000000000000 UD
+0x0040000000000000 SD
+
+VSET: cmp
+0x0040000000000000 LT
+0x0080000000000000 EQ
+0x00c0000000000000 LE
+0x0100000000000000 GT
+0x0140000000000000 NE
+0x0180000000000000 GE
+
+VADD, VSET: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VABSDIFF: mode
+0x0003000000000000 ACC
+0x000b000000000000 MIN
+0x0013000000000000 MAX
+0x0023000000000000 MRG_16H
+0x002b000000000000 MRG_16L
+0x0033000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x003b000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMNMX: mode
+0x0020000000000000 ACC
+0x0028000000000000 MIN
+0x0030000000000000 MAX
+0x0000000000000000 MRG_16H
+0x0008000000000000 MRG_16L
+0x0010000000000000 MRG_8B0
+0x0000000000000000 MRG_8B1
+0x0018000000000000 MRG_8B2
+0x0000000000000000 MRG_8B3
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign1
+0x0000000000000000 U
+0x0001000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: sign2
+0x0000000000000000 U
+0x0002000000000000 S
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size1
+0x0000000000000000 8
+0x0000004000000000 16
+0x0000006000000000 32
+
+VMAD, VADD, VABSDIFF, VMNMX, VSET: size2
+0x0000000000000000 8
+0x0000000040000000 16
+0x0000000060000000 32
+
+IADD3: type
+0x0001000000000000 X
+0x0000002000000000 RS
+0x0000004000000000 LS
+
+IADD3: r8part
+0x0000000000000000 H0
+0x0000001000000000 H1
+
+IADD3: r20part
+0x0000000080000000 H0
+
+IADD3: r39part
+0x0000000200000000 H0
+
+IADD3
+0x0008000000000000 r8neg
+0x0004000000000000 r20neg
+0x0002000000000000 r39neg
+
+IADD
+0x0000080000000000 X
+0x0004000000000000 SAT
+
+IADD, ISCADD
+0x0002000000000000 r8neg
+0x0001000000000000 r20neg
+
+IADD32I
+0x0100000000000000 r8neg
+0x0020000000000000 X
+
+DEPBAR: SB
+0x0000000000000000 SB0
+0x0000000004000000 SB1
+0x0000000008000000 SB2
+0x000000000c000000 SB3
+0x0000000010000000 SB4
+0x0000000014000000 SB5
+
+DEPBAR: cmp
+0x0000000020000000 LE
+
+DEPBAR
+0x0000000000000001 db0
+0x0000000000000002 db1
+0x0000000000000004 db2
+0x0000000000000008 db3
+0x0000000000000010 db4
+0x0000000000000020 db5
+
+F2F, F2I, I2F, I2I: destWidth
+0x0000000000000000 8
+0x0000000000000100 16
+0x0000000000000200 32
+0x0000000000000300 64
+
+F2F, F2I, I2F, I2I: srcWidth
+0x0000000000000000 8
+0x0000000000000400 16
+0x0000000000000800 32
+0x0000000000000c00 64
+
+F2F, F2I, I2F, I2I: destSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000001000 S
+
+F2F, F2I, I2F, I2I: srcSign
+0x0000000000000000 F
+0x0000000000000000 U
+0x0000000000002000 S
+
+F2I, I2F, I2I: r20part
+0x0000000000000000 H0
+0x0000040000000000 H1
+0x0000000000000000 B0
+0x0000020000000000 B1
+0x0000040000000000 B2
+0x0000060000000000 B3
+
+F2F: r20part
+0x0000000000000000 H0
+0x0000020000000000 H1
+
+F2F: round
+0x0000040000000000 ROUND
+0x0000048000000000 FLOOR
+0x0000050000000000 CEIL
+0x0000058000000000 TRUNC
+
+F2I: round
+0x0000000000000000 ROUND
+0x0000008000000000 FLOOR
+0x0000010000000000 CEIL
+0x0000018000000000 TRUNC
+
+HADD2, HMUL2: r8part
+0x0001000000000000 H0_H0
+0x0000000000000000 H1_H1
+
+HFMA2: r20part
+0x0000000020000000 H0_H0
+0x0000000030000000 H1_H1
+
+FADD, DADD, FMUL, DMUL, F2F, I2F: rnd
+0x0000000000000000 RN
+0x0000008000000000 RM
+0x0000010000000000 RP
+0x0000018000000000 RZ
+
+DFMA: rnd
+0x0000000000000000 RN
+0x0004000000000000 RM
+0x0008000000000000 RP
+0x000c000000000000 RZ
+
+FFMA: rnd
+0x0000000000000000 RN
+0x0008000000000000 RM
+0x0010000000000000 RP
+0x0018000000000000 RZ
+
+FFMA
+0x0020000000000000 FTZ
+
+F2F, F2I, FADD, FMUL, FMNMX
+0x0000100000000000 FTZ
+
+FADD32I
+0x0080000000000000 FTZ
+
+FMUL32I
+0x0020000000000000 FTZ
+
+FSET
+0x0080000000000000 FTZ
+
+FSETP, FCMP
+0x0000800000000000 FTZ
+
+HADD2, HMUL2
+0x0000008000000000 FTZ
+
+HFMA2
+0x0000002000000000 FTZ
+
+FADD, FFMA, FMUL, F2F, I2I
+0x0004000000000000 SAT
+
+FADD, DADD, FMNMX, DMNMX, MUFU
+0x0001000000000000 r8neg
+
+FADD, DADD, FMNMX, DMNMX, RRO, F2F, F2I, I2F, I2I
+0x0000200000000000 r20neg
+
+FMUL, DMUL, FFMA, DFMA
+0x0001000000000000 r20neg
+
+FFMA, DFMA
+0x0002000000000000 r39neg
+
+FADD, DADD, FMNMX, DMNMX
+0x0000400000000000 r8abs
+
+FADD, DADD, FMNMX, DMNMX, F2F, F2I, I2F, I2I
+0x0002000000000000 r20abs
+
+FSETP, DSETP, FSET, DSET
+0x0000080000000000 r8neg
+0x0000000000000040 r20neg
+0x0000000000000080 r8abs
+0x0000100000000000 r20abs
+
+RRO: func
+0x0000000000000000 SINCOS
+0x0000008000000000 EX2
+
+MUFU: func
+0x0000000000000000 COS
+0x0000000000100000 SIN
+0x0000000000200000 EX2
+0x0000000000300000 LG2
+0x0000000000400000 RCP
+0x0000000000500000 RSQ
+0x0000000000600000 RCP64H
+0x0000000000700000 RSQ64H
+
+FSETP, DSETP, FSET, DSET, FCMP: cmp
+0x0001000000000000 .LT
+0x0002000000000000 .EQ
+0x0003000000000000 .LE
+0x0004000000000000 .GT
+0x0004000000000000
+0x0005000000000000 .NE
+0x0006000000000000 .GE
+0x0007000000000000 .NUM
+0x0008000000000000 .NAN
+0x0009000000000000 .LTU
+0x000a000000000000 .EQU
+0x000b000000000000 .LEU
+0x000c000000000000 .GTU
+0x000d000000000000 .NEU
+0x000e000000000000 .GEU
+
+FSETP, DSETP, FSET, DSET: bool
+0x0000000000000000 AND
+0x0000200000000000 OR
+0x0000400000000000 XOR
+
+HSETP2: cmp
+0x0000002800000000 .NE
+
+HSETP2: bool
+0x0000000000000000 AND
+
+S2R: sr
+0x0000000000000000 LANEID
+0x0000000000200000 VIRTCFG
+0x0000000000300000 VIRTID
+0x0000000002100000 TID.X
+0x0000000002200000 TID.Y
+0x0000000002300000 TID.Z
+0x0000000002500000 CTAID.X
+0x0000000002600000 CTAID.Y
+0x0000000002700000 CTAID.Z
+0x0000000003800000 EQMASK
+0x0000000003900000 LTMASK
+0x0000000003a00000 LEMASK
+0x0000000003b00000 GTMASK
+0x0000000003c00000 GEMASK
+
+CS2R: sr
+0x0000000005000000 CLOCKLO
+0x0000000005100000 CLOCKHI
+0x0000000005200000 GLOBALTIMERLO
+0x0000000005300000 GLOBALTIMERHI
+
+B2R
+0x0000e00000000000 nop45
+
+BAR
+0x0000100000000000 i8w4
+0x0000080000000000 nor20
+0x0000038000000000 nop39
+
+BAR: mode
+0x0000000000000000 SYNC
+0x0000000100000000 ARV
+0x0000000200000000 RED
+
+BAR: red
+0x0000000000000000 POPC
+0x0000000800000000 AND
+0x0000001000000000 OR
+
+MEMBAR: mode
+0x0000000000000000 CTA
+0x0000000000000100 GL
+0x0000000000000200 SYS
+
+VOTE: mode
+0x0000000000000000 ALL
+0x0001000000000000 ANY
+0x0002000000000000 EQ
+
+VOTE
+0x00000000000000ff nor0
+
+BRA
+0x0000000000000080 U
+
+TLDS: chnls
+0x0010000000000000 RGBA
+
+TLDS
+0x0002000000000000 NODEP
+
+LD, ST, LDG, STG, LDS, STS, LDL, STL, LDC, RED, ATOM, ATOMS
+0x000000000000ff00 nor8
+
+LD, ST: type
+0x0000000000000000 .U8
+0x0020000000000000 .S8
+0x0040000000000000 .U16
+0x0060000000000000 .S16
+0x0080000000000000
+0x0080000000000000 .32
+0x00a0000000000000 .64
+0x00c0000000000000 .128
+
+LD, ST: cache
+0x0100000000000000 CG
+0x0200000000000000 CS
+0x0300000000000000 CV
+0x0300000000000000 WT
+
+LDG, STG, LDS, STS, LDL, STL, LDC: type
+0x0000000000000000 .U8
+0x0001000000000000 .S8
+0x0002000000000000 .U16
+0x0003000000000000 .S16
+0x0004000000000000
+0x0004000000000000 .32
+0x0005000000000000 .64
+0x0006000000000000 .128
+
+LDG, STG: cache
+0x0000400000000000 CG
+0x0000800000000000 CI
+0x0000800000000000 CS
+0x0000c00000000000 CV
+0x0000c00000000000 WT
+
+LDL: cache
+0x0000200000000000 CI
+
+LDC: cache
+0x0000100000000000 IL
+
+LDG, STG, LDS, STS, LDL, STL, LDC
+0x0000200000000000 E
+
+LDS
+0x0000100000000000 U
+
+RED: type
+0x0000000000000000
+0x0000000000100000 .S32
+0x0000000000200000 .U64
+0x0000000000300000 .F32.FTZ.RN
+0x0000000000400000 .F16x2.FTZ.RN
+0x0000000000500000 .S64
+
+RED: mode
+0x0000000000000000 ADD
+0x0000000000800000 MIN
+0x0000000001000000 MAX
+0x0000000001800000 INC
+0x0000000002000000 DEC
+0x0000000002800000 AND
+0x0000000003000000 OR
+0x0000000003800000 XOR
+
+ATOM: type
+0x0000000000000000
+0x0002000000000000 .S32
+0x0004000000000000 .U64
+0x0006000000000000 .F32.FTZ.RN
+0x0008000000000000 .F16x2.FTZ.RN
+0x000a000000000000 .S64
+0x0002000000000000 .64
+
+ATOM, RED
+0x0001000000000000 E
+
+ATOM: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x03f0000000000000 CAS
+
+ATOMS: type
+0x0000000000000000
+0x0000000010000000 .S32
+0x0000000020000000 .U64
+0x0000000030000000 .S64
+0x0010000000000000 .64
+
+ATOMS: mode
+0x0000000000000000 ADD
+0x0010000000000000 MIN
+0x0020000000000000 MAX
+0x0030000000000000 INC
+0x0040000000000000 DEC
+0x0050000000000000 AND
+0x0060000000000000 OR
+0x0070000000000000 XOR
+0x0080000000000000 EXCH
+0x0240000000000000 CAS
+};
+
+# Build %flags from the @flags lines. A header line ("OP1, OP2" or "OP1, OP2: group") selects the ops that the value lines below it apply to.
+# Each value line is "0xBITS name": with a group it is stored as op => group => name, without one as op => name (a plain existence flag).
+our %flags;
+my (@ops, $flag);
+for my $entry (@flags)
+{
+    my ($hexVal, $name) = $entry =~ m'^(0x[0-9a-z]+)\s*(.*)';
+    if (!defined $hexVal)
+    {
+        # header line: remember the op list and the optional group name for the value lines that follow
+        my ($opList, $group) = split ':\s*', $entry;
+        @ops  = split ',\s*', $opList;
+        $flag = $group;
+        next;
+    }
+    # value line: record the op code adjustment for every op named by the most recent header
+    my $bits = hex($hexVal);
+    foreach my $op (@ops)
+    {
+        if ($flag) { $flags{$op}{$flag}{$name} = $bits; }
+        else       { $flags{$op}{$name}        = $bits; }
+    }
+}
+
+# Match the instruction text against $grammar->{rule}: returns a hashref copy of the named captures (%+) on a match, nothing otherwise.
+sub parseInstruct
+{
+    my ($text, $entry) = @_;
+    return unless $text =~ $entry->{rule};
+    return { %+ };
+}
+
+# Tables consulted while encoding operands.
+# For immediate or constant operands genCode XORs the mask found here, shifted left by 56 bits, into the base code.
+# %immedOps   : capture names that mark an immediate operand.
+# %immedCodes : top byte of the base opcode (code >> 56) => mask for the immediate form of that op.
+# %constCodes : constant operand capture name => mask for the constant form.
+# %reuseCodes : register reuse capture name => bit contributed to the reuse code.
+my %immedOps   = (i20 => 1, f20 => 1, d20 => 1);
+my %immedCodes = (0x5c => 0x64, 0x5b => 0x6d, 0x59 => 0x6b, 0x58 => 0x68);
+my %constCodes = (c20 => 0x10, c39 => 0x08);
+my %reuseCodes =
+(
+    reuse1 => 1,
+    reuse2 => 2,
+    reuse3 => 4,
+);
+
+# OR together the %reuseCodes bits of every reuse flag that holds a true value in the capture data; nothing else is looked at.
+sub genReuseCode
+{
+    my ($captures) = @_;
+    my $bits = 0;
+    for my $name (keys %reuseCodes) { $bits |= $reuseCodes{$name} if $captures->{$name}; }
+    return $bits;
+}
+
+# Generate an op code from regex capture data
+# if you pass in a test array ref it will populate it with the matching capture groups
+#
+# Params:
+#   $op      - op name; selects the flag table $flags{$op}
+#   $grammar - grammar entry; $grammar->{code} is the base code that gets adjusted
+#   $capData - hash ref of captures. NOTE: modified in place; noPred, predNum, predNot
+#              and reuse1..reuse3 are deleted from it as they are consumed
+#   $test    - optional array ref; receives the names of the captures that were acted upon
+# Returns the list ($code, $reuse).
+sub genCode
+{
+    my ($op, $grammar, $capData, $test) = @_;
+
+    my $flags     = $flags{$op};
+    my $code      = $grammar->{code};
+    my $reuse     = 0;
+    # undef when the top byte of the base code has no entry in %immedCodes
+    my $immedCode = $immedCodes{$code >> 56};
+
+    #print map "$_: $capData->{$_}\n", keys %capData if $op eq 'I2I';
+
+    # process the instruction predicate (if valid for this instruction)
+    if (exists $capData->{noPred})
+    {
+        delete $capData->{noPred};
+        push @$test, 'noPred' if $test;
+    }
+    else
+    {
+        # predicate number defaults to 7 when none was captured
+        my $p = defined($capData->{predNum}) ? $capData->{predNum} : 7;
+        # 'predNum' is recorded for $test whether or not it was actually captured
+        push @$test, 'predNum' if $test;
+        if (exists $capData->{predNot})
+        {
+            # bit 3 of the predicate field marks the negated form
+            $p |= 8;
+            push @$test, 'predNot' if $test;
+        }
+        # the predicate field is XORed into the code starting at bit 16
+        $code ^= $p << 16;
+        delete @{$capData}{qw(predNum predNot)};
+
+    }
+    # process the register reuse flags
+    foreach my $rcode (qw(reuse1 reuse2 reuse3))
+    {
+        # delete returns the removed value, so only true reuse captures set a bit
+        if (delete $capData->{$rcode})
+        {
+            $reuse |= $reuseCodes{$rcode};
+            push @$test, $rcode if $test;
+        }
+    }
+
+    # everything still left in $capData is either an operand, a flag, or both
+    foreach my $capture (keys %$capData)
+    {
+        # change the base code for immediate versions of the op
+        if (exists $immedOps{$capture})
+            { $code ^= $immedCode << 56; }
+        # change the base code for constant versions of the op
+        elsif (exists $constCodes{$capture})
+            { $code ^= $constCodes{$capture} << 56; }
+
+        # if capture group is an operand then process and add that data to code
+        if (exists $operands{$capture})
+        {
+            # don't process the r20 that comes with the r39s20 capture
+            unless ($capture eq 'r20' && exists $capData->{r39s20})
+            {
+                # the operand handler turns the captured text into the bits to XOR in
+                $code ^= $operands{$capture}->($capData->{$capture});
+                push @$test, $capture if $test;
+            }
+        }
+
+        # Add matching flags (an operand might also add/remove a flag)
+        if (exists $flags->{$capture})
+        {
+            # a named multivalue flag
+            if (ref $flags->{$capture})
+            {
+                # the captured value selects which bits to apply
+                $code ^= $flags->{$capture}{$capData->{$capture}};
+                push @$test, "$capture:$capData->{$capture}" if $test;
+            }
+            # a simple exists flag
+            else
+            {
+                $code ^= $flags->{$capture};
+                push @$test, $capture if $test;
+            }
+        }
+        # neither a flag nor an operand; only reported when not running in test mode
+        elsif (!exists $operands{$capture} && !$test)
+        {
+            # Every capture group should be acted upon.  Missing one is a bug.
+            warn "UNUSED: $op: $capture: $capData->{$capture}\n";
+            warn Dumper($flags);
+        }
+    }
+
+    return $code, $reuse;
+}
+
+
+# Control notation, five colon separated fields in the order readCtrl decodes them:
+# wait mask (two hex digits or dashes), read barrier and write barrier (1-6 or -),
+# yield flag (y, Y or -) and a single hex digit stall count.
+my $CtrlRe = qr'(?<ctrl>[0-9a-fA-F\-]{2}:[1-6\-]:[1-6\-]:[\-yY]:[0-9a-fA-F])';
+# Predicate prefix such as "@P0 " or "@!P0 " including its trailing whitespace;
+# predReg captures just the register name.
+my $PredRe = qr'(?<pred>@!?(?<predReg>P\d)\s+)';
+# A whole instruction: optional predicate, the op name, then everything up to and
+# including the terminating ';'.
+my $InstRe = qr"$PredRe?(?<op>\w+)(?<rest>[^;]*;)"o;
+# Whatever follows the instruction on the same line.
+my $CommRe = qr'(?<comment>.*)';
+
+# Parse one line of assembly source of the form "<ctrl> [@pred] OP operands; comment".
+# Returns a hash ref describing the line (lineNum, pred, predReg, space, op, comment,
+# the space-normalized instruction text and the binary control code), or undef when the
+# line is not an instruction line.
+sub processAsmLine
+{
+    my ($line, $lineNum) = @_;
+
+    return undef unless $line =~ m"^$CtrlRe(?<space>\s+)$InstRe$CommRe"o;
+
+    # snapshot the named captures before calling helpers that run regexes of their own
+    my %cap = %+;
+
+    my $instText = normalizeSpacing($cap{pred} . $cap{op} . $cap{rest});
+    my $ctrlCode = readCtrl($cap{ctrl}, $line);
+
+    my %parsed = (
+        lineNum => $lineNum,
+        pred    => $cap{pred},
+        predReg => $cap{predReg},
+        space   => $cap{space},
+        op      => $cap{op},
+        comment => $cap{comment},
+        inst    => $instText,
+        ctrl    => $ctrlCode,
+    );
+    return \%parsed;
+}
+
+# Parse one instruction line of a SASS listing:
+#   "  /*<hex num>*/  [@pred] OP operands;  /* 0x<hex code> ..."
+# Returns a hash ref with num and code converted from hex, pred, op, and the
+# space-normalized instruction text without (ins) and with (inst) the predicate;
+# returns undef when the line does not match.
+sub processSassLine
+{
+    my $line = shift;
+
+    return undef unless $line =~ m"^\s+/\*(?<num>[0-9a-f]+)\*/\s+$InstRe\s+/\* (?<code>0x[0-9a-f]+)"o;
+
+    # snapshot the named captures before calling helpers that run regexes of their own
+    my %cap = %+;
+
+    my $bare = $cap{op} . $cap{rest};
+
+    my %parsed = (
+        num  => hex($cap{num}),
+        pred => $cap{pred},
+        op   => $cap{op},
+        ins  => normalizeSpacing($bare),
+        inst => normalizeSpacing($cap{pred} . $bare),
+        code => hex($cap{code}),
+    );
+    return \%parsed;
+}
+
+# Decode one control-code line of a SASS listing ("   /* 0x<hex> ...").
+# When $ctrl is a reference, the three control fields (bits 0-16, 21-37 and 42-58 of the
+# word) are appended to it; when $ruse is a reference, the three reuse fields
+# (bits 17-20, 38-41 and 59-62) are appended to it.
+# Returns 1 when the line was a control line, 0 otherwise.
+sub processSassCtrlLine
+{
+    my ($line, $ctrl, $ruse) = @_;
+
+    my ($hexStr) = $line =~ m'^\s+\/\* (0x[0-9a-f]+)';
+    return 0 unless defined $hexStr;
+
+    my $word = hex($hexStr);
+
+    # [mask, shift] pairs, one per field packed into the word
+    my @ctrlFields = (
+        [0x000000000001ffff,  0],
+        [0x0000003fffe00000, 21],
+        [0x07fffc0000000000, 42],
+    );
+    my @ruseFields = (
+        [0x00000000001e0000, 17],
+        [0x000003c000000000, 38],
+        [0x7800000000000000, 59],
+    );
+
+    if (ref $ctrl)
+    {
+        push @$ctrl, map { my ($mask, $shift) = @$_; ($word & $mask) >> $shift } @ctrlFields;
+    }
+    if (ref $ruse)
+    {
+        push @$ruse, map { my ($mask, $shift) = @$_; ($word & $mask) >> $shift } @ruseFields;
+    }
+    return 1;
+}
+
+# Expand the XMAD macro forms found in the assembly text into plain XMAD instructions.
+# $file is the whole source text. Every emitted line repeats the control code, spacing
+# and predicate of the macro line, and the macro's trailing comment is attached to the
+# first emitted line. Dies when the destination and the first operand of a macro are
+# the same register. Returns the rewritten text.
+sub replaceXMADs
+{
+    my $file = shift;
+
+# XMAD.LO d, a, b, c, x;
+# ----------------------
+# XMAD.MRG x, a, b.H1, RZ;
+# XMAD d, a, b, c;
+# XMAD.PSL.CBCC d, a.H1, x.H1, d;
+# ----------------------
+# XMAD d, a, 0xffff, c;
+# XMAD.PSL d, a.H1, 0xffff, d;
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD\.LO\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<x>\w+)\s*;$CommRe/
+
+        die "XMAD.LO: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD.MRG %8$s, %5$s, %6$s.H1, RZ;%9$s
+%1$s%2$s%3$sXMAD %4$s, %5$s, %6$s, %7$s;
+%1$s%2$s%3$sXMAD.PSL.CBCC %4$s, %5$s.H1, %8$s.H1, %4$s;',
+                @+{qw(ctrl space pred d a b c x comment)}
+    /egmos;
+
+    # XMAD[mod].LO2 d, a, b, c;   (mod is an optional pair of .S16/.U16 suffixes)
+    # ----------------------
+    # XMAD[mod] d, a, b, c;
+    # XMAD[mod].PSL d, a.H1, b, d;
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>-?$immed|\w+)\s*,\s*(?<c>c\[$hex\]\[$hex\]|\w+)\s*;$CommRe/
+
+        die "XMAD.LO2: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s.H1, %6$s, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    # XMAD[mod].LO2C d, a, b, c;   (here b may be a constant and c must be a register)
+    # ----------------------
+    # XMAD[mod] d, a, b, c;
+    # XMAD[mod].PSL d, a, b.H1, d;
+    $file =~ s/\n\s*$CtrlRe(?<space>\s+)($PredRe)?XMAD(?<mod>(?:\.[SU]16)(?:\.[SU]16))?\.LO2C\s+(?<d>\w+)\s*,\s*(?<a>\w+)\s*,\s*(?<b>c\[$hex\]\[$hex\]|\w+)\s*,\s*(?<c>\w+)\s*;$CommRe/
+
+        die "XMAD.LO2C: Destination and first operand cannot be the same register ($+{d})." if $+{d} eq $+{a};
+        sprintf '
+%1$s%2$s%3$sXMAD%9$s %4$s, %5$s, %6$s, %7$s;%8$s
+%1$s%2$s%3$sXMAD%9$s.PSL %4$s, %5$s, %6$s.H1, %4$s;',
+            @+{qw(ctrl space pred d a b c comment mod)}
+    /egmos;
+
+    #TODO: add more XMAD macros
+    return $file;
+}
+# Collapse whitespace to single spaces to keep our regexes simpler: every tab becomes a
+# space, then each run of two or more whitespace characters becomes one space.
+sub normalizeSpacing
+{
+    my ($text) = @_;
+    for ($text)
+    {
+        s/\t/ /g;
+        s/\s{2,}/ /g;
+    }
+    return $text;
+}
+
+
+# Map the binary control code onto the easier to read "wait:read:write:yield:stall"
+# notation (the same notation readCtrl parses).
+sub printCtrl
+{
+    my $code = shift;
+
+    my $stall = $code & 0xf;             # bits 0-3
+    my $yield = ($code >> 4)  & 0x1;     # bit 4
+    my $wrtdb = ($code >> 5)  & 0x7;     # bits 5-7:   write dependency barrier
+    my $readb = ($code >> 8)  & 0x7;     # bits 8-10:  read  dependency barrier
+    my $watdb = ($code >> 11) & 0x3f;    # bits 11-16: wait on dependency barrier
+
+    # A set yield bit prints as '-', barrier value 7 prints as '-' (other values print
+    # one higher than stored), and a zero wait mask prints as '--'.
+    return sprintf '%s:%s:%s:%s:%x',
+        ($watdb      ? sprintf('%02x', $watdb) : '--'),
+        ($readb == 7 ? '-' : $readb + 1),
+        ($wrtdb == 7 ? '-' : $wrtdb + 1),
+        ($yield      ? '-' : 'Y'),
+        $stall;
+}
+# Parse the readable "wait:read:write:yield:stall" control notation into its binary form.
+# $context is only used in the error message. Dies when the wait mask does not fit in
+# the 6 bits reserved for it.
+sub readCtrl
+{
+    my ($ctrl, $context) = @_;
+    my ($watdb, $readb, $wrtdb, $yield, $stall) = split ':', $ctrl;
+
+    my $waitMask = $watdb eq '--' ? 0 : hex $watdb;
+    my $readBar  = $readb eq '-'  ? 7 : $readb - 1;   # '-' means no barrier (stored as 7)
+    my $writeBar = $wrtdb eq '-'  ? 7 : $wrtdb - 1;
+    my $yieldBit = lc($yield) eq 'y' ? 0 : 1;         # the bit is cleared when yield is requested
+    my $stallCnt = hex $stall;
+
+    if ($waitMask != ($waitMask & 0x3f))
+    {
+        die sprintf('wait dep out of range(0x00-0x3f): %x at %s',   $waitMask, $context);
+    }
+
+    return ($waitMask << 11)
+         | ($readBar  << 8)
+         | ($writeBar << 5)
+         | ($yieldBit << 4)
+         | ($stallCnt << 0);
+}
+
+# Resolve a register name through $regMap.
+# Returns the mapped value when $regName has a plain (non-reference) entry in the map;
+# when there is no entry, or the entry is a reference, $regName is returned unchanged.
+sub getRegNum
+{
+    my ($regMap, $regName) = @_;
+
+    if (exists $regMap->{$regName})
+    {
+        my $mapped = $regMap->{$regName};
+        return $mapped unless ref $mapped;
+    }
+    return $regName;
+}
+
+# List the registers covered by the r0 capture of an instruction.
+# Returns nothing when there is no r0 capture or it is RZ. For a 64 bit access
+# (type '.64' or i31w4 '0x3') two registers are returned, for a 128 bit access
+# (type '.128' or i31w4 '0xf') four; otherwise just the r0 name itself.
+# A numbered register (Rn) expands to consecutive registers starting at n; any other
+# name is looked up in $vectors, and confess() is called when no suitable vector exists.
+sub getVecRegisters
+{
+    my ($vectors, $capData) = @_;
+
+    my $regName = $capData->{r0};
+    return unless $regName;
+    return if $regName eq 'RZ';
+
+    # number of registers the access spans
+    my $width = ($capData->{type} eq '.64'  || $capData->{i31w4} eq '0x3') ? 2
+              : ($capData->{type} eq '.128' || $capData->{i31w4} eq '0xf') ? 4
+              :                                                              1;
+    return $regName if $width == 1;
+
+    if ($regName =~ m'^R(\d+)$')
+    {
+        return map "R$_", ($1 .. $1 + $width - 1);
+    }
+
+    if ($width == 2)
+    {
+        exists $vectors->{$regName}
+            or confess "$regName not a 64bit vector register";
+        return @{$vectors->{$regName}}[0,1];
+    }
+
+    exists($vectors->{$regName}) && @{$vectors->{$regName}} == 4
+        or confess "$regName not a 128bit vector register";
+    return @{$vectors->{$regName}};
+}
+
+# List the registers covered by the r8 capture of an instruction.
+# Returns nothing when there is no r8 capture or it is RZ, and just the r8 name when the
+# E capture is absent. With E present two registers are returned: a numbered register
+# (Rn) expands to Rn and Rn+1, any other name is looked up in $vectors. When the name
+# is not in $vectors the map is dumped to STDOUT and confess() is called.
+sub getAddrVecRegisters
+{
+    my ($vectors, $capData) = @_;
+
+    my $regName = $capData->{r8};
+    return unless $regName;
+    return if $regName eq 'RZ';
+
+    # without the E capture only the single register is involved
+    return $regName unless exists $capData->{E};
+
+    return map "R$_", ($1 .. $1+1) if $regName =~ m'^R(\d+)$';
+
+    unless (exists $vectors->{$regName})
+    {
+        print Dumper($vectors);
+        confess "$regName not a 64bit vector register";
+    }
+    return @{$vectors->{$regName}}[0,1];
+}
+
+__END__
+
+
+
diff --git a/Assembler/PascalAs/microbench/microbench.cpp b/Assembler/PascalAs/microbench/microbench.cpp
new file mode 100644
index 0000000..7b0187a
--- /dev/null
+++ b/Assembler/PascalAs/microbench/microbench.cpp
@@ -0,0 +1,212 @@
+// microbench.cpp : Defines the entry point for the console application.
+//
+
+// nvcc -l cuda -o microbench microbench.cpp
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <cuda.h>
+#include <cudaProfiler.h>
+
+// Driver API context used by the benchmark; 0 while no context exists.
+// CUDA_CHECK destroys it before exiting when it is non-zero.
+CUcontext hContext = 0;
+
+// Evaluate a CUDA driver API call exactly once. On any result other than CUDA_SUCCESS,
+// print the source location, the failing expression, the status code and the driver's
+// error string, destroy the context if one exists, and exit with EXIT_FAILURE.
+// Wrapped in do { } while (0) so it behaves like a single statement.
+#define CUDA_CHECK( fn ) do { \
+		CUresult status = (fn); \
+		if ( CUDA_SUCCESS != status ) { \
+			const char* errstr; \
+			cuGetErrorString(status, &errstr); \
+			printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \
+			if (hContext) cuCtxDestroy(hContext); \
+			exit(EXIT_FAILURE); \
+		} \
+	} while (0)
+
+
+// Entry point of the micro benchmark host program.
+//
+// Command line (all arguments are optional and positional):
+//   argv[1]  "i" selects internal timing (clock values written by the kernel, the default);
+//            anything else selects external timing via cuEventElapsedTime
+//   argv[2]  number of blocks (values below 1 become 1)
+//   argv[3]  threads per block (must be 32..1024, otherwise 128; rounded down to a multiple of 32)
+//   argv[4]  internal timing: number of lanes to print per warp (1..32, otherwise 1)
+//            external timing: number of floating point operations in a full kernel launch
+//   argv[5]  repeat count of the warm up launches (1..1000, otherwise 1)
+//
+// Loads the "microbench" kernel from microbench.cubin, launches it and prints the results.
+// Returns 0 on success; any CUDA driver failure exits the process through CUDA_CHECK.
+int main(int argc, char* argv[])
+{
+	char deviceName[32];
+	int devCount, ordinal, major, minor;
+	CUdevice  hDevice;
+
+	// Initialize the Driver API and find a device
+	CUDA_CHECK( cuInit(0) );
+	CUDA_CHECK( cuDeviceGetCount(&devCount) );
+	for (ordinal = 0; ordinal < devCount; ordinal++)
+	{
+		CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) );
+		CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) );
+		CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) );
+		CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) );
+		// Accept compute capability 5.2 or anything newer. Testing major and minor
+		// independently would wrongly reject devices such as 6.0 and 6.1.
+		if (major > 5 || (major == 5 && minor >= 2))
+		{
+			printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor);
+			break;
+		}
+	}
+	if (ordinal == devCount)
+	{
+		printf("No compute 5.2 or newer device found, exiting.\n");
+		exit(EXIT_FAILURE);
+	}
+
+	// First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing
+	int internalTiming = 1;
+	if (argc > 1)
+		internalTiming = strcmp(argv[1], "i") == 0 ? 1 : 0;
+
+	// Second command line arg is the number of blocks
+	int blocks = 1;
+	if (argc > 2)
+		blocks = atoi(argv[2]);
+	if (blocks < 1)
+		blocks = 1;
+
+	// Third command line arg is the number of threads
+	int threads = 128;
+	if (argc > 3)
+		threads = atoi(argv[3]);
+	if (threads > 1024 || threads < 32)
+		threads = 128;
+	// round down to a whole number of warps
+	threads &= -32;
+
+	// Fourth command line arg:
+	double fops = 1.0;
+	int lanes = 1;
+	if (argc > 4)
+	{
+		if (internalTiming)
+		{
+			// The number of lanes to print for each warp
+			lanes = atoi(argv[4]);
+			if (lanes > 32 || lanes < 1)
+				lanes = 1;
+		}
+		else
+			// The number of floating point operations in a full kernel launch
+			fops = atof(argv[4]);
+	}
+
+	// Fifth command line arg is the repeat count for benchmarking
+	int repeat = 1;
+	if (argc > 5)
+		repeat = atoi(argv[5]);
+	if (repeat > 1000 || repeat < 1)
+		repeat = 1;
+
+	// One int per thread over all blocks
+	size_t size = sizeof(int) * threads * blocks;
+
+	// Setup our input and output buffers
+	int* dataIn  = (int*)malloc(size);
+	int* dataOut = (int*)malloc(size);
+	int* clocks  = (int*)malloc(size);
+	if (!dataIn || !dataOut || !clocks)
+	{
+		printf("Out of host memory, exiting.\n");
+		free(dataIn); free(dataOut); free(clocks);
+		exit(EXIT_FAILURE);
+	}
+	memset(dataIn, 0, size);
+
+	CUmodule hModule;
+	CUfunction hKernel;
+	CUevent hStart, hStop;
+	CUdeviceptr devIn, devOut, devClocks;
+
+	// Init our context and device memory buffers
+	CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
+	CUDA_CHECK( cuMemAlloc(&devIn, size) );
+	CUDA_CHECK( cuMemAlloc(&devOut, size) );
+	CUDA_CHECK( cuMemAlloc(&devClocks, size) );
+	CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) );
+	CUDA_CHECK( cuMemsetD8(devOut, 0, size) );
+	CUDA_CHECK( cuMemsetD8(devClocks, 0, size) );
+
+	CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) );
+	CUDA_CHECK( cuEventCreate(&hStop,  CU_EVENT_BLOCKING_SYNC) );
+
+	// Load our kernel
+	CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") );
+	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") );
+
+	// Setup the params (order must match the kernel signature: out, clocks, in)
+	void* params[] = { &devOut, &devClocks, &devIn };
+	float ms = 0;
+
+	// Warm up the clock (unless under nsight)
+	if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER
+		for (int i = 0; i < repeat; i++)
+			CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
+
+	// Launch the kernel, bracketed by events for the external timing
+	CUDA_CHECK( cuEventRecord(hStart, NULL) );
+	//CUDA_CHECK( cuProfilerStart() );
+	CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
+	//CUDA_CHECK( cuProfilerStop() );
+	CUDA_CHECK( cuEventRecord(hStop, NULL) );
+	CUDA_CHECK( cuEventSynchronize(hStop) );
+	CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) );
+
+	//CUDA_CHECK( cuCtxSynchronize() );
+
+	// Get back our results from each kernel
+	CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) );
+	CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) );
+
+	// Cleanup and shutdown of cuda
+	CUDA_CHECK( cuEventDestroy(hStart) );
+	CUDA_CHECK( cuEventDestroy(hStop) );
+	CUDA_CHECK( cuModuleUnload(hModule) );
+	CUDA_CHECK( cuMemFree(devIn) );
+	CUDA_CHECK( cuMemFree(devOut) );
+	CUDA_CHECK( cuMemFree(devClocks) );
+	CUDA_CHECK( cuCtxDestroy(hContext) );
+	// make sure a later CUDA_CHECK failure cannot destroy the context twice
+	hContext = 0;
+
+	// With internal timing, print out the clock data the kernel wrote
+	if (internalTiming)
+	{
+		int count = 0, total = 0, min = 999999, max = 0;
+
+		int* clocks_p  = clocks;
+		int* dataOut_p = dataOut;
+
+		// Loop over and print results
+		for (int blk = 0; blk < blocks; blk++)
+		{
+			for(int tid = 0; tid < threads; tid += 32)
+			{
+				// Sometimes we want data on each thread, sometimes just one sample per warp is fine
+				for (int lane = 0; lane < lanes; lane++)
+					printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]);
+
+				// The statistics only sample the first lane of every warp
+				count++;
+				total += clocks_p[tid];
+				if (clocks_p[tid] < min) min = clocks_p[tid];
+				if (clocks_p[tid] > max) max = clocks_p[tid];
+			}
+			clocks_p  += threads;
+			dataOut_p += threads;
+		}
+		printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max);
+	}
+	else
+	{
+		// With external timing we're testing throughput and report the event based timing data
+		printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0));
+	}
+	// And free up host memory
+	free(dataIn); free(dataOut); free(clocks);
+
+	return 0;
+}
diff --git a/Assembler/PascalAs/microbench/microbench.cu b/Assembler/PascalAs/microbench/microbench.cu
new file mode 100644
index 0000000..7d4cd8f
--- /dev/null
+++ b/Assembler/PascalAs/microbench/microbench.cu
@@ -0,0 +1,69 @@
+
+// Note this file isn't configured to automatically compile
+
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+
+// Build:
+// nvcc -l cuda -o microbench microbench.cpp
+// nvcc -arch sm_50 -cubin microbench.cu
+
+// Inspect a cubin (use nvdisasm from cuda 6.5 for best results):
+// maxas.pl -e microbench.cubin
+
+// Insert new sass into cubin
+// maxas.pl -i microbench.sass microbench.cubin
+
+// run it:
+// ./microbench
+
+// Use extern C so C++ doesn't mangle our kernel name
+//
+// Kernel with the (out, clocks, in) parameter list that the host program passes.
+// NOTE(review): the build notes in this file describe replacing the compiled code with
+// hand written sass (maxas.pl -i), so this body presumably only serves to produce a
+// cubin with the right signature and shared memory size; confirm before relying on it.
+extern "C" __global__ void  microbench(int *out, int *clocks, int *in)
+{
+    // one int of shared memory per possible thread index
+    __shared__ int share[1024];
+
+    int tid = threadIdx.x;
+    int bx  = blockIdx.x;
+    int by  = blockIdx.y;
+
+    int start = clock();
+
+    // every thread of a block loads the same input element into its own shared slot
+    share[tid] = in[by * 65535 + bx]; //tid + blkDimX + blkDimY + blkDimZ + grdDimX + grdDimY + grdDimZ
+
+    __syncthreads();
+
+    int end = clock();
+
+    // packs (start >> 16) together with the upper 16 bits of end into one int
+    // (results are indexed by threadIdx.x only, so all blocks write the same slots)
+    clocks[tid] = (start >> 16) | (end & 0xffff0000); //end - start;
+
+    // read the shared slot written by the neighbouring thread (tid with bit 0 flipped)
+    out[tid] = share[tid ^ 1];
+}
+
+// A note about using the Cuda Runtime.
+// If that's your preference over the driver API then here's what you'd do:
+
+// In your project properties in the Cuda C/C++ panel:
+//    -Set the "Keep Processed Files" (-keep) option
+//    -Add a -v manually to the command line
+// If compiling on command line just add -keep -v options to nvcc.
+// Rebuild your solution and look in the log for these lines that follow the ptxas step:
+
+// #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
+// #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
+// #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
+
+// You just need to manually run these 3 commands (or add them to a build script)
+// after you've modified the cubin generated from the preceding ptxas command.
+// That will give you a new .cu.obj file which will automatically be linked in for you next time you
+// build your project (or you could manually run the linker step as well).
+
+// Having done that you can call your kernel normally using the <<< >>> syntax.
+// Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
+// With fatbin you can also keep non-maxwell optimized versions of your code.
+
+
+// I just discovered this also works as a shortcut to the above:
+// nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=microbench.cubin -o microbench.lib microbench.cu
+
+// The cu kernel definitions above need to have empty bodies.
+// And, the cu file must be compiled to a lib separately before linking.
\ No newline at end of file
diff --git a/Assembler/PascalAs/microbench/microbench.sass b/Assembler/PascalAs/microbench/microbench.sass
new file mode 100644
index 0000000..609274a
--- /dev/null
+++ b/Assembler/PascalAs/microbench/microbench.sass
@@ -0,0 +1,72 @@
+# Kernel: microbench
+
+// This is a simple micro bench to demonstrate the latency in loading SR_TID.X
+
+<CONSTANT_MAPPING>
+    blockDimX : c[0x0][0x08]
+    blockDimY : c[0x0][0x0c]
+    blockDimZ : c[0x0][0x10]
+    gridDimX  : c[0x0][0x14]
+    gridDimY  : c[0x0][0x18]
+    gridDimZ  : c[0x0][0x1c]
+
+    param_out[0]    : c[0x0][0x140]
+    param_out[1]    : c[0x0][0x144]
+    param_clocks[0] : c[0x0][0x148]
+    param_clocks[1] : c[0x0][0x14c]
+    param_in[0]     : c[0x0][0x150]
+    param_in[1]     : c[0x0][0x154]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     0-1 : out<0-1>
+     2-3 : clocks<0-1>
+     4-5 : in<0-1>
+    6-20 : tid, bid, blockDim, clock1, clock2, result, offset, x
+
+</REGISTER_MAPPING>
+
+// Load in our params (not currently used below)
+--:-:-:-:1      MOV in0, param_in[0];
+--:-:-:-:1      MOV in1, param_in[1];
+
+// Get the first clock value
+--:-:-:-:1      CS2R clock1, SR_CLOCKLO;
+
+// Get the threadId and blockId
+// Set the Read-After-Write dependency barrier 1 and 2
+--:-:1:-:1      S2R tid, SR_TID.X;
+// Add one additional clock stall to allow the barrier time to set prior to next instruction that uses it
+--:-:2:-:2      S2R bid, SR_CTAID.X;
+
+
+// Get the second clock value
+// Wait on the dependency barriers (1 and 2) that were set by the two S2R instructions above
+// Stall 6 to allow CS2R time to complete before next instruction
+// CS2R takes a constant 6 clocks to complete unlike S2R which is a variable 22-44 clocks
+// This stall count does not factor into the time calculation at all
+03:-:-:-:6      CS2R clock2, SR_CLOCKLO;
+
+// Take the difference of clocks
+--:-:-:-:1      IADD clock1, clock2, -clock1;
+
+// Setup our output addresses
+// Stall your pipeline dependencies properly
+// Note using a single XMAD assumes blockDimX and bid are 16 bit values, which is reasonable for this test code
+--:-:-:-:6      XMAD offset, bid, blockDimX, tid;
+
+// LEA is "load effective address"
+// The offset param is shifted left 2 and added to the pointers with 64bit math
+--:-:-:-:6      LEA      clocks0.CC, offset, param_clocks[0],     2;
+--:-:-:-:1      LEA.HI.X clocks1,    offset, param_clocks[1], RZ, 2;
+
+--:-:-:-:6      LEA      out0.CC, offset, param_out[0],     2;
+--:-:-:-:1      LEA.HI.X out1,    offset, param_out[1], RZ, 2;
+
+// Output the results.
+// No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values
+--:-:-:-:1      STG.E [clocks], clock1;
+--:-:-:-:1      STG.E [out],    offset; # use this to return whatever you like to inspect the results
+--:-:-:-:5      EXIT;
+
diff --git a/Assembler/PascalAs/microbench/shared.pl b/Assembler/PascalAs/microbench/shared.pl
new file mode 100755
index 0000000..f760664
--- /dev/null
+++ b/Assembler/PascalAs/microbench/shared.pl
@@ -0,0 +1,12 @@
+#!/usr/bin/perl
+# Assemble shared_sts16.sass into microbench.cubin, then run the micro benchmark with
+# internal timing ("i"), 1 block and 64 threads, echoing the output of both steps.
+use strict;
+use warnings;
+
+print `maxas.pl -i shared_sts16.sass microbench.cubin`;
+
+# Stop here when the assembler step failed, and report the failure through the exit
+# status (a bare exit would report success).
+exit(($? >> 8) || 1) if $?;
+
+print `Release\\microbench.exe i 1 64`;
+
+
+__END__
+
diff --git a/Assembler/PascalAs/microbench/shared_lds.sass b/Assembler/PascalAs/microbench/shared_lds.sass
new file mode 100644
index 0000000..5f31dcf
--- /dev/null
+++ b/Assembler/PascalAs/microbench/shared_lds.sass
@@ -0,0 +1,122 @@
+# Kernel: microbench
+# InsCnt: 18
+# RegCnt: 5
+# SharedSize: 4096
+# BarCnt: 1
+# Params(3):
+#   ord:addr:size:align
+#   0:0x140:4:0
+#   1:0x144:4:0
+#   2:0x148:4:0
+
+// This is a simple micro bench that times whatever is placed between the two CS2R clock reads (currently a constant bank MOV)
+
+<REGISTER_MAPPING>
+
+    0-3 : result, a, b, c
+
+    4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid3, tid7, tid96, tid128, readAs, readBs, val<0-20>
+
+</REGISTER_MAPPING>
+
+// Load in our params
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R bid,      SR_CTAID.X;
+
+--:-:-:-:1      MOV result,  c[0x0][0x0];
+--:-:-:-:1      MOV in,      c[0x0][0x100];
+
+--:-:-:-:1      CS2R clock1, SR_CLOCKLO;
+--:-:-:-:1      MOV result,  c[0x0][0x13c];
+--:-:-:-:1      CS2R clock2, SR_CLOCKLO;
+
+--:-:-:-:1      MOV blockDim, c[0x0][0x8];
+--:-:-:-:1      MOV out,      c[0x0][0x140];
+--:-:-:-:1      MOV clocks,   c[0x0][0x144];
+
+
+
+
+<SCHEDULE_BLOCK>
+
+03:-:-:-:1      LOP.AND tid3,   tid, 3;
+--:-:-:-:1      LOP.AND tid7,   tid, 7;
+--:-:-:-:1      LOP.AND tid96,  tid, 96;
+--:-:-:-:1      LOP.AND tid128, tid, 128;
+
+// readAs = ((tid128 >> 4) | tid7) << 4
+--:-:-:-:1      SHR.U32 readAs, tid128, 4;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid7;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = (tid96 >> 3) | tid3   (the final << 4 is currently commented out below)
+--:-:-:-:1      SHR.U32 readBs, tid96, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid3;
+#--:-:-:-:1      SHL     readBs, readBs, 4;
+#--:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
+
+
+</SCHEDULE_BLOCK>
+
+
+
+#--:-:-:-:1      LDS.U.128 result, [readBs];
+
+
+
+
+01:-:-:-:1      IADD clock1, clock2, -clock1;
+
+
+--:-:-:-:1      XMAD tid, blockDim, bid, tid;
+--:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
+--:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
+--:-:-:Y:6      SHL  tid, tid, 0x2;
+
+--:-:-:-:1      IADD clocks, clocks, tid;
+--:-:-:-:2      IADD out,  out,  tid;
+
+--:-:-:-:1      STG [clocks], clock1;
+--:-:-:-:1      STG [out],    readBs;
+--:-:-:-:5      EXIT;
+
+<COMMENT>
+
+--:-:-:-:4      LOP.AND tid32, tid, -32;
+
+--:-:-:-:1      STS.128 [tid32 + 4x<2048>], RZ;
+
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:1:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+
+
+// readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301;
+--:-:-:-:1      LOP.AND readAs, tid,    0x80;
+--:-:-:-:1      SHR.U32 readAs, readAs, 4;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid7;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096;
+--:-:-:-:1      LOP.AND tid1,   tid,    0x1;
+--:-:-:-:1      LOP.AND readBs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
+
+
+</COMMENT>
\ No newline at end of file
diff --git a/Assembler/PascalAs/microbench/shared_sts16.sass b/Assembler/PascalAs/microbench/shared_sts16.sass
new file mode 100644
index 0000000..2f6eb39
--- /dev/null
+++ b/Assembler/PascalAs/microbench/shared_sts16.sass
@@ -0,0 +1,116 @@
+# Kernel: microbench
+# InsCnt: 18
+# RegCnt: 5
+# SharedSize: 4096
+# BarCnt: 1
+# Params(3):
+#   ord:addr:size:align
+#   0:0x140:4:0
+#   1:0x144:4:0
+#   2:0x148:4:0
+
+// This is a simple micro bench that times a shared memory load (LDS.U.64) placed between two CS2R clock reads
+
+<REGISTER_MAPPING>
+
+    0-3 : result, a, b, c
+
+    4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid1, tid31, tid32, readAs, readBs, val<0-20>
+
+</REGISTER_MAPPING>
+
+// Load in our params
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R bid,      SR_CTAID.X;
+
+//--:-:-:-:1      MOV result,  c[0x0][0x0];
+//--:-:-:-:1      MOV in,      c[0x0][0x100];
+--:-:-:-:1      MOV result, 1;
+
+--:-:-:-:1      MOV blockDim, c[0x0][0x8];
+--:-:-:-:1      MOV out,      c[0x0][0x140];
+--:-:-:-:1      MOV clocks,   c[0x0][0x144];
+
+
+// readAs = ((tid >> 1) & 7) << 3;
+03:-:-:-:6      BFE.U32 readAs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:6      SHL     readAs, readAs, 3;
+
+// readBs  = ((((tid & 0x30) >> 3) | (tid & 1)) << 3) + 0;
+--:-:-:-:6      LOP.AND tid1,   tid,    1;
+--:-:-:-:6      LOP.AND readBs, tid,    0x30;
+--:-:-:-:6      SHR.U32 readBs, readBs, 3;
+--:-:-:-:6      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:6      ISCADD  readBs, readBs, 0, 3;
+
+
+
+///--:-:-:-:1      STS [tid32], result;
+//--:-:-:-:1      STS.S16 [tid32 + 2x<32>], result;
+//--:-:1:-:2      LDS.U.64 result, [readBs];
+
+--:-:-:-:0      CS2R clock1, SR_CLOCKLO;
+--:-:1:-:6      LDS.U.64 result, [readAs];
+--:-:-:-:6      CS2R clock2, SR_CLOCKLO;
+
+
+01:-:-:-:1      IADD clock1, clock2, -clock1;
+
+
+--:-:-:-:1      XMAD tid, blockDim, bid, tid;
+--:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
+--:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
+--:-:-:Y:6      SHL  tid, tid, 0x2;
+
+--:-:-:-:1      IADD clocks, clocks, tid;
+--:-:-:-:2      IADD out,  out,  tid;
+
+--:-:-:-:1      STG [clocks], clock1;
+--:-:-:-:1      STG [out],    result;
+--:-:-:-:5      EXIT;
+
+<COMMENT>
+
+--:-:-:-:4      LOP.AND tid32, tid, -32;
+
+--:-:-:-:1      STS.128 [tid32 + 4x<2048>], RZ;
+
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+--:-:1:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
+
+03:-:-:-:6      LOP.AND  tid31, tid, 31;
+--:-:-:-:6      LOP.AND  tid32, tid, 32;
+--:-:-:-:6      SHL  tid32, tid32, 0x2;
+--:-:-:-:6      LOP.OR  tid32, tid32, tid31;
+--:-:-:-:6      SHL  tid32, tid32, 0x2;
+
+// readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301;
+--:-:-:-:1      LOP.AND readAs, tid,    0x80;
+--:-:-:-:1      SHR.U32 readAs, readAs, 4;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid7;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096;
+--:-:-:-:1      LOP.AND tid1,   tid,    0x1;
+--:-:-:-:1      LOP.AND readBs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
+
+
+</COMMENT>
\ No newline at end of file
diff --git a/Assembler/PascalAs/microbench/throughput.pl b/Assembler/PascalAs/microbench/throughput.pl
new file mode 100755
index 0000000..56df6e7
--- /dev/null
+++ b/Assembler/PascalAs/microbench/throughput.pl
@@ -0,0 +1,80 @@
+#!/usr/bin/perl
+# Generate a throughput test kernel, assemble it into microbench.cubin and run the
+# micro benchmark with external timing, printing the measured GFLOPS per thread count.
+use strict;
+use warnings;
+
+my $loopSize  = 512;       # FFMA instructions per loop iteration (see writeSassFile)
+my $blocks    = 32;
+my $loops     = 10240000;
+my $fileName  = 'throughput2.sass';
+
+writeSassFile($fileName, $loops);
+
+#print `maxas.pl -p $fileName`;
+#exit;
+
+print `maxas.pl -i $fileName microbench.cubin`;
+# Stop when the assembler step failed, and report the failure through the exit status
+# (a bare exit would report success).
+exit(($? >> 8) || 1) if $?;
+
+foreach my $thread128 (2)
+{
+    my $threads   = $thread128 * 128;
+    # each FFMA counts as 2 floating point operations
+    my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
+
+    my $data = `Release\\microbench.exe e $blocks $threads $fops`;
+
+    my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
+    # no GFLOPS line in the output (for example the benchmark failed to run): report 0
+    $gflops = 0 unless defined $gflops;
+
+    printf "%d %d %d\n", $thread128, $threads, $gflops;
+}
+
+exit;
+
+# Write the throughput test kernel to $filename.
+# The heredoc below is used as a printf format: its single %d receives $loops, the loop
+# count loaded into the "stop" register. The <CODE> section is emitted verbatim as text.
+# Dies when the file cannot be opened or when closing it fails.
+sub writeSassFile
+{
+    my ($filename, $loops) = @_;
+
+    # three argument open so the file name can never be interpreted as a mode
+    open my $fh, '>', $filename or die "$filename: $!";
+
+    printf {$fh} <<'EOF', $loops;
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+    0-10 : result, r1, r2, r3
+    20-27 ~ count, stop
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, %d;
+--:-:-:-:1      MOV32I r1, 1.0;
+--:-:-:-:1      MOV32I r2, 1.0;
+--:-:-:-:4      MOV32I r3, 1.0;
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+    foreach my $i (0 .. 511)
+    {
+        my $yield = ($i + 32) & 63 ? '-' : 'Y';
+
+        my $stall = $i == 511 ? 0 : 1;
+
+        $out .= "--:-:-:$yield:$stall      FFMA result, r1, r2, r3;\n";
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
+EOF
+
+    # buffered write errors only surface at close
+    close $fh or die "$filename: $!";
+}
+
+__END__
+
diff --git a/Assembler/PascalAs/microbench/throughput.sass b/Assembler/PascalAs/microbench/throughput.sass
new file mode 100644
index 0000000..796502f
--- /dev/null
+++ b/Assembler/PascalAs/microbench/throughput.sass
@@ -0,0 +1,95 @@
+# Kernel: microbench
+# InsCnt: 18
+# RegCnt: 5
+# SharedSize: 4096
+# BarCnt: 1
+# Params(3):
+#   ord:addr:size:align
+#   0:0x140:4:0
+#   1:0x144:4:0
+#   2:0x148:4:0
+
+<REGISTER_MAPPING>
+
+    8-20 : count
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV R0, RZ;
+--:-:-:-:1      MOV R1, RZ;
+--:-:-:-:1      MOV R2, RZ;
+--:-:-:-:1      MOV R3, RZ;
+--:-:-:-:1      MOV R4, RZ;
+--:-:-:-:1      MOV R5, RZ;
+--:-:-:-:1      MOV R6, RZ;
+--:-:-:-:1      MOV R7, RZ;
+--:-:-:-:1      MOV R8, RZ;
+--:-:-:Y:6      MOV count, RZ;
+
+// This loop is capable of running at 1700 GFlops on GM107.
+// You can tweak it to see how register bank conflicts or different control codes
+// affect performance.
+// With throughput.pl you can pass params to this code and do some autotuning.
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, 0x19000, PT;
+--:-:-:-:1      IADD count, count, 0x1;
+
+<CODE>
+    my $out;
+
+    foreach my $i (0..511) #511
+    {
+        my $y = ($i + 32) & 63 ? '-' : 'Y';
+
+        $out .= qq|
+--:-:-:$y:1      FFMA R0, R1, R2, R3;|; #c[0x0][$c]
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+
+--:-:-:-:5      EXIT;
+
+<COMMENT>
+
+
+    open my $fh, 'params.txt';
+    my $line = <$fh>;
+    close $fh;
+    my ($r1, $r2, $r3) = split "\t", $line;
+
+    80-95 : out, clocks, in, tid, clock1, clock2, result
+
+
+--:-:1:-:1      S2R tid,   SR_TID.X;
+--:-:-:-:1      MOV out,    c[0x0][0x140];
+--:-:-:-:1      MOV clocks, c[0x0][0x144];
+01:-:-:-:1      MOV in,     c[0x0][0x148];
+
+
+
+--:-:-:-:1      MOV32I f0, 0x3f800000;
+--:-:-:-:1      MOV32I f1, 0x3f800000;
+--:-:-:-:1      MOV32I f2, 0x3f800000;
+--:-:-:-:5      MOV32I f3, 0x3f800000;
+
+--:-:-:-:1      CS2R clock1, SR_CLOCKLO;
+
+
+--:-:-:-:1      CS2R clock2, SR_CLOCKLO;
+
+--:-:-:-:6      MOV32I result, 0x457;
+--:-:-:-:1      IADD clock1, clock2, -clock1;
+
+
+--:-:-:-:6      SHL  tid, tid, 0x2;
+--:-:-:-:1      IADD clocks, clocks, tid;
+--:-:-:-:1      IADD out,  out,  tid;
+
+--:-:-:-:1      STG [clocks], clock1;
+--:-:-:-:1      STG [out],    R24;
+
+
+</COMMENT>
\ No newline at end of file
diff --git a/Assembler/PascalAs/microbench/throughput2.pl b/Assembler/PascalAs/microbench/throughput2.pl
new file mode 100755
index 0000000..ea7e19f
--- /dev/null
+++ b/Assembler/PascalAs/microbench/throughput2.pl
@@ -0,0 +1,158 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+my %p;   # benchmark parameters: the four set first are inputs, the rest are derived
+$p{N}         = 8192;
+$p{blocking}  = 8;
+$p{unroll}    = 8;
+$p{threads}   = 64;   #256
+
+$p{csize}     = $p{blocking} * $p{blocking};
+$p{loopSize}  = $p{unroll} * $p{csize};
+$p{width}     = sqrt($p{csize} * $p{threads});
+$p{blocks}    = ($p{N} / $p{width}) * ($p{N} / $p{width});
+$p{loops}     = $p{N} / $p{unroll};
+$p{fops}      = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads};
+
+my $fileName  = 'throughput2.sass';
+
+my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops);
+
+#print join("\t", @params), "\n";
+#print join("\t", @p{@params}), "\n";
+
+print map sprintf("%-9s: %d\n", $_, $p{$_}), @params;
+
+writeSassFile($fileName, $p{loopSize}, $p{loops});
+
+#print `maxas.pl -p $fileName`;
+#exit;
+
+print `maxas.pl -i $fileName microbench.cubin`;
+# A non-zero $? means the assembler step failed: stop with a failing exit status.
+exit(($? >> 8) || 1) if $?;
+
+my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`;
+
+my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
+
+print $data;
+
+#printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
+
+
+
+
+sub writeSassFile
+{
+    my ($filename, $loopSize, $loops) = @_;
+
+    open my $fh, ">$filename" or die "$filename: $!";
+
+    printf $fh <<'END_SASS', $loops;
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67>
+     7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67>
+     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67>
+     5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67>
+    35,34,43,42,51,50,59,58 : cx64y<00-03|64-67>
+    39,38,47,46,55,54,63,62 : cx65y<00-03|64-67>
+    33,32,41,40,49,48,57,56 : cx66y<00-03|64-67>
+    37,36,45,44,53,52,61,60 : cx67y<00-03|64-67>
+
+    64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67>
+    80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67>
+
+    0-127 : r<0-127>
+
+    100-101 : count, stop
+
+    //102-112 ~ readAs, readBs, writeS
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, %d;
+//--:-:-:-:1      MOV writeS, RZ;
+//--:-:-:-:1      MOV readAs, RZ;
+//--:-:-:-:1      MOV readBs, RZ;
+
+<CODE>
+    return join '', map "--:-:-:-:1      MOV32I r$_, 1.0;\n", 0..95;
+</CODE>
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+
+    my @cOrder;
+    #my @swirl = ([0,1],[0,0],[2,0],[2,1]);
+    my @swirl = ([2,0],[2,1],[0,1],[0,0]);
+    #my @swirl = ([0,1],[0,0],[1,0],[1,1]);
+    my @xVals = (0,1,64,65);
+    #my @xVals = (0,2,64,66);
+
+    my @yVals = (0,2,64,66);
+
+    foreach my $y (@yVals)
+    {
+        foreach my $x (@xVals)
+        {
+            push @cOrder, sprintf('x%%02dy%%02d', $x + $_->[0], $y + $_->[1]) foreach @swirl;
+        }
+        @xVals = reverse @xVals;
+    }
+
+    foreach my $j (0..7)
+    {
+        my $odd  = $j & 1;
+        my $nOdd = !$odd + 0;
+
+		my %%insert;
+
+        #$insert{c62} = "01:-:-:-:5      BAR.SYNC 0;\n" if $j == 6;
+
+        $insert{c62} =
+                "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR writeS, writeS, 0;\n" if $j == 8;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = $cOrder[$c] =~ /^(x\d+)(y\d+)/;
+            my $ins    = $insert{"c$c"} || '';
+            my $stall  = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
+            my $yield  = $c == 32 ? 'Y' : '-';
+            my $wait   = '--'; #$c ? '--' : '01';
+
+            $out .= "$wait:-:-:$yield:$stall      FFMA c$cOrder[$c], j${odd}A$x, j${odd}B$y, c$cOrder[$c];\n$ins";
+        }
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
+END_SASS
+
+    close $fh;
+}
+
+__END__
+
+        my %%insert = (
+            c0 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n",
+            c2 => "--:-:-:-:1      LDS.U.128 j${nOdd}By00, [readBs+0x10];\n",
+            c4 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n",
+            c6 => "--:-:1:-:1      LDS.U.128 j${nOdd}By64, [readBs+0x10];\n",
+        );
\ No newline at end of file
diff --git a/Assembler/PascalAs/microbench/throughput2.sass b/Assembler/PascalAs/microbench/throughput2.sass
new file mode 100644
index 0000000..3db5130
--- /dev/null
+++ b/Assembler/PascalAs/microbench/throughput2.sass
@@ -0,0 +1,47 @@
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+    0-10 : result, r1, r2, r3
+    20-27 ~ count, stop
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, 102400;
+--:-:-:-:1      MOV32I r1, 1.0;
+--:-:-:-:1      MOV32I r2, 1.0;
+--:-:-:-:4      MOV32I r3, 1.0;
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+    foreach my $i (0 .. 511)
+    {
+        my $yield = ($i + 32) & 63 ? '-' : 'Y';
+
+        my $stall = $i == 511 ? 0 : 1;
+
+        #$out .= "--:-:-:$yield:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:0      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      I2F.F32.S16 result, r1;\n";
+
+        #$out .= "--:-:-:$yield:$stall      VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n";
+        #$out .= "--:-:-:-:1      MOV result, RZ;\n";
+
+        $out .= "--:-:-:$yield:$stall      IADD.SAT result, r1, r2;\n";
+        #$out .= "--:-:-:$yield:$stall      VMAD.S8.S8.SAT result, r1, r2, r3;\n";
+        #$out .= "--:-:-:$yield:$stall      XMAD result, r1, r2, r3;\n";
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
diff --git a/Assembler/PascalAs/microbench/throughput3.pl b/Assembler/PascalAs/microbench/throughput3.pl
new file mode 100755
index 0000000..ff9077a
--- /dev/null
+++ b/Assembler/PascalAs/microbench/throughput3.pl
@@ -0,0 +1,90 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+my %data;   # loop size => GFLOPS readings, pushed in order of increasing $thread128
+
+foreach my $thread128 (1 .. 8)
+{
+    foreach my $size64 (8 .. 16)
+    {
+        my $loopSize  = $size64 * 64;
+        my $loops     = int(2 * 1638400 / ($size64 * $thread128));
+
+        my $blocks    = 16;
+        my $threads   = $thread128 * 128;
+        my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
+        my $fileName  = 'throughput2.sass';
+
+        #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $fops;
+        #next;
+
+        writeSassFile($fileName, $loopSize, $loops);
+
+        `maxas.pl -i $fileName microbench.cubin`;
+        # Abort the sweep with a failing exit status if the assembler step failed.
+        exit(($? >> 8) || 1) if $?;
+
+        my $data = `Release\\microbench.exe e $blocks $threads $fops`;
+
+        my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
+        $gflops = 0 unless defined $gflops;   # benchmark printed no GFLOPS line
+        printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
+
+        push @{$data{$loopSize}}, $gflops;
+    }
+}
+print join("\t", 'size', 1 .. 8), "\n";
+foreach my $loopSize (sort {$a <=> $b} keys %data)
+{
+    print join("\t", $loopSize, @{$data{$loopSize}}), "\n";
+}
+
+exit;
+
+sub writeSassFile
+{
+    my ($filename, $loopSize, $loops) = @_;
+    # printf slots, in order: "stop" bound, last FFMA index, loop size for the yield test.
+    # The range must end at $loopSize - 1 so exactly $loopSize FFMAs are emitted.
+    open my $fh, '>', $filename or die "$filename: $!";
+    printf $fh <<'EOF', $loops, $loopSize - 1, $loopSize;
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+    0-10 : result, r1, r2, r3, count, stop
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, %d;
+--:-:-:-:1      MOV32I r1, 1.0;
+--:-:-:-:1      MOV32I r2, 1.0;
+--:-:-:-:4      MOV32I r3, 1.0;
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+    foreach my $i (0 .. %d)
+    {
+        my $y = %d > 64 && (($i + 32) & 63) ? '-' : 'Y';
+
+        $out .= "--:-:-:$y:1      FFMA result, r1, r2, r3;\n";
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
+EOF
+
+    close $fh or die "$filename: $!";   # buffered write errors surface at close
+}
+
+__END__
+
diff --git a/Assembler/PascalAs/microbench/throughput4.pl b/Assembler/PascalAs/microbench/throughput4.pl
new file mode 100755
index 0000000..8f8760c
--- /dev/null
+++ b/Assembler/PascalAs/microbench/throughput4.pl
@@ -0,0 +1,120 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+my $loopSize  = 512;
+my $blocks    = 64;
+my $loops     = 102400;
+my $fileName  = 'throughput2.sass';
+
+writeSassFile($fileName, $loops);
+
+#print `maxas.pl -p $fileName`;
+#exit;
+
+print `maxas.pl -i $fileName microbench.cubin`;
+exit(($? >> 8) || 1) if $?;   # $? is a wait status; the child's exit code is its high byte
+
+foreach my $thread128 (4)
+{
+    my $threads   = $thread128 * 128;
+    my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
+
+    print "./microbench e $blocks $threads $fops\n\n";
+    my $data = `./microbench e $blocks $threads $fops`;
+    exit(($? >> 8) || 1) if $?;   # exit($?) would truncate e.g. 256 to status 0
+
+    my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
+    $gflops = 0 unless defined $gflops;   # no GFLOPS line in the benchmark output
+    printf "%d %d %d %.2f\n", $thread128, $threads, $gflops, 100 * $gflops / 3050.0;
+}
+
+exit;
+
+sub writeSassFile
+{
+    my ($filename, $loops) = @_;
+
+    open my $fh, ">$filename" or die "$filename: $!";
+
+    printf $fh <<'EOF', $loops;
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+    0-10 : result, r1, r2, r3
+    20-27 ~ count, stop
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, %d;
+--:-:-:-:1      MOV32I r1, 1.0;
+--:-:-:-:1      MOV32I r2, 1.0;
+--:-:-:-:4      MOV32I r3, 1.0;
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+    foreach my $i (0 .. 511)
+    {
+        my $yield = ($i + 32) & 63 ? '-' : 'Y';
+
+        my $stall = $i == 511 ? 0 : 1;
+
+        #$out .= "--:-:-:$yield:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:0      FFMA r3, r1, r2, r3;\n";
+        #$out .= "--:-:-:-:1      I2F.F32.S16 result, r1;\n";
+
+        #$out .= "--:-:-:$yield:$stall      VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n";
+        #$out .= "--:-:-:-:1      MOV result, RZ;\n";
+
+        $out .= "--:-:-:$yield:$stall      IADD.SAT result, r1, r2;\n";
+        #$out .= "--:-:-:$yield:$stall      VMAD.S8.S8.SAT result, r1, r2, r3;\n";
+        #$out .= "--:-:-:$yield:$stall      XMAD result, r1, r2, r3;\n";
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
+EOF
+
+    close $fh;
+}
+
+__END__
+
+VMAD.U8.U8
+
+dddd 2655 / 4968 = 53.4%
+1d1d 4594 / 4968 = 92.4%
+11d  4746 / 4968 = 95.5%
+111d 4841 / 4968 = 97.4%
+
+block context switches are a little more expensive than thread context switches
+
+stall codes:
+
+f : 13 clocks
+e :  8 clocks
+d :  6 clocks
+c :  8 clocks, no yield
+b : 11 clocks
+a : 10 clocks
+9 :  9 clocks
+8 :  8 clocks
+7 :  7 clocks
+6 :  6 clocks
+5 :  5 clocks
+4 :  4 clocks
+3 :  3 clocks
+2 :  2 clocks
+1 :  1 clocks,  no yield
+0 :  0 clocks,  no yield, dual issue
\ No newline at end of file
diff --git a/Assembler/PascalAs/microbench/throughput5.pl b/Assembler/PascalAs/microbench/throughput5.pl
new file mode 100755
index 0000000..f9bda8e
--- /dev/null
+++ b/Assembler/PascalAs/microbench/throughput5.pl
@@ -0,0 +1,164 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+my %p;   # benchmark parameters: the four set first are inputs, the rest are derived
+$p{N}         = 8192;
+$p{blocking}  = 8;
+$p{unroll}    = 8;
+$p{threads}   = 64;   #256
+
+$p{csize}     = $p{blocking} * $p{blocking};
+$p{loopSize}  = $p{unroll} * $p{csize};
+$p{width}     = sqrt($p{csize} * $p{threads});
+$p{blocks}    = ($p{N} / $p{width}) * ($p{N} / $p{width});
+$p{loops}     = $p{N} / $p{unroll};
+$p{fops}      = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads};
+
+my $fileName  = 'throughput2.sass';
+
+my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops);
+
+#print join("\t", @params), "\n";
+#print join("\t", @p{@params}), "\n";
+
+print map sprintf("%-9s: %d\n", $_, $p{$_}), @params;
+
+writeSassFile($fileName, $p{loopSize}, $p{loops});
+
+#print `maxas.pl -p $fileName`;
+#exit;
+
+print `maxas.pl -i $fileName microbench.cubin`;
+# A non-zero $? means the assembler step failed: stop with a failing exit status.
+exit(($? >> 8) || 1) if $?;
+
+my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`;
+
+my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
+
+print $data;
+
+#printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
+
+
+
+
+sub writeSassFile
+{
+    my ($filename, $loopSize, $loops) = @_;
+
+    open my $fh, ">$filename" or die "$filename: $!";
+
+    printf $fh <<'END_SASS', $loops;
+# Kernel: microbench
+
+<REGISTER_MAPPING>
+
+     1, 9, 2,10,17,25,18,26 : cy0x<0-7>
+     5,13, 6,14,21,29,22,30 : cy1x<0-7>
+     3,11, 0, 8,19,27,16,24 : cy2x<0-7>
+     7,15, 4,12,23,31,20,28 : cy3x<0-7>
+    35,43,32,40,51,59,48,56 : cy4x<0-7>
+    39,47,36,44,55,63,52,60 : cy5x<0-7>
+    33,41,34,42,49,57,50,58 : cy6x<0-7>
+    37,45,38,46,53,61,54,62 : cy7x<0-7>
+
+    64-71   : j0Ax<0-3>, j0By<0-3>
+    72-79   : j1Ax<0-3>, j1By<0-3>
+
+    0-79 : r<0-79>
+
+    100-101 : count, stop
+
+    //102-112 ~ readAs, readBs, writeS
+
+</REGISTER_MAPPING>
+
+--:-:-:-:1      MOV count, RZ;
+--:-:-:-:1      MOV32I stop, %d;
+//--:-:-:-:1      MOV writeS, RZ;
+//--:-:-:-:1      MOV readAs, RZ;
+//--:-:-:-:1      MOV readBs, RZ;
+
+<CODE>
+    return join '', map "--:-:-:-:1      MOV r$_, RZ;\n", 0..63;
+</CODE>
+
+<CODE>
+    return join '', map "--:-:-:-:1      MOV32I r$_, 0x00010001;\n", 64..79;
+</CODE>
+
+LOOP:
+
+--:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
+--:-:-:-:1      IADD count, count, 1;
+
+<CODE>
+    my $out;
+
+    my @swirl1 = ([0,0],[0,4],[4,4],[4,0]);
+    my @swirl2 = ([0,0],[1,0],[1,1],[0,1]);
+    my @swirl3 = ([0,2],[2,2],[2,0],[0,0]);
+
+    my @cOrder;
+    foreach my $s1 (@swirl1)
+    {
+        foreach my $s2 (@swirl2)
+        {
+            foreach my $s3 (@swirl3)
+            {
+                push @cOrder, [$s1->[0] + $s2->[0] + $s3->[0], $s1->[1] + $s2->[1] + $s3->[1]];
+            }
+        }
+    }
+
+    foreach my $j (0..7)
+    {
+        my $odd  = $j & 1;
+        my $nOdd = !$odd + 0;
+
+        my %%insert;
+
+        #$insert{c62} = "01:-:-:-:5      BAR.SYNC 0;\n" if $j == 6;
+
+        $insert{c62} =
+                "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
+                "--:-:-:-:1      LOP.XOR writeS, writeS, 0;\n" if $j == 8;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            my $ins    = $insert{"c$c"} || '';
+            my $stall  = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
+            my $yield  = $c == 32 ? 'Y' : '-';
+            my $wait   = '--'; #$c ? '--' : '01';
+
+            my $xReg  = $x >> 1;
+            my $yReg  = $y >> 1;
+            my $xPart = $x & 1 ? '.H1' : '';
+            my $yPart = $y & 1 ? '.H1' : '';
+
+            $out .= sprintf "$wait:-:-:$yield:$stall      XMAD cy%%dx%%d, j%%dAx%%d%%s, j%%dBy%%d%%s, cy%%dx%%d;\n%%s", $y,$x,  $odd,$xReg,$xPart,  $odd,$yReg,$yPart,  $y,$x,  $ins;
+        }
+    }
+    return $out;
+</CODE>
+
+--:-:-:Y:5  @P0 BRA LOOP;
+--:-:-:-:5      EXIT;
+END_SASS
+
+    close $fh;
+}
+
+__END__
+
+        my %%insert = (
+            c0 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n",
+            c2 => "--:-:-:-:1      LDS.U.128 j${nOdd}By00, [readBs+0x10];\n",
+            c4 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n",
+            c6 => "--:-:1:-:1      LDS.U.128 j${nOdd}By64, [readBs+0x10];\n",
+        );
\ No newline at end of file
diff --git a/Assembler/PascalAs/microbench/xmad.pl b/Assembler/PascalAs/microbench/xmad.pl
new file mode 100755
index 0000000..6aadb89
--- /dev/null
+++ b/Assembler/PascalAs/microbench/xmad.pl
@@ -0,0 +1,12 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+print `maxas.pl -i xmad2.sass microbench.cubin`;
+# Skip the benchmark and exit with a failing status if the assembler step failed.
+exit(($? >> 8) || 1) if $?;
+
+print `./microbench i 1 128`;
+
+
+__END__
+
diff --git a/Assembler/PascalAs/microbench/xmad2.sass b/Assembler/PascalAs/microbench/xmad2.sass
new file mode 100644
index 0000000..f0ce936
--- /dev/null
+++ b/Assembler/PascalAs/microbench/xmad2.sass
@@ -0,0 +1,144 @@
+# Kernel: microbench
+# InsCnt: 18
+# RegCnt: 5
+# SharedSize: 4096
+# BarCnt: 1
+# Params(3):
+#	ord:addr:size:align
+#	0:0x140:8:0
+#	1:0x148:8:0
+#	2:0x150:8:0
+#
+# Instructions:
+
+<CONSTANT_MAPPING>
+    blockDimX : c[0x0][0x8]
+    blockDimY : c[0x0][0xc]
+    blockDimZ : c[0x0][0x10]
+    gridDimX : c[0x0][0x14]
+    gridDimY : c[0x0][0x18]
+    gridDimZ : c[0x0][0x1c]
+
+    param_out[0] : c[0x0][0x140]
+    param_out[1] : c[0x0][0x144]
+    param_clocks[0] : c[0x0][0x148]
+    param_clocks[1] : c[0x0][0x14c]
+    param_in[0] : c[0x0][0x150]
+    param_in[1] : c[0x0][0x154]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+	0-1 : out<0-1>
+	2-3 : clocks<0-1>
+    4-15  : result, result2, tid, bid, blockDim, clock1, clock2, scale, s
+    16-24 : a, b, c, x
+
+</REGISTER_MAPPING>
+
+// Load in our params
+--:-:-:-:1      MOV out0,      param_out[0];
+--:-:-:-:1      MOV out1,      param_out[1];
+--:-:-:-:1      MOV clocks0,   param_clocks[0];
+--:-:-:-:1      MOV clocks1,   param_clocks[1];
+//--:-:-:-:1      MOV in,       c[0x0][0x148];
+--:-:-:-:1      MOV blockDim, blockDimX;
+
+--:-:-:-:1      PSETP.AND.AND P0, PT, !PT, PT, PT;
+
+--:-:-:-:6      MOV32I result,  0xffffffff;
+--:-:-:-:6      MOV32I result2, 0x0;
+--:-:-:-:1      MOV32I a, 1;
+--:-:-:-:1      MOV32I b, 1;
+--:-:-:-:6      MOV32I c, 0x0;
+
+// (127 - scale) << 23
+//--:-:-:-:6      MOV32I scale, 28;
+//--:-:-:-:6      IADD scale, -scale, 127;
+//--:-:-:-:6      SHL  scale, scale, 23;
+
+
+//--:-:-:-:6      MOV32I c, 0x4f765432;
+
+//--:-:1:-:2      LDG.CI.128 a, [in];
+
+//01:-:-:-:6      VMAD.S16.S16 result, a, b, c;
+
+//--:-:-:-:6      MOV result, a;
+
+// a >> 16 | (b & 0xffff0000)
+
+//--:-:-:-:6      SHR.U32 result, a, 16;
+//--:-:-:-:6      LOP3.LUT result, result, b, c, 0xf8;
+
+//--:-:-:-:6      I2I.S32.S16 result, a.H1;
+
+//--:-:-:Y:d      IADD result.CC, a, -c;
+//--:-:-:Y:2      IADD.X result2, b, -RZ;
+
+//--:-:-:-:6      SHR result, a, 1;
+
+//--:-:-:-:6      BFI result, b, 0x1010, a;
+
+--:-:-:-:1      CS2R clock1, SR_CLOCKLO;
+
+//--:-:-:-:6      XMAD.S16.S16 c, a, b, RZ;
+//--:-:-:-:6      ISET.LT.AND s, c, RZ, PT;
+//--:-:-:-:6      IADD result.CC, c, result;
+//--:-:-:-:6      IADD.X result2, s, result2;
+
+//--:-:-:-:6      XMAD.S16.S16 result.CC, a, b, result;
+//--:-:-:-:6      IADD.X result2, result2, RZ;
+
+//--:-:-:-:6      SHF.R.S64 result, result, 1, result2;
+//--:-:-:-:6      MOV32I result2, 0;
+
+--:-:-:-:f      LOP.AND.NZ P0, RZ, result, 1;
+
+--:-:-:-:6  @P0 VADD.S16.S16.SAT.MRG_16H result, a, b, result;
+
+//--:-:1:-:d      I2F.F32.S32 result2, a;
+//01:-:-:-:6      FMUL result2, result2, scale;
+//01:-:2:-:d      F2I.S32.F32 result, result2;
+
+02:-:-:-:6      CS2R clock2, SR_CLOCKLO;
+
+//F2I   = "^$pred?F2I$ftz$x2x$round $r0, $cr20;"
+//I2F   = "^$pred?I2F$x2x$rnd $r0, $cr20;"
+//x2x   = "\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)"
+//rnd   = "(?:\.(?<rnd>RN|RM|RP|RZ))?"
+//round = "(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?"
+//r8    = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B1|B2|B3))?(?<reuse1>\.reuse)?"
+//r20   = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B1|B2|B3))?(?<reuse2>\.reuse)?"
+
+
+//--:-:-:-:1      XMAD.MRG x, a, b.H1, RZ;
+//--:-:-:-:6      XMAD result, a.H1, b.H1, c;
+//--:-:-:-:1      XMAD.PSL.CBCC result, a.H1, x.H1, result;
+
+// Get the first clock value
+
+--:-:1:-:1      S2R tid, SR_TID.X;
+--:-:2:-:2      S2R bid, SR_CTAID.X;
+
+
+
+// Take the difference of clocks
+--:-:-:-:1      IADD clock1, clock2, -clock1;
+
+// Setup our output addresses
+// Stall your pipeline dependencies properly
+03:-:-:-:1      XMAD tid, blockDim, bid, tid;
+--:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
+--:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
+--:-:-:Y:6      SHL  tid, tid, 0x2;
+
+--:-:-:-:1      IADD clocks, clocks, tid;
+--:-:-:-:1      IADD out,  out,  tid;
+
+// Output the results.
+// No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values
+--:-:-:-:1      STG.E [clocks], result2;
+--:-:-:-:1      STG.E [out],    result;
+--:-:-:-:5      EXIT;
+
diff --git a/Assembler/PascalAs/pm_to_blib b/Assembler/PascalAs/pm_to_blib
new file mode 100644
index 0000000..e69de29
diff --git a/Assembler/PascalAs/sgemm/batched_gemm.xlsx b/Assembler/PascalAs/sgemm/batched_gemm.xlsx
new file mode 100644
index 0000000..c88f0a7
Binary files /dev/null and b/Assembler/PascalAs/sgemm/batched_gemm.xlsx differ
diff --git a/Assembler/PascalAs/sgemm/cublas_sgemm.ptx b/Assembler/PascalAs/sgemm/cublas_sgemm.ptx
new file mode 100644
index 0000000..8edec86
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/cublas_sgemm.ptx
@@ -0,0 +1,65 @@
+.version 4.1
+.target sm_50
+.address_size 64
+
+// ptxas -v -arch=sm_50 -m 32 --opt-level 4 -o cublas_sgemm.cubin cublas_sgemm.ptx
+
+// You can use maxas to insert cublas_device.lib code into a cubin built from this ptx:
+
+// From C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32\cublas_device.lib
+
+// cuobjdump -lelf cublas_device.lib | find "sm_50"
+
+// cuobjdump -xelf maxwell_sgemm.asm.sm_50.cubin cublas_device.lib
+
+// maxas -l maxwell_sgemm.asm.sm_50.cubin
+
+// maxas -e -k maxwell_sgemm_128x128_nt maxwell_sgemm_128x128_nt.sass
+// maxas -e -k maxwell_sgemm_128x64_nt  maxwell_sgemm_128x64_nt.sass
+
+// maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin
+// maxas -i maxwell_sgemm_128x64_nt.sass  cublas_sgemm.cubin
+
+// The sgemm.cpp code makes use of this cubin to benchmark the kernels outside of cublas.
+
+.visible .entry maxwell_sgemm_128x128_nt(
+	.param .u64 .ptr.global.align 8 param_A,
+	.param .u64 .ptr.global.align 8 param_B,
+	.param .u64 .ptr.global.align 8 param_C,
+	.param .s32 param_lda,
+	.param .s32 param_ldb,
+	.param .s32 param_ldc,
+	.param .s32 param_k,
+	.param .u64 .ptr.global.align 8 param_Alpha,
+	.param .u64 .ptr.global.align 8 param_Beta,
+	.param .s32 param_alpha,
+	.param .s32 param_beta,
+	.param .s32 param_flag
+)
+.reqntid 256
+{
+	.shared .align 16 .b8 share[16384];
+
+	ret;
+}
+
+.visible .entry maxwell_sgemm_128x64_nt(
+	.param .u64 .ptr.global.align 8 param_A,
+	.param .u64 .ptr.global.align 8 param_B,
+	.param .u64 .ptr.global.align 8 param_C,
+	.param .s32 param_lda,
+	.param .s32 param_ldb,
+	.param .s32 param_ldc,
+	.param .s32 param_k,
+	.param .u64 .ptr.global.align 8 param_Alpha,
+	.param .u64 .ptr.global.align 8 param_Beta,
+	.param .s32 param_alpha,
+	.param .s32 param_beta,
+	.param .s32 param_flag
+)
+.reqntid 128
+{
+	.shared .align 16 .b8 share[12288];
+
+	ret;
+}
diff --git a/Assembler/PascalAs/sgemm/new.cubin b/Assembler/PascalAs/sgemm/new.cubin
new file mode 100644
index 0000000..6a1572b
Binary files /dev/null and b/Assembler/PascalAs/sgemm/new.cubin differ
diff --git a/Assembler/PascalAs/sgemm/sgemm.cpp b/Assembler/PascalAs/sgemm/sgemm.cpp
new file mode 100644
index 0000000..f2127d8
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm.cpp
@@ -0,0 +1,480 @@
+// sgemm.cpp : Defines the entry point for the console application.
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <cuda.h>
+#include <cublas_v2.h>
+
+CUcontext      hContext = 0;
+cublasHandle_t hCublas  = 0;
+
+float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat = 1, int printVars = 0);
+float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat);
+void gflops(const char* ident, int N, float ms, int repeat);
+void test(float* C, float* T, int N, size_t size);
+
+#define REPEAT_BLOCK 2000
+
+#define CUDA_CHECK( fn ) do { \
+		CUresult status = (fn); \
+		if ( CUDA_SUCCESS != status ) { \
+			const char* errstr; \
+			cuGetErrorString(status, &errstr); \
+			printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \
+			if (hCublas)  cublasDestroy(hCublas); \
+			if (hContext) cuCtxDestroy(hContext); \
+			exit(EXIT_FAILURE); \
+		} \
+	} while (0)
+
+#define CUBLAS_CHECK( fn ) do { \
+		cublasStatus_t status = (fn); \
+		if ( CUBLAS_STATUS_SUCCESS != status ) { \
+			printf("Cublas Failure (line %d of file %s):\n\t%s returned %d\n", __LINE__, __FILE__, #fn, status); \
+			if (hCublas)  cublasDestroy(hCublas); \
+			if (hContext) cuCtxDestroy(hContext); \
+			exit(EXIT_FAILURE); \
+		} \
+	} while (0)
+
+int main(int argc, char* argv[])
+{
+	char deviceName[32];
+	int count, ordinal, major, minor;
+	CUdevice  hDevice;
+	CUevent hStart, hStop;
+	CUdeviceptr devA, devB, devC, devT, otherDevA, otherDevB;
+
+	// Initialize the Driver API and pick the first device of compute capability 5.2 or newer
+	CUDA_CHECK( cuInit(0) );
+	CUDA_CHECK( cuDeviceGetCount(&count) );
+	for (ordinal = 0; ordinal < count; ordinal++)
+	{
+		CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) );
+		CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) );
+		CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) );
+		CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) );
+		if (major > 5 || (major == 5 && minor >= 2)) // compare (major, minor) as a pair so 6.0/6.1 qualify
+		{
+			//printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor);
+			break;
+		}
+	}
+	if (ordinal == count)
+	{
+		printf("No compute 5.2 or newer device found, exiting.\n");
+		exit(EXIT_FAILURE);
+	}
+
+	// First command line arg is the size of N divided by 128
+	int thread128 = 64;
+	if (argc > 1)
+		thread128 = atoi(argv[1]);
+	if (thread128 > 64 || thread128 < 1)
+		thread128 = 64;
+
+	// Second command line arg is the repeat count for benchmarking
+	int repeat = 1;
+	if (argc > 2)
+		repeat = atoi(argv[2]);
+	if (repeat > 10000 || repeat < 1)
+		repeat = 1;
+
+	// Third command line arg is the normalized float size
+	CUarray_format format = CU_AD_FORMAT_FLOAT;
+	if (argc > 3)
+		format = (CUarray_format)atoi(argv[3]);
+	if (format != CU_AD_FORMAT_FLOAT && format != CU_AD_FORMAT_UNSIGNED_INT16 && format != CU_AD_FORMAT_UNSIGNED_INT8)
+		format = CU_AD_FORMAT_FLOAT;
+
+	// Fourth command line arg is for printf debugging
+	int printVars = 0;
+	if (argc > 4)
+		printVars = atoi(argv[4]);
+	if (printVars > 100 || printVars < 1)
+		printVars = 0;
+
+	int N = thread128 * 128;
+	float alpha = 1, beta = 0, ms = 1;
+	size_t sizeOther = N * N;
+	size_t sizeFloat = sizeOther * 4;
+
+	float* A = (float*)malloc(sizeFloat);
+	float* B = (float*)malloc(sizeFloat);
+	float* C = (float*)malloc(sizeFloat);
+	float* T = (float*)malloc(sizeFloat);
+	float *otherA, *otherB;
+	if (!A || !B || !C || !T) { printf("Host allocation failed, exiting.\n"); exit(EXIT_FAILURE); }
+	//int counter = 0;
+	//srand((unsigned int)time(0));
+	for(int i = 0; i < N * N; i++) //
+	{
+		//A[i] = (float)rand() / (float)RAND_MAX;
+		//B[i] = (float)rand() / (float)RAND_MAX;
+		A[i] = B[i] = 1.0f; // * (i & 3) + 1.0f;
+		//A[i] = 1.0f;
+		//B[i * N + counter++] = 1.0f; // identity matrix
+	}
+
+	if (format == CU_AD_FORMAT_FLOAT)
+	{
+		sizeOther *= 4;
+		otherA = A;
+		otherB = B;
+	}
+	else if (format == CU_AD_FORMAT_UNSIGNED_INT16)
+	{
+		sizeOther *= 2;
+		unsigned short* othera = (unsigned short*)malloc(sizeOther);
+		unsigned short* otherb = (unsigned short*)malloc(sizeOther);
+		for(int i = 0; i < N * N; i++)
+			othera[i] = otherb[i] = 65535;
+
+		otherA = reinterpret_cast<float*>(othera);
+		otherB = reinterpret_cast<float*>(otherb);
+	}
+	else // (format == CU_AD_FORMAT_UNSIGNED_INT8)
+	{
+		otherA = (float*)malloc(sizeOther);
+		otherB = (float*)malloc(sizeOther);
+		memset(otherA, 255, sizeOther);
+		memset(otherB, 255, sizeOther);
+	}
+
+	CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
+	//CUBLAS_CHECK( cublasCreate(&hCublas) );
+
+	CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); // CU_EVENT_DEFAULT
+	CUDA_CHECK( cuEventCreate(&hStop,  CU_EVENT_BLOCKING_SYNC) );
+
+	CUDA_CHECK( cuMemAlloc(&devA, sizeFloat) );
+	CUDA_CHECK( cuMemAlloc(&devB, sizeFloat) );
+	CUDA_CHECK( cuMemAlloc(&devC, sizeFloat) );
+	CUDA_CHECK( cuMemAlloc(&devT, sizeFloat) );
+
+	CUDA_CHECK( cuMemcpyHtoD(devA, A, sizeFloat) );
+	CUDA_CHECK( cuMemcpyHtoD(devB, B, sizeFloat) );
+	CUDA_CHECK( cuMemsetD8(devC, 0, sizeFloat) );
+	CUDA_CHECK( cuMemsetD8(devT, 0, sizeFloat) );
+
+	if (format == CU_AD_FORMAT_FLOAT)
+	{
+		otherDevA = devA;
+		otherDevB = devB;
+	}
+	else
+	{
+		CUDA_CHECK( cuMemAlloc(&otherDevA, sizeOther) );
+		CUDA_CHECK( cuMemAlloc(&otherDevB, sizeOther) );
+		CUDA_CHECK( cuMemcpyHtoD(otherDevA, otherA, sizeOther) );
+		CUDA_CHECK( cuMemcpyHtoD(otherDevB, otherB, sizeOther) );
+	}
+
+	// Warm up the clock (unless under nsight)
+	//if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER 
+	//	for (int i = 0; i < 3; i++)
+	//		CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast<float*>(devA), N, reinterpret_cast<float*>(devB), N, &beta, reinterpret_cast<float*>(devT), N) );
+
+	// Launch our kernel
+	ms = assemblySgemm("sgemm_kernel_64", format, sizeOther, devC, otherDevA, otherDevB, N, hStart, hStop, repeat, printVars);
+	gflops("Max64 ", N, ms, repeat);
+
+	ms = assemblySgemm("sgemm_kernel_128", format, sizeOther, devC, otherDevA, otherDevB, N, hStart, hStop, repeat, printVars);
+	gflops("Max128", N, ms, repeat);
+
+	//ms = cublasSgemm("maxwell_sgemm_128x64_nt", devT, devA, devB, N, hStart, hStop, repeat);
+	//gflops("Cub64 ", N, ms, repeat);
+
+	//ms = cublasSgemm("maxwell_sgemm_128x128_nt", devT, devA, devB, N, hStart, hStop, repeat);
+	//gflops("Cub128", N, ms, repeat);
+
+	// Run cublas again for the same repeat count for comparison
+	//CUDA_CHECK( cuEventRecord(hStart, NULL) );
+	//for (int i = 0; i < repeat; i++)
+	//	CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast<float*>(devA), N, reinterpret_cast<float*>(devB), N, &beta, reinterpret_cast<float*>(devT), N) );
+	//CUDA_CHECK( cuEventRecord(hStop, NULL) );
+	//CUDA_CHECK( cuEventSynchronize(hStop) );
+	//CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) );
+	//gflops("Cublas", N, ms, repeat);
+
+	// Get back our results from each kernel
+	CUDA_CHECK( cuMemcpyDtoH(C, devC, sizeFloat) );
+	CUDA_CHECK( cuMemcpyDtoH(T, devT, sizeFloat) );
+
+	// Cleanup and shutdown of cuda
+	CUDA_CHECK( cuMemFree(devA) );
+	CUDA_CHECK( cuMemFree(devB) );
+	CUDA_CHECK( cuMemFree(devC) );
+	CUDA_CHECK( cuMemFree(devT) );
+	if (format != CU_AD_FORMAT_FLOAT)
+	{
+		CUDA_CHECK( cuMemFree(otherDevA) );
+		CUDA_CHECK( cuMemFree(otherDevB) );
+	}
+
+	CUDA_CHECK( cuEventDestroy(hStart) );
+	CUDA_CHECK( cuEventDestroy(hStop) );
+
+	//CUBLAS_CHECK( cublasDestroy(hCublas) );
+	//hCublas  = 0;
+	CUDA_CHECK( cuCtxDestroy(hContext) );
+	hContext = 0;
+
+	// compare C and T for accuracy
+	test(C, T, N, sizeFloat);
+
+	// And free up host memory
+	free(A); free(B); free(C); free(T);
+
+	if (format != CU_AD_FORMAT_FLOAT)
+	{
+		free(otherA);
+		free(otherB);
+	}
+
+	return 0;
+}
+
+// Our kernel wrapper function.
+// Loads sgemm.cubin, binds devA/devB (size bytes each) to the texA/texB texture references, launches
+// the named kernel repeat times and returns the summed elapsed milliseconds from hStart/hStop.
+float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat, int printVars)
+{
+	// Configure our x and y grid dimensions (assume nice square matrixes).
+	// sgemm_kernel_64 is launched with 64 threads per block on a ceil(N/64) x ceil(N/64) grid.
+	// Any other kernel name is launched with 256 threads per block on a ceil(N/128) x ceil(N/128) grid.
+	// See Figure 2 here to get the gist of things (we use a different mapping to maximize LDS.128 usage):
+	// http://icl.cs.utk.edu/projectsfiles/magma/pubs/fermi_gemm.pdf
+
+	int threads, width;
+	if (strcmp(kernel, "sgemm_kernel_64") == 0)
+	{
+		threads = 64;
+		width   = 64;
+	}
+	else
+	{
+		threads = 256;
+		width   = 128;
+	}
+
+	int gridDimXY = N / width + (N % width != 0); // N / width rounded up
+	int blocks    = gridDimXY * gridDimXY;
+
+	// Setup our debug printf output buffer: printVars ints for every launched thread
+	CUdeviceptr devD = NULL;
+	int*   D     = NULL;
+	size_t sizeD = 0;
+
+	if (printVars)
+	{
+		sizeD = (size_t)blocks * threads * printVars * sizeof(int); // widen first: no int overflow
+		D = (int*)malloc(sizeD);
+		if (D == NULL) { printf("Cannot allocate the debug output buffer\n"); exit(EXIT_FAILURE); }
+
+		CUDA_CHECK( cuMemAlloc(&devD, sizeD) );
+		CUDA_CHECK( cuMemsetD8(devD, 0, sizeD) );
+	}
+
+	// Load the cubin
+	CUmodule hModule;
+	CUDA_CHECK( cuModuleLoad(&hModule, "sgemm.cubin") );
+
+	// Load the textures
+	CUtexref texA, texB;
+	CUDA_CHECK( cuModuleGetTexRef(&texA, hModule, "texA") );
+	CUDA_CHECK( cuModuleGetTexRef(&texB, hModule, "texB") );
+
+	// Configure the textures
+	CUDA_CHECK( cuTexRefSetFormat(texA, format, 4) );
+	CUDA_CHECK( cuTexRefSetFormat(texB, format, 4) );
+
+	CUDA_CHECK( cuTexRefSetAddress(NULL, texA, devA, size) );
+	CUDA_CHECK( cuTexRefSetAddress(NULL, texB, devB, size) );
+
+	// Load the kernel function
+	CUfunction hKernel;
+	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) );
+
+	// Setup the params (N is passed for each of m, n, k, lda, ldb and ldc)
+	float alpha = 1.0f;
+	void* params[] = { &devC, &N, &N, &N, &N, &N, &N, &alpha, &devD };
+
+	float totalTime = 0;
+	// Launch the kernel repeat times.. but break it up into pieces so as not to lock things up.
+	while (repeat > 0)
+	{
+		float ms;
+		int r = repeat > REPEAT_BLOCK ? REPEAT_BLOCK : repeat;
+		CUDA_CHECK( cuEventRecord( hStart, NULL ) );
+
+		for (int i = 0; i < r; i++)
+			CUDA_CHECK( cuLaunchKernel(hKernel, gridDimXY, gridDimXY, 1, threads, 1, 1, 0, 0, params, 0) );
+
+		CUDA_CHECK( cuEventRecord( hStop, NULL ) );
+		CUDA_CHECK( cuEventSynchronize( hStop ) );
+		CUDA_CHECK( cuEventElapsedTime( &ms, hStart, hStop ) );
+		totalTime += ms;
+		repeat -= r;
+	}
+
+	CUDA_CHECK( cuModuleUnload(hModule) );
+
+	// And here we print out the debug info if requested:
+	if (printVars)
+	{
+		CUDA_CHECK( cuMemcpyDtoH(D, devD, sizeD) );
+		CUDA_CHECK( cuMemFree(devD) );
+		int   *iD = D; // only read by the commented-out per-thread printf below
+		unsigned int *uD = reinterpret_cast<unsigned int*>(D);
+
+		for (int by = 0; by < gridDimXY; by++)
+		{
+			for (int bx = 0; bx < gridDimXY; bx++)
+			{
+				unsigned int clock = 0xffffffff, sm = 0;
+
+				for (int tid = 0; tid < threads; tid++)
+				{
+					//printf("by: %3d, bx: %3d, tid:%3d, rA:%5d, rB:%5d, wr:%5d, rd:%5d, cx:%5d, cy:%5d, ci:%5d, c:%.2f\n", 
+					//printf("by: %3d, bx: %3d, tid:%3d, t0:%5d, end:%5d, k:%5d, tid2:%5d, tid15:%5d, ldx:%5d, t2:%5d, t4:%5d\n", 
+					//	    by,      bx,      tid,     iD[0],  iD[1],   iD[2], iD[3],    iD[4],     iD[5],   iD[6],  iD[7]
+					//);
+					if (uD[1] < clock) clock = uD[1]; // keep the smallest second word of the block
+					sm = uD[0];                       // first word of the last thread visited
+
+					iD += printVars;
+					uD += printVars;
+				}
+				printf("%02u %08u %d %d\n", sm, clock, by, bx);
+			}
+		}
+		free(D);
+	}
+
+	return totalTime;
+}
+
+// Pair of CUdeviceptr values that cublasSgemm passes by address as a single kernel parameter.
+struct dPointer
+{
+	CUdeviceptr lo, hi; // cublasSgemm stores the device address in lo and 0 in hi
+};
+
+// Loads cublas_sgemm.cubin, launches the named kernel repeat times and returns the summed elapsed milliseconds.
+// "maxwell_sgemm_128x64_nt" uses 128 threads on a ceil(N/128) x ceil(N/64) grid; any other name uses 256 threads on a ceil(N/128) square grid.
+float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat)
+{
+	int threads, gridX, gridY;
+	if (strcmp(kernel, "maxwell_sgemm_128x64_nt") == 0)
+	{
+		threads = 128;
+		gridX = N / 128 + (N % 128 != 0);
+		gridY = N / 64  + (N % 64  != 0);
+	}
+	else
+	{
+		threads = 256;
+		gridX = gridY = N / 128 + (N % 128 != 0);
+	}
+
+	// Load the cubin
+	// See cublas_sgemm.ptx for info on how to build this.
+	CUmodule hModule;
+	CUDA_CHECK( cuModuleLoad(&hModule, "cublas_sgemm.cubin") );
+
+	// Load the kernel function
+	CUfunction hKernel;
+	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) );
+
+	// Setup the params
+	// I should probably be working in 64 bits...
+	dPointer dA = { devA, 0 };
+	dPointer dB = { devB, 0 };
+	dPointer dC = { devC, 0 };
+
+	int   flag  = 0;
+	float alpha = 1.0;
+	float beta  = 0.0;
+	
+	void* params[] = { &dA, &dB, &dC, &N, &N, &N, &N, &dA, &dA, &alpha, &beta, &flag };
+
+	float totalTime = 0;
+	// Launch the kernel repeat times.. but break it up into pieces so as not to lock things up.
+	while (repeat > 0)
+	{
+		float ms;
+		int r = repeat > REPEAT_BLOCK ? REPEAT_BLOCK : repeat;
+		CUDA_CHECK( cuEventRecord( hStart, NULL ) );
+		
+		for (int i = 0; i < r; i++)
+			CUDA_CHECK( cuLaunchKernel(hKernel, gridX, gridY, 1, threads, 1, 1, 0, 0, params, 0) );
+		
+		CUDA_CHECK( cuEventRecord( hStop, NULL ) );
+		CUDA_CHECK( cuEventSynchronize( hStop ) );
+		CUDA_CHECK( cuEventElapsedTime( &ms, hStart, hStop ) );
+		totalTime += ms;
+		repeat -= r;
+	}
+
+	CUDA_CHECK( cuModuleUnload(hModule) );
+
+	return totalTime;
+}
+
+void gflops(const char* ident, int N, float ms, int repeat)
+{
+	const float  msPerRun = ms / repeat;                     // ms covers all repeat launches
+	const double flops    = (double)N * N * N * 2.0 + N * N; // Standard sgemm flops formula
+	printf("%s GFLOPS: %.2f (size: %d, iterations: %d)\n", ident, flops / (msPerRun * 1000000.0), N, repeat);
+}
+
+// Compare our implementation (C) with the reference result (T).
+// C and T each hold N x N floats in size bytes.  Prints "<count> errors", where count is
+// the number of elements that differ.  When they differ and N <= 512 a per-element map is
+// also written to data.txt ("=" for a match, "c!t" for a mismatch).
+void test(float* C, float* T, int N, size_t size)
+{
+	int errors = 0;
+
+	// Cheap byte-wise check first; only walk the matrices when something differs
+	if (memcmp(C, T, size) == 0)
+		{ printf("%d errors\n", errors); return; }
+
+	if (N > 512) // The data.txt dump gets too big and slow for large N, so only count
+	{
+		for (size_t i = 0; i < (size_t)N * N; ++i)
+			if (C[i] != T[i]) errors++;
+
+		printf("%d errors\n", errors);
+		return;
+	}
+
+	FILE* file;
+	if (fopen_s(&file, "data.txt", "w") != 0)
+		{ printf("Cannot open data.txt for writing\n"); return; }
+
+	for (int y = 0; y < N; ++y)
+	{
+		for (int x = 0; x < N; ++x)
+		{
+			float c = C[x*N + y];
+			float t = T[x*N + y];
+			if (c != t)
+			{
+				errors++;
+				fprintf(file, "%.8f!%.8f\t", c , t);
+			}
+			else
+			{
+				fprintf(file, "=");
+			}
+		}
+		fprintf(file, "\n");
+	}
+	fclose(file);
+	printf("%d errors\n", errors);
+}
\ No newline at end of file
diff --git a/Assembler/PascalAs/sgemm/sgemm.cu b/Assembler/PascalAs/sgemm/sgemm.cu
new file mode 100644
index 0000000..ce8b2a6
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm.cu
@@ -0,0 +1,105 @@
+
+// Note this file isn't configured to automatically compile.
+// Here's how:
+
+// If you want to look at the ptx first:
+// nvcc -arch sm_50 -m 32 -ptx sgemm.cu
+
+// Manually compile your kernel to a cubin.
+// You should only have to do this once, unless you change params or shared size or globals:
+// nvcc -arch sm_50 -m 32 -cubin sgemm.cu
+
+// If tweaking a kernel or writing a new one based on this shell code you would then do this:
+// maxas.pl -e kernel.cubin kernel.sass
+
+// I've already included a modified kernel (sgemm.sass) so the next step is..
+
+// Splice the manually assembled code back into the cubin:
+// maxas.pl -i sgemm.sass sgemm.cubin
+
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+#include <cuda_texture_types.h>
+#include <texture_fetch_functions.h>
+
+typedef texture<float4, cudaTextureType1D, cudaReadModeElementType> floatTex; // 1D texture read as float4 elements
+// Module-level texture references; the host code looks them up by name ("texA"/"texB") with cuModuleGetTexRef.
+floatTex  texA(0, cudaFilterModePoint, cudaAddressModeBorder);
+floatTex  texB(0, cudaFilterModePoint, cudaAddressModeBorder);
+
+// Shell kernel: the hand-written SASS is spliced over this body (see the notes at the top of this file). Use extern C so C++ doesn't mangle our kernel name
+extern "C"
+// This kernel requires 256x1x1 threads per block
+__global__ void __launch_bounds__(256) sgemm_kernel_128(
+	float *C, // written below so the setup isn't optimized away
+	const int m,   const int n,   const int k,
+	const int lda, const int ldb, const int ldc,
+	float alpha, int *D) // D is not used by this shell body
+{
+	// Declare any shared memory your kernel requires
+	// Or you could just pass the amount in as a param to cuLaunchKernel
+	__shared__ float4 share[1024]; // 1024 float4 elements
+
+	int tid = threadIdx.x;
+
+	// If you use indirect texture references, they will be passed as params at the end of the param list
+	// So set that up here to make sure they're available in your kernel
+	floatTex tex = tid > 127 ? texB : texA; // threads 128 and up use texB, the rest texA
+
+	// Make use of shared and your textures so it doesn't get optimized away
+	share[tid] = tex1Dfetch(tex, tid);
+
+	__syncthreads();
+
+	// output something so your setup isn't optimized away.
+	C[tid] = share[255-tid].x;
+}
+
+extern "C" // shell for the 64 thread kernel; extern C keeps the kernel name unmangled
+__global__ void __launch_bounds__(64) sgemm_kernel_64(
+	float *C,
+	const int m,   const int n,   const int k,
+	const int lda, const int ldb, const int ldc,
+	float alpha, int *D) // same parameter list as sgemm_kernel_128
+{
+	__shared__ float4 share[512]; // 512 float4 elements
+
+	int tid = threadIdx.x;
+	// Reference both textures so they are available in the kernel.
+	floatTex tex = tid > 127 ? texB : texA;
+	// NOTE(review): with 64 threads per block tid > 127 is never true, so only texA is fetched at run time.
+	share[tid] = tex1Dfetch(tex, tid);
+
+	__syncthreads();
+	// NOTE(review): with 64 threads share[255-tid] reads entries no thread wrote; presumably harmless because this body is replaced by the spliced-in SASS - confirm.
+	C[tid] = share[255-tid].x;
+}
+
+// A note about using the Cuda Runtime.
+// If that's your preference over the driver API then here's what you'd do:
+
+// In your project properties in the Cuda C/C++ panel:
+//    -Set the "Keep Processed Files" (-keep) option
+//    -Add a -v manually to the command line
+// If compiling on command line just add -keep -v options to nvcc.
+// Rebuild your solution and look in the log for these lines that follow the ptxas step:
+
+// #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
+// #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
+// #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
+
+// You just need to manually run these 3 commands (or add them to a build script)
+// after you've modified the cubin generated from the preceding ptxas command.
+// That will give you a new .cu.obj file which will automatically be linked in for you next time you
+// build your project (or you could manually run the linker step as well).
+
+// Having done that you can call your kernel normally using the <<< >>> syntax.
+// Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
+// With fatbin you can also keep non-maxwell optimized versions of your code.
+
+
+// I just discovered this also works as a shortcut to the above:
+// nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=sgemm.cubin -o sgemm.lib sgemm.cu
+
+// The cu kernel definitions above need to have empty bodies.
+// And, the cu file must be compiled to a lib separately before linking.
\ No newline at end of file
diff --git a/Assembler/PascalAs/sgemm/sgemm.cubin b/Assembler/PascalAs/sgemm/sgemm.cubin
new file mode 100644
index 0000000..946c7d7
Binary files /dev/null and b/Assembler/PascalAs/sgemm/sgemm.cubin differ
diff --git a/Assembler/PascalAs/sgemm/sgemm.pl b/Assembler/PascalAs/sgemm/sgemm.pl
new file mode 100644
index 0000000..9b1661b
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm.pl
@@ -0,0 +1,102 @@
+#!/usr/bin/perl
+# Re-assembles the sgemm kernels into sgemm.cubin when a SASS source is newer than its
+# preprocessed copy, then runs the Release\sgemm.exe benchmark and prints its output.
+use strict;
+use warnings;
+
+my $CU_AD_FORMAT_UNSIGNED_INT8  = 0x01;
+my $CU_AD_FORMAT_UNSIGNED_INT16 = 0x02;
+my $CU_AD_FORMAT_FLOAT          = 0x20;
+
+# Rebuild a kernel when its preprocessed file is missing or older than the source.
+foreach my $width (128, 64)
+{
+    my ($sass, $pre) = ("sgemm${width}.sass", "sgemm_pre_${width}.sass");
+    next if -f $pre && (stat $sass)[9] <= (stat $pre)[9];
+
+    # Preprocess, insert into the cubin, then extract the final kernel listing.
+    # Stop with a failing exit status as soon as one maxas.pl step fails.
+    print `maxas.pl -p $sass $pre`;
+    exit 1 if $?;
+    print `maxas.pl -i $sass sgemm.cubin`;
+    exit 1 if $?;
+    print `maxas.pl -e -k sgemm_kernel_${width} sgemm.cubin sgemm_final_${width}.sass`;
+}
+
+#print `Release\\sgemm.exe $_ 20` foreach (80,60,40,30,20,10,9,8,7,6,5,4,3,2);
+
+# The output of this FLOAT run is discarded (presumably a warm-up - confirm).
+`Release\\sgemm.exe 64 5 $CU_AD_FORMAT_FLOAT`;
+
+print `Release\\sgemm.exe 64 20 $CU_AD_FORMAT_UNSIGNED_INT8`;
+exit;
+
+# Everything below is unreachable while the exit above is in place: a sweep over sizes.
+my %data;
+foreach my $thread128 (4 .. 64)
+{
+    my $N = $thread128 * 128;
+
+    my $iterations = int(20 * (64 * 128)**3 / $N**3);
+    $iterations = 10000 if $iterations > 10000;
+
+    print "$N $iterations\n";
+
+    my $data = `Release\\sgemm.exe $thread128 $iterations $CU_AD_FORMAT_UNSIGNED_INT16`;
+
+    foreach my $bench (split "\n", $data)
+    {
+        if ($bench =~ /^(\w+)\s+GFLOPS: ([0-9.]+) /)
+        {
+            push @{$data{$N}}, $2;
+            print "$1 $2\n";
+        }
+    }
+}
+print join("\t", qw(size Max64 Max128 Cub64 Cub128)), "\n";
+
+foreach my $N (sort { $a <=> $b } keys %data)
+{
+    print join("\t", $N, @{$data{$N}}), "\n"; # lead with the size so rows match the header
+}
+
+__END__
+
+
+64 * 128 * 16 * 1.620 * .931 / 520
+
+Max64  GFLOPS: 1377.38 (size: 256, iterations: 2000)
+Max128 GFLOPS: 973.70 (size: 256, iterations: 2000)
+Cub64  GFLOPS: 1272.42 (size: 256, iterations: 2000)
+Cub128 GFLOPS: 948.15 (size: 256, iterations: 2000)
+
+my @data = grep /\S/, split "\n", $data;
+
+my $min;
+my %smData;
+my @sdata;
+foreach (@data)
+{
+    next if /GFLOPS/;
+
+    my ($sm, $clock, $by, $bx) = split /\s+/;
+
+    $smData{$sm} = $clock if !$smData{$sm} || $clock < $smData{$sm};
+
+    $min = $clock if !$min || $clock < $min;
+
+    push @sdata, [$sm, $clock, $by, $bx];
+}
+
+foreach (@sdata)
+{
+    $_->[1] -= $smData{$_->[0]};
+}
+
+foreach (sort {$a->[1] <=> $b->[1] || $a->[0] <=> $b->[0]} @sdata)
+{
+    printf "%02d %8u  by: %2d bx: %2d\n", @$_;
+
+}
+
+
diff --git a/Assembler/PascalAs/sgemm/sgemm.sln b/Assembler/PascalAs/sgemm/sgemm.sln
new file mode 100644
index 0000000..bcbee09
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm.sln
@@ -0,0 +1,20 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sgemm", "sgemm.vcxproj", "{D571379D-3653-43CB-BE83-A6C68D392A05}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.ActiveCfg = Debug|Win32
+		{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.Build.0 = Debug|Win32
+		{D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.ActiveCfg = Release|Win32
+		{D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/Assembler/PascalAs/sgemm/sgemm.vcxproj b/Assembler/PascalAs/sgemm/sgemm.vcxproj
new file mode 100644
index 0000000..6d28ced
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm.vcxproj
@@ -0,0 +1,92 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{D571379D-3653-43CB-BE83-A6C68D392A05}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>sgemm</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalLibraryDirectories>$(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>cuda.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <AdditionalLibraryDirectories>$(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>cuda.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="sgemm.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="sgemm128.sass" />
+    <None Include="sgemm64.sass" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/Assembler/PascalAs/sgemm/sgemm128.sass b/Assembler/PascalAs/sgemm/sgemm128.sass
new file mode 100644
index 0000000..038d2f3
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm128.sass
@@ -0,0 +1,613 @@
+# Kernel: sgemm_kernel_128
+#
+# SharedSize: 16384
+# Params(8):
+#   0:0x140:4:4 param_C,
+#   1:0x144:4:0 param_m,
+#   2:0x148:4:0 param_n,
+#   3:0x14c:4:0 param_k,
+#   4:0x150:4:0 param_lda,
+#   5:0x154:4:0 param_ldb,
+#   6:0x158:4:0 param_ldc
+#   7:0x15c:4:0 param_alpha
+#   8:0x160:4:4 param_D // for diagnostic printf output
+#
+# Globals:
+#   c[0x0][0x164]: texA (the value is 1)
+#   c[0x0][0x168]: texB (the value is 0)
+
+<REGISTER_MAPPING>
+
+    // Temporary registers to calculate the state registers. Reuse the C output registers.
+    // These can be dynamically allocated (~) in the available register space to eliminate any register bank conflicts.
+    0-63    ~ blk, ldx, ldx2, ldx4, k, tid1, tid4, tid7, tid31_4, xmad_t0, xmad_end, bxOrig, byOrig, loy
+
+    // Aliases for the C registers we use for initializing C (used as vectors)
+    0-63    : cz<00-63>
+
+    // The offset we store our zero value for initializing C. Reuse a register from the second blocking registers
+    80      : zOffset
+
+    // 64 C matrix output registers.
+    // Use special mapping to avoid register bank conflicts between these registers and the blocking registers.
+     3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67>
+     7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67>
+     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67>
+     5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67>
+    35,34,43,42,51,50,59,58 : cx64y<00-03|64-67>
+    39,38,47,46,55,54,63,62 : cx65y<00-03|64-67>
+    33,32,41,40,49,48,57,56 : cx66y<00-03|64-67>
+    37,36,45,44,53,52,61,60 : cx67y<00-03|64-67>
+
+    // Double buffered register blocking used in vector loads.
+    // Any bank conflicts that we can't avoid in these registers we can hide with .reuse flags
+    64-79   : j0Ax<00-03|64-67>, j0By<00-03|64-67>
+    80-95   : j1Ax<00-03|64-67>, j1By<00-03|64-67>
+
+    // Registers to load A or B
+    96-103  : loadX<0-7>
+
+    // Key global state registers for main loop and some we reuse for outputting C.
+    // Note, tweaking the register banks of track<0|4>, tex, writeS, readBs, readAs impacts performance because of
+    // delayed bank conflicts between memory operations and ffmas.
+    // The array index bracket notation can be used to request a bank in a dynamically allocated range.
+    104-127 ~ track<0|4>[0], tex[2], readAs[2], readBs[3], writeS[3], end, ldx8, tid, bx, by, tid31, tid96, tid128 //, clock, smId, nSMs
+
+    // Registers to store the results back to global memory. Reuse any register not needed after the main loop.
+    // Statically allocate cs0-7 because they're vector registers.
+    64-71   : cs<0-7>
+
+    // dynamically allocated C output registers(~)
+    72-103  ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc60, writeCs, readCs, cx, ci, alpha, xmad_ci //, xmad_D, D, blckDimX, gridDimX
+
+</REGISTER_MAPPING>
+
+// Note the absence of the loading of the stack pointer into R1.
+// No idea why ptxas does that anyway when it's not used for register spilling.
+// Such a waste of a perfectly good register.
+
+// Scheduler doesn't handle the dependency flags yet,
+// so move these first instructions outside the block that's auto scheduled
+//--:-:-:-:1      CS2R clock, SR_CLOCKLO;
+//--:-:-:-:1      S2R smId, SR_VIRTID;
+//--:-:-:-:1      S2R nSMs, SR_VIRTCFG;
+--:-:1:-:1      S2R tid, SR_TID.X;   // Set Dep 1
+--:-:2:-:1      S2R bx,  SR_CTAID.X; // Set Dep 2
+--:-:3:-:1      S2R by,  SR_CTAID.Y; // Set Dep 3
+
+// Instructions in a SCHEDULE_BLOCK are automatically reordered and appropriately stalled for simple dependencies
+// Memory dependencies are left up to the author to deal with manually for now.
+<SCHEDULE_BLOCK>
+
+// First 128 threads load A to shared, 2nd 128 loads B to shared
+// Note this technique is not possible in cuda or ptx as there's no way to
+// efficiently specify a warp-uniform predicate for a memory op.
+// Compile sgemm.cu and inspect the sass to see what I'm talking about.
+
+// blk = tid >= 128 ? by   : bx;
+// ldx = tid >= 128 ? ldb  : lda;
+// tex = tid >= 128 ? texB : texA;
+01:-:-:Y:1      ISETP.GE.AND P0, PT, tid, 128, PT; // Wait Dep 1
+06:-:-:-:1      SEL blk, by, bx, P0;               // Wait Dep 2 & 3
+--:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150]; // param_lda
+--:-:-:-:1  @P0 MOV ldx4, c[0x0][0x154]; // param_ldb
+--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA
+--:-:-:-:1  @P0 MOV32I tex, 0x80000000; // texB
+
+// Initialize the portion of shared we use to zero our C registers
+// Give each warp its own address to write to.
+// All threads write to the same address, but we don't care because only one needs to take.
+// There is no bank conflict on writing to the same address, just indeterminacy in which thread will get its value stored.
+--:-:-:-:1      LOP.AND zOffset, tid, -32; // zOffset = tid & ~31
+--:-:-:-:1      STS.128 [zOffset + 4x<16*128>], RZ;
+
+// tid4   = (tid >> 5) & 3
+// tid31  = tid & 31
+// tid96  = tid & 96
+// tid128 = tid & 128
+--:-:-:-:1      BFE.U32 tid4,   tid, 0x205; // 2 bits at position 5
+--:-:-:-:1      LOP.AND tid31,  tid, 31;
+--:-:-:-:1      LOP.AND tid96,  tid, 96;
+--:-:-:-:1      LOP.AND tid128, tid, 128;
+
+// ldx4  = ldx * 4;
+// ldx8  = ldx * 8;
+--:-:-:-:1      SHR.U32 ldx, ldx4, 2;   // ldx  = ldx4 >> 2
+--:-:-:-:1      IADD ldx8, ldx4, ldx4;  // ldx8 = ldx4 + ldx4
+
+// track0 = blk*128/4 + tid31 + (ldx * tid4)
+--:-:-:-:1      ISCADD  track0, blk, tid31, 5;
+--:-:-:-:1      XMAD.LO track0, ldx, tid4,  track0, xmad_t0; // XMAD.LO is a macro that is expanded out into the 3 XMADs
+--:-:-:-:1      IADD track4, track0, ldx4; // track4 = track0 + ldx4
+
+// writeS  = tid31*4*4 + tid4*128*4
+// writeS += 4096 if tid >= 128
+--:-:-:-:1      SHL    tid31_4, tid31, 4;
+--:-:-:-:1      ISCADD writeS, tid4, tid31_4, 9;
+--:-:-:-:1  @P0 IADD   writeS, writeS, 4x<8*128>;
+
+// int end = track0 + (k-8)*ldx;
+--:-:-:-:1      MOV k, c[0x0][0x14c]; // param_k
+--:-:-:-:1      IADD k, k, -8;
+--:-:-:-:1      XMAD.LO end, k, ldx, track0, xmad_end;
+
+// readAs and readBs are carefully constructed to avoid any bank conflicts while loading from shared
+// readAs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readAs, tid128, 4;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid7;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4 + 4096;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readBs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<8*128>, 4;
+
+// Preload the first 8 lines from texture memory
+// Keep these instructions in this order (but allow others to interleave).
+// Normally the scheduler tries to preserve source order by default, but this demonstrates how you enforce
+// an ordering if you need to.
+// Note: these are the 4 element vector load versions (last param: 0xf=vec4, 0x3=vec2, 0x1=single)
+<ORDERED>
+--:-:1:-:1      TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1
+--:-:2:-:1      TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 2
+</ORDERED>
+
+</SCHEDULE_BLOCK>
+
+// Zero the C registers: emit 16 LDS.U.128 loads (cz00, cz04, ... cz60), each setting Dep 3.
+// Every load reads from [zOffset + 4x<16*128>]. Using LDS.U.128 is a neat trick to save a
+// few clock cycles (when you have enough warps to hide the latency.)
+<CODE>
+    return join('', map { sprintf("--:-:3:-:1      LDS.U.128 cz%02d, [zOffset + 4x<16*128>];\n", 4 * $_) } 0 .. 15);
+</CODE>
+
+// These instructions need to occur after the textures load so put them in a new block
+// that starts with a dependency barrier wait.
+<SCHEDULE_BLOCK>
+
+01:-:-:-:1      STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 1
+02:-:-:-:1      STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 2
+
+// Increment tracks after the loads are complete to avoid needing write-after-read dependencies
+--:-:-:-:1      IADD track0, track0, ldx8;
+--:-:-:-:1      IADD track4, track4, ldx8;
+
+// Wait for all threads to finish loading shared
+04:-:-:-:5      BAR.SYNC 0; // Wait Dep 3
+
+</SCHEDULE_BLOCK>
+
+// The next store to shared goes to high area.
+// Having 2 share buffers allows us to eliminate a bar.sync in the main loop.
+// This way we don't have to wait for all threads to arrive before writing fresh data to shared.
+// Other threads can continue reading from the last batch while the new data is being written.
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<16*128>;
+
+// Preload the first lines of A and B from shared
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1
+
+
+// The main loop
+// While calculating the first line, load in the next line from shared.
+// Shared memory stores enough to do this 8 times per loop.
+// Also pull in the next block of memory from global and store it to shared.
+
+// Efficiency:
+// ffma: 512
+// lds:  32 dual issued
+// sts:  2  dual issued
+// tex:  2  dual issued
+// add:  2
+// xor:  3
+// setp: 1
+// bar:  1  dual issued
+// bra:  1  dual issued
+// Total: 524 (512/518 = 98.8% FFMA)
+
+// Memory Throughput Upper Bound:
+// 2 * 4 * 4 bytes per thread per 518 clocks
+// 128 threads per SM
+// 16 SM's (GM204)
+// 1640Mhz (boost overclock)
+// .931 GiB/GB  (1000^3 / 1024^3)
+// 193 GiB/sec
+// Available: 224 GiB/sec (or 256 GiB/sec overclocked at 8GHz)
+
+LOOP:
+
+// Loop end condition
+--:-:-:-:1      ISETP.LE.AND P0, PT, track0, end, PT;
+
+<CODE>
+
+    # We eliminated bank conflicts with our C registers and the blocking registers,
+    # but there are still 16 bank conflicts between the blocking registers themselves.
+    # By ordering the FFMA's in a swirling zigzag pattern we can completely hide those conflicts
+    # behind register reuse.  This pattern also maximizes that reuse (47%) and minimizes the bandwidth
+    # out of the register bank, thereby reducing power consumption and allowing the chip to
+    # stay at a higher sustained clock speed.  One other constraint is that we want each successive
+    # instruction to pull its third operand from alternating banks.  We space the swirl by 2 in the x
+    # direction to achieve this.  This has the effect of making it easier to avoid delayed bank conflicts
+    # with the memory operations.  Finally, for the very first ffma, don't choose one of the 16 bank conflicts
+    # as we have no way of hiding that conflict behind a reuse (cublas makes this mistake).
+
+    # Alternating banks (1320 MHz, full speed)
+    my @swirl = ([2,0],[2,1],[0,1],[0,0]);
+    my @xVals = (0,1,64,65);
+
+    # Repeating banks (1320 MHz, 83 Gflops slower, but lower power draw probably because of increased stalls)
+    # Only explanation I can think of is increased delayed register bank conflicts with memory ops.
+    #my @swirl = ([0,1],[0,0],[1,0],[1,1]);
+    #my @xVals = (0,2,64,66);
+
+    my @cOrder;
+    foreach my $y (0,2,64,66)
+    {
+        # apply the swirl
+        foreach my $x (@xVals)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        # apply the zigzag
+        @xVals = reverse @xVals;
+    }
+
+    # This ordering (a simple zigzag) eliminates the bank conflicts but only achieves 39% reuse.
+    # It runs 20 GFlops slower since the register bank draws more power and the clock slows down to 1306 MHz.
+    # There may be more delayed bank conflicts with memory operations as the slowdown is 4 GFlops more than
+    # the reduced clock accounts for.
+    #my @cOrder2;
+    #my @xVals = (0..3,64..67);
+    #foreach my $y (0..3,64..67)
+    #{
+    #    @xVals = reverse @xVals;
+    #    push @cOrder2, [$_, $y] foreach @xVals;
+    #}
+    #@cOrder = @cOrder2;
+
+    my %insert =
+    (
+        # Don't start the first TLD before 12 to let ISETP write P0
+        # These global reads and shared writes we put exactly in the middle of the LDS ops
+        # This is to not overwhelm the memory units with instructions (and because these were tested faster here).
+        # The 4 spacing seems to work best for vec4 instructions.
+        # It's odd that these two texture loads can drive 512 FFMA's all by themselves.. but 256 threads can load 8 128 F32 wide lines.
+        # So we only need 2 to get 8 lines from both matrices.
+
+        j0c31 => "--:-:2:-:1  \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 2\n",
+        j0c33 => "--:-:3:-:1  \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3\n",
+
+        j6c30 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 2\n",
+        j6c34 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 3\n",
+
+        # We need one barrier in the main loop after writing shared memory.
+        # The barrier is needed even if this is our last loop because we need to protect the warp shuffle step.
+        # Note, BAR.SYNCs do not sync memory read access automatically, you still need to flag the barriers (writes are sync'd).
+        # After the BAR, swap our share buffer location.  We don't need an additional barrier because of these swaps.
+        # Note, this doubles our shared memory usage but this kernel's occupancy is entirely bound by registers.
+        # LOP.XOR readAs needs to be 4 clocks prior to the LDS.U.128 for readAs (but push this as far down as possible)
+        j6c62 =>
+                "01:-:-:-:5      BAR.SYNC 0;                            // Wait Dep 1\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<16*128>;\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<16*128>;\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<16*128>;\n",
+
+        # Note having 2 IADDs slightly hits our FFMA performance (1/518 = .2%), but TLD doesn't take an offset.
+        # LDG.CI doesn't have this issue, but doesn't give you the nice features of texture loads:
+        #   -Boundary Clamping: simplifies our matrix load logic so we don't need to worry about loading out of bounds
+        #   -Normalized Floats: if we don't need full 32 bits of precision we could store our matrices using 16 or 8 bit values
+        j7c63 =>
+                "--:-:-:-:1  \@P0 IADD track0, track0, ldx8;\n" .
+                "--:-:-:-:0  \@P0 IADD track4, track4, ldx8;\n" .
+                "--:-:-:Y:5  \@P0 BRA LOOP;\n",
+    );
+
+    my $out;
+    # We unroll our main loop 8 iterations.
+    # This gives us a loop instruction count of 556.  Add the control instructions and that makes it 741 opcodes sized 8 bytes.
+    # This is 5928 bytes, nicely fitting inside the 8kb instruction cache.  Going to the next biggest size would be 12 lines.
+    # That would be 768 ffmas and not leaving enough room for the other instructions and control codes.
+    # So by staying inside the instruction cache size, we avoid hitting any instruction fetch latencies.
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        # Our rolling blocking registers stay one load ahead of the FFMA's (rs: read share)
+        my $rsOffset = ($j + 1) % 8;
+        # No need to load on the last line unless we are looping again (P0)
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        # You can experiment here with different vector load sizes
+        my $vec = 128;
+
+        if ($vec == 128)
+        {
+            # Roll up our LDS ops here to keep them easier to manage and tune
+            # Space at every other clock to maximize throughput.
+            $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBy64, [readBs + 4x<%d*128 + 64>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
+        }
+        elsif ($vec == 64)
+        {
+            # LDS.64 runs about 22 Gflops slower than LDS.128 (GM107).  Not a huge difference since our latencies are so well hidden.
+            # I think LDS.128 is implemented internally as a pair of LDS.64 ops which could be another reason for the comparable performance.
+            # I think the big benefit with 128 is being able to issue all our LDS ops earlier, allowing more FFMA's prior to reading out the results.
+            # There could also be additional opportunity for delayed bank conflicts.
+            $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c6"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c8"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c10"} = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c12"} = sprintf "--:-:-:-:1  %s LDS.U.64 j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c14"} = sprintf "--:-:1:-:1  %s LDS.U.64 j%dBy66, [readBs + 4x<%d*128 + 66>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
+        }
+        else
+        {
+            # This one drops performance by over 200 Gflops.  So you want to at least use LDS.64 if you can.
+            # We don't even have room to properly space these at half throughput.
+            $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c1"}  = sprintf "--:-:-:-:1  %s LDS j%dAx01, [readAs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c3"}  = sprintf "--:-:-:-:1  %s LDS j%dAx03, [readAs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c5"}  = sprintf "--:-:-:-:1  %s LDS j%dBy01, [readBs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c6"}  = sprintf "--:-:-:-:1  %s LDS j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c7"}  = sprintf "--:-:-:-:1  %s LDS j%dBy03, [readBs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c8"}  = sprintf "--:-:-:-:1  %s LDS j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c9"}  = sprintf "--:-:-:-:1  %s LDS j%dAx65, [readAs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c10"} = sprintf "--:-:-:-:1  %s LDS j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c11"} = sprintf "--:-:-:-:1  %s LDS j%dAx67, [readAs + 4x<%d*128 + 67>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c12"} = sprintf "--:-:-:-:1  %s LDS j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c13"} = sprintf "--:-:-:-:1  %s LDS j%dBy65, [readBs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c14"} = sprintf "--:-:-:-:1  %s LDS j%dBy66, [readBs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
+            $insert{"j${j}c15"} = sprintf "--:-:1:-:1  %s LDS j%dBy67, [readBs + 4x<%d*128 + 67>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
+        }
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            # Grab an instruction for insertion if one exists for this j and c combination
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            # Scatter some yields in there to better balance the workload and reduce sync stalls
+            # Don't pair a yield with the dual issued ffmas as that kills performance for some reason
+            ##### This no longer offers extra performance on GM204 as it did on GM107.  It still does for the 64 thread version. Keeping since it doesn't hurt. ####
+            my $yield  = $c == 32 ? 'Y' : '-';
+
+            # The first FFMA needs to wait on the prior loop's LDS.U.128 ops to finish (except if the barrier does the wait for us)
+            my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--','');
+
+            # Dual issue these ops
+            my $stall  = $ins =~ /LDS|TLD|STS|BAR/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            # output our FFMA and also any inserted ops
+            $out .= sprintf "%s      FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $comment,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+// Main loop is done, time to write C to global memory.
+<SCHEDULE_BLOCK>
+
+// Remove the high bits if present from the last loop's xor.
+// Also remove the 4096 added onto readBs.
+// This gives us the x and y coordinates of the start of this thread's data in C.
+--:-:-:-:1      LOP.AND readAs, readAs, 0xfff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0xfff;
+
+// Remap readAs and readBs onto writeCs so we can shuffle the output for coalesced global writes.
+// readAs stays constant, readBs collapses down from stride 4 to 1
+// writeCs = (readBs / 4) * 128 + readAs;
+--:-:-:-:1      ISCADD  writeCs, readBs, readAs, 5;
+
+// Read out the C values from shared in a simple tid mapped pattern but
+// offset by the position of this warp's collapsed data in shared.
+
+// cx = tid31 | (tid128 >> 2);
+--:-:-:-:1      SHR.U32  cx, tid128, 2;
+--:-:-:-:1      LOP.OR   cx, tid31,  cx;
+
+// readCs = ((tid96 << 4) | cx) << 2;
+--:-:-:-:1      SHL      readCs, tid96,  4;
+--:-:-:-:1      LOP.OR   readCs, readCs, cx;
+--:-:-:-:1      SHL      readCs, readCs, 2;
+
+// cx += bx*128;
+--:-:-:-:1      ISCADD  cx, bx, cx, 7;
+
+// cy = by*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+--:-:-:-:1      ISCADD  cy00, by, cy00, 7;
+
+// C += (cy*ldc + cx) * 4;
+--:-:-:-:1      MOV ldc, c[0x0][0x158];
+--:-:-:-:1      XMAD.LO ci, cy00, ldc, cx, xmad_ci;
+--:-:-:-:1      ISCADD  Cy00, ci, c[0x0][0x140], 2;
+
+// When writing in assembly, being able to 'printf' is sometimes easier than stepping through the debugger.
+// Here's how it's done.  Drop something like this in your code. Then modify the c code to accept this
+// many params per thread to printf (see assemblySgemm function).
+
+//--:-:-:-:1      SHR.U32  smId, smId, 20;
+
+// D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4
+// D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4
+//--:-:-:-:1      MOV gridDimX, c[0x0][0x14];
+//--:-:-:-:1      MOV blckDimX, c[0x0][0x8];
+//--:-:-:-:1      XMAD.LO D, by, gridDimX, bx, xmad_D;
+//--:-:-:-:1      XMAD.LO D, D, blckDimX, tid, xmad_D;
+//--:-:-:-:1      ISCADD D, D, c[0x0][0x160], 3; // 4 bytes * 2 vars = 8 or shift 3
+
+//--:-:-:-:1      STG.CS [D + 4x<0>], readAs;
+//--:-:-:-:1      STG.CS [D + 4x<1>], readBs;
+//--:-:-:-:1      STG.CS [D + 4x<2>], writeCs;
+//--:-:-:-:1      STG.CS [D + 4x<3>], readCs;
+//--:-:-:-:1      STG.CS [D + 4x<4>], cx;
+//--:-:-:-:1      STG.CS [D + 4x<5>], cy00;
+//--:-:-:-:1      STG.CS [D + 4x<6>], ci;
+//--:-:-:-:1      STG.CS [D + 4x<7>], cx67y67;
+
+//--:-:-:-:1      STG.CS [D + 4x<0>], smId;
+//--:-:-:-:1      STG.CS [D + 4x<1>], clock;
+
+
+// Setup our matrix bounds checking vars and preds
+// Bounds checking is what allows this code to work on matrix sizes not a multiple of 128
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx +  0 < m
+--:-:-:-:1      IADD cx, cx, 64;
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 64 < m
+
+--:-:-:-:1      IADD cy00, cy00, -1;
+--:-:-:-:1      IADD cy04, cy00,  4;
+--:-:-:-:1      IADD cy08, cy00,  8;
+--:-:-:-:1      IADD cy12, cy00,  12;
+
+// Setup our C output addresses and increments.
+--:-:-:-:1      SHL  ldc1,  ldc, 2;
+--:-:-:-:1      SHL  ldc4,  ldc, 4;
+--:-:-:-:1      SHL  ldc8,  ldc, 5;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8; // ldc60 = (ldc << 8) - ldc4 = ldc * 4 * 60
+
+// Load the first set of the STORE_C subroutine params in the scheduled block.
+# This is also a good time to apply alpha.
+--:-:-:-:1      MOV alpha, c[0x0][0x15c];
+
+--:-:-:-:1      FMUL cs0, cx00y00, alpha;
+--:-:-:-:1      FMUL cs1, cx01y00, alpha;
+--:-:-:-:1      FMUL cs2, cx02y00, alpha;
+--:-:-:-:1      FMUL cs3, cx03y00, alpha;
+--:-:-:-:1      FMUL cs4, cx64y00, alpha;
+--:-:-:-:1      FMUL cs5, cx65y00, alpha;
+--:-:-:-:1      FMUL cs6, cx66y00, alpha;
+--:-:-:-:1      FMUL cs7, cx67y00, alpha;
+
+// We pre-increment the output addresses so they can be dual issued with memory ops
+// So start with a -1 instead of 0 value.
+--:-:-:-:1      IADD Cy00, Cy00, -ldc1;
+--:-:-:-:1      IADD Cy04, Cy00, ldc4;
+--:-:-:-:1      IADD Cy08, Cy00, ldc8;
+--:-:-:-:0      IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering)
+
+</SCHEDULE_BLOCK>
+
+// There's nothing yet in place to handle dependencies with subroutines.
+// So don't schedule this block.
+<CODE>
+
+    my $out;
+    foreach my $y (0..3, 64..67)
+    {
+        my ($wait, $comment) = $y == 64 ? ('--', '') : ('02',' // Wait Dep 2');
+
+        # Jump ahead 60 units (to get to the values at y=64)
+        $out .=
+            "--:-:-:-:1      IADD cy00, cy00, 60;\n" .
+            "--:-:-:-:1      IADD cy04, cy04, 60;\n" .
+            "--:-:-:-:1      IADD cy08, cy08, 60;\n" .
+            "--:-:-:-:1      IADD cy12, cy12, 60;\n\n" .
+
+            "02:-:-:-:1      IADD Cy00, Cy00, ldc60; // Wait Dep 2\n" .
+            "--:-:-:-:1      IADD Cy04, Cy04, ldc60;\n" .
+            "--:-:-:-:1      IADD Cy08, Cy08, ldc60;\n" .
+            "--:-:-:-:1      IADD Cy12, Cy12, ldc60;\n\n"  if $y == 64;
+
+        # We need to move the C values to the param registers of the STORE_C subroutine.
+        # This is also a good time to apply alpha (y == 0 was already scaled in the scheduled block above).
+        $out .= sprintf(
+            "%s:-:-:-:1      FMUL cs0, cx00y%02d, alpha;%s\n" .
+            "--:-:-:-:1      FMUL cs1, cx01y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs2, cx02y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs3, cx03y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs4, cx64y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs5, cx65y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs6, cx66y%02d, alpha;\n" .
+            "--:-:-:-:0      FMUL cs7, cx67y%02d, alpha; // Dual Issue\n",
+            $wait, $y, $comment, ($y) x 7) if $y;
+
+        # Call the subroutine.
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+// And we're done.  The remainder is the STORE_C subroutine that's defined at the end of the kernel.
+--:-:-:-:5      EXIT;
+
+// This routine does warp synchronous shuffling of our output data so as to be able
+// to have coalesced writes to global memory.  This is actually faster because the shared
+// memory latencies can be hidden by other warps and we're only adding a few extra clocks
+// to this thread.  Global memory here is the bottleneck and being able to halve the needed
+// bandwidth at the expense of a few clocks is a modest win.  This also keeps power lower
+// and our chip running faster.
+
+// Note, the SHFL instruction doesn't help us here because we're swapping different registers
+// from different threads.
+STORE_C:
+
+<SCHEDULE_BLOCK>
+
+// Each warp writes to its own region of memory so we don't need to bar.sync the access.
+// There are some bank conflicts here on the STS.128s but no way to avoid them, and the hit just means a few extra clocks.
+// Note here that the scheduler is able to handle the dependencies between vector and non-vector instructions.
+// It knows from the instruction type and the register map that cs0 here includes cs1, cs2 and cs3 as well.
+--:-:-:-:1      STS.128 [writeCs+4x<00>], cs0;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], cs4;
+
+// In a single warp, loads naturally occur after the store to shared completes, no sync required.
+--:-:-:-:1      LDS cs0, [readCs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS cs1, [readCs + 4x<0*128 + 64>];
+--:-:-:-:1      LDS cs2, [readCs + 4x<1*128 + 00>];
+--:-:-:-:1      LDS cs3, [readCs + 4x<1*128 + 64>];
+--:-:-:-:1      LDS cs4, [readCs + 4x<2*128 + 00>];
+--:-:-:-:1      LDS cs5, [readCs + 4x<2*128 + 64>];
+--:-:-:-:1      LDS cs6, [readCs + 4x<3*128 + 00>];
+--:-:1:-:1      LDS cs7, [readCs + 4x<3*128 + 64>]; // Set Dep 1
+
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:1      IADD cy12, cy12, 1;
+
+--:-:-:-:1      IADD Cy00, Cy00, ldc1;
+--:-:-:-:1      IADD Cy04, Cy04, ldc1;
+--:-:-:-:1      IADD Cy08, Cy08, ldc1;
+--:-:-:-:1      IADD Cy12, Cy12, ldc1;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 64 < m
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 64 < m
+
+01:-:-:-:1  @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1
+--:-:-:-:1  @P1 STG.CG [Cy00 + 4x<64>], cs1;
+--:-:-:-:1  @P2 STG.CG [Cy04 + 4x<00>], cs2;
+--:-:-:-:1  @P3 STG.CG [Cy04 + 4x<64>], cs3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 64 < m
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 64 < m
+
+--:-:-:-:1  @P0 STG.CG [Cy08 + 4x<00>], cs4;
+--:-:-:-:1  @P1 STG.CG [Cy08 + 4x<64>], cs5;
+--:-:-:-:1  @P2 STG.CG [Cy12 + 4x<00>], cs6;
+--:2:-:-:1  @P3 STG.CG [Cy12 + 4x<64>], cs7; // Set Dep 2
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
diff --git a/Assembler/PascalAs/sgemm/sgemm64.sass b/Assembler/PascalAs/sgemm/sgemm64.sass
new file mode 100644
index 0000000..f037b3e
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm64.sass
@@ -0,0 +1,398 @@
+# Kernel: sgemm_kernel_64
+#
+# SharedSize: 8192
+# Params(8):
+#   0:0x140:4:4 param_C,
+#   1:0x144:4:0 param_m,
+#   2:0x148:4:0 param_n,
+#   3:0x14c:4:0 param_k,
+#   4:0x150:4:0 param_lda,
+#   5:0x154:4:0 param_ldb,
+#   6:0x158:4:0 param_ldc
+#   7:0x15c:4:0 param_alpha
+#   8:0x160:4:4 param_D // for diagnostic printf output
+#
+# Globals:
+#   c[0x0][0x164]: texA (the value is 1)
+#   c[0x0][0x168]: texB (the value is 0)
+
+<REGISTER_MAPPING>
+
+    0-63    ~ blk, ldx, ldx4, k, tid1, tid2, tid15, tid15_4, xmad_t0, xmad_end
+
+    80      : zOffset
+    0-63    : cz<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx00y<00-03|32-35>
+     7, 6,15,14,23,22,31,30 : cx01y<00-03|32-35>
+     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|32-35>
+     5, 4,13,12,21,20,29,28 : cx03y<00-03|32-35>
+    35,34,43,42,51,50,59,58 : cx32y<00-03|32-35>
+    39,38,47,46,55,54,63,62 : cx33y<00-03|32-35>
+    33,32,41,40,49,48,57,56 : cx34y<00-03|32-35>
+    37,36,45,44,53,52,61,60 : cx35y<00-03|32-35>
+
+    64-79   : j0Ax<00-03|32-35>, j0By<00-03|32-35>
+    80-95   : j1Ax<00-03|32-35>, j1By<00-03|32-35>
+
+    64-71   : cs<0-7>
+
+    96-111  : loadX0<0-3>, loadX2<0-3>, loadX4<0-3>, loadX6<0-3>
+
+    112-127 ~ track<0|2|4|6>[0], tex[1], readAs[2], readBs[3], writeS[2], end, ldx8, tid, bx, by, tid31, tid32
+
+    72-111  ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc28, writeCs, readCs, cx, ci, xmad_ci, alpha, xmadD, D, blckDimX, gridDimX
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid, SR_TID.X;   // Set Dep 1
+--:-:2:-:1      S2R bx,  SR_CTAID.X; // Set Dep 2
+--:-:3:-:1      S2R by,  SR_CTAID.Y; // Set Dep 3
+
+<SCHEDULE_BLOCK>
+
+// blk  = tid >= 32 ? by   : bx;
+// ldx4 = tid >= 32 ? ldb  : lda;
+// tex  = tid >= 32 ? texB : texA;
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT; // Wait Dep 1
+06:-:-:-:1      SEL blk, by, bx, P0;              // Wait Dep 2 & 3
+--:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150];
+--:-:-:-:1  @P0 MOV ldx4, c[0x0][0x154];
+--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA
+--:-:-:-:1  @P0 MOV32I tex, 0x80000000; // texB
+
+--:-:-:-:1      LOP.AND zOffset, tid, -32;
+--:-:-:-:1      STS.128 [zOffset + 4x<16*64>], RZ;
+
+// tid2   = (tid >> 4) & 1
+// tid15  = tid & 15
+// tid31 = tid & 31
+// tid32 = tid & 32
+--:-:-:-:1      BFE.U32 tid2,  tid, 0x104; // 1 bit at position 4
+--:-:-:-:1      LOP.AND tid15, tid, 15;
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      LOP.AND tid32, tid, 32;
+
+// ldx4  = ldx * 4;
+// ldx8  = ldx * 8;
+--:-:-:-:1      SHR.U32 ldx, ldx4, 2;
+--:-:-:-:1      IADD ldx8, ldx4, ldx4;
+
+// track0 = blk*64/4 + tid15 + (ldx * tid2)
+--:-:-:-:1      ISCADD  track0, blk, tid15, 4;
+--:-:-:-:1      XMAD.LO track0, ldx, tid2,  track0, xmad_t0;
+--:-:-:-:1      IADD3 track2, track0, ldx, ldx;
+--:-:-:-:1      IADD track4, track0, ldx4;
+--:-:-:-:1      IADD track6, track2, ldx4;
+
+// writeS = tid15*4*4 + tid2*64*4
+--:-:-:-:1      SHL    tid15_4, tid15, 4;
+--:-:-:-:1      ISCADD writeS, tid2, tid15_4, 8;
+
+// writeS += 2048 if tid >= 32
+--:-:-:-:1  @P0 IADD   writeS, writeS, 4x<8*64>;
+
+// int end = track0 + (k-8)*ldx;
+--:-:-:-:1      MOV k, c[0x0][0x14c];
+--:-:-:-:1      IADD k, k, -8;
+--:-:-:-:1      XMAD.LO end, k, ldx, track0, xmad_end;
+
+// readAs = ((tid >> 1) & 7) << 4;
+--:-:-:-:1      BFE.U32 readAs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 2048;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readBs, tid,    0x30;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<8*64>, 4;
+
+<ORDERED>
+--:-:1:-:1      TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1
+--:-:2:-:1      TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2
+--:-:3:-:1      TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3
+--:-:4:-:1      TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 4
+</ORDERED>
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+    return join '', map sprintf("--:-:5:-:1      LDS.U.128 cz%02d, [zOffset + 4x<16*64>];\n", $_ * 4), 0..15; # 16 loads (cz00, cz04 .. cz60), each setting Dep 5; reads back the address the setup block stored RZ to
+</CODE>
+
+<SCHEDULE_BLOCK>
+
+01:-:-:-:1      STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 1
+02:-:-:-:1      STS.128 [writeS + 4x<2*64>], loadX2; // Wait Dep 2
+04:-:-:-:1      STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3
+08:-:-:-:1      STS.128 [writeS + 4x<6*64>], loadX6; // Wait Dep 4
+
+--:-:-:-:1      IADD track0, track0, ldx8;
+--:-:-:-:1      IADD track2, track2, ldx8;
+--:-:-:-:1      IADD track4, track4, ldx8;
+--:-:-:-:1      IADD track6, track6, ldx8;
+
+10:-:-:-:5      BAR.SYNC 0; // Wait Dep 5 (set by the cz LDS ops emitted just before this block)
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<16*64>; // Toggle writeS so the next store to shared goes to the other buffer
+// Preload the first lines of A and B from shared
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1
+
+// Efficiency:
+// ffma: 512
+// lds:  32 dual issued
+// sts:  4  dual issued
+// tex:  4  dual issued
+// add:  4
+// xor:  3
+// setp: 1
+// bar:  1  dual issued
+// bra:  1  dual issued
+// Total: 520 (512/520 = 98.5% FFMA)
+
+LOOP:
+
+// Loop end condition
+--:-:-:-:1      ISETP.LE.AND P0, PT, track0, end, PT;
+
+<CODE>
+    # Order the 64 FFMAs of each line: a 4 point swirl per x value, zigzagging x on every y step.
+    my @cOrder;
+    my @swirl = ([2,0],[2,1],[0,1],[0,0]);
+    my @x = (0,1,32,33);
+    foreach my $y (0,2,32,34)
+    {
+        foreach my $x (@x)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @x = reverse @x; # apply the zigzag
+    }
+    # Extra instructions keyed by "j<line>c<ffma index>"; each is emitted right after that FFMA.
+    my %insert =
+    (
+        j0c31 => "--:-:-:-:1  \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf;\n",
+        j0c33 => "--:-:2:-:1  \@P0 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2\n",
+
+        j1c31 => "--:-:-:-:1  \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf;\n",
+        j1c33 => "--:-:3:-:1  \@P0 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 3\n",
+
+        j5c30 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 2\n",
+        j5c34 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<2*64>], loadX2;\n",
+
+        j6c30 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3\n",
+        j6c34 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*64>], loadX6;\n",
+
+        j6c62 =>
+                "01:-:-:-:5      BAR.SYNC 0;                            // Wait Dep 1\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<16*64>;\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<16*64>;\n" .
+                "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<16*64>;\n",
+
+        j7c63 =>
+                "--:-:-:-:1  \@P0 IADD track0, track0, ldx8;\n" .
+                "--:-:-:-:1  \@P0 IADD track2, track2, ldx8;\n" .
+                "--:-:-:-:1  \@P0 IADD track4, track4, ldx8;\n" .
+                "--:-:-:-:0  \@P0 IADD track6, track6, ldx8;\n" .
+                "--:-:-:Y:5  \@P0 BRA LOOP;\n",
+    );
+
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8; # the shared reads stay one line ahead of the FFMAs
+        my $rsPred   = $j == 7 ? '@P0' : '   '; # on the last line only load if we are looping again
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx00, [readAs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBy00, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx32, [readAs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBy32, [readBs + 4x<%d*64 + 32>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            # Grab an instruction for insertion if one exists for this j and c combination
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Put a single yield in the middle of each line of FFMAs
+            my $yield  = $c == 32 ? 'Y' : '-';
+            # The first FFMA of a line waits on the prior LDS ops (Dep 1); on the last line the inserted BAR.SYNC already waited
+            my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--','');
+            # Stall 0 (dual issue) when the inserted text contains a memory or barrier op
+            my $stall  = $ins =~ /LDS|TLD|STS|BAR/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+            # output our FFMA and also any inserted ops
+            $out .= sprintf "%s      FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $comment,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      LOP.AND readAs, readAs, 0x7ff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0x7ff;
+
+// writeCs = (readBs / 4) * 64 + readAs;
+--:-:-:-:1      ISCADD  writeCs, readBs, readAs, 4;
+
+// readCs = ((tid32 << 3) + tid31) << 2;
+--:-:-:-:1      ISCADD  readCs, tid32,  tid31, 3;
+--:-:-:-:1      SHL     readCs, readCs, 2;
+
+// cx = bx*64 + tid31;
+--:-:-:-:1      ISCADD  cx, bx, tid31, 6;
+
+// cy = by*64 + (tid32 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid32, 1;
+--:-:-:-:1      ISCADD  cy00, by, cy00, 6;
+
+// C += (cy*ldc + cx) * 4;
+--:-:-:-:1      MOV ldc, c[0x0][0x158];
+--:-:-:-:1      XMAD.LO ci, cy00, ldc, cx, xmad_ci;
+--:-:-:-:1      ISCADD  Cy00, ci, c[0x0][0x140], 2;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx +  0 < m
+--:-:-:-:1      IADD cx, cx, 32;
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 32 < m
+
+// D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4
+// D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4
+//--:-:-:-:1      MOV gridDimX, c[0x0][0x14];
+//--:-:-:-:1      MOV blckDimX, c[0x0][0x8];
+//--:-:-:-:1      XMAD.LO D, by, gridDimX, bx, xmadD;
+//--:-:-:-:1      XMAD.LO D, D, blckDimX, tid, xmadD;
+//--:-:-:-:1      ISCADD D, D, c[0x0][0x160], 5; // 4 bytes * 8 vars = 32 or shift 5
+
+//--:-:-:-:1      STG.CS [D + 4x<0>], readAs;
+//--:-:-:-:1      STG.CS [D + 4x<1>], readBs;
+//--:-:-:-:1      STG.CS [D + 4x<2>], writeCs;
+//--:-:-:-:1      STG.CS [D + 4x<3>], readCs;
+//--:-:-:-:1      STG.CS [D + 4x<4>], cx;
+//--:-:-:-:1      STG.CS [D + 4x<5>], cy00;
+//--:-:-:-:1      STG.CS [D + 4x<6>], ci;
+//--:-:-:-:1      STG.CS [D + 4x<7>], cx35y35;
+
+--:-:-:-:1      IADD cy00, cy00, -1;
+--:-:-:-:1      IADD cy04, cy00,  4;
+--:-:-:-:1      IADD cy08, cy00,  8;
+--:-:-:-:1      IADD cy12, cy00,  12;
+
+--:-:-:-:1      SHL  ldc1,  ldc, 2;
+--:-:-:-:1      SHL  ldc4,  ldc, 4;
+--:-:-:-:1      SHL  ldc8,  ldc, 5;
+--:-:-:-:1      ISCADD ldc28, ldc, -ldc4, 7; // ldc28 = (ldc << 7) - ldc4 = ldc * 4 * 28
+
+--:-:-:-:1      MOV alpha, c[0x0][0x15c];
+--:-:-:-:1      FMUL cs0, cx00y00, alpha;
+--:-:-:-:1      FMUL cs1, cx01y00, alpha;
+--:-:-:-:1      FMUL cs2, cx02y00, alpha;
+--:-:-:-:1      FMUL cs3, cx03y00, alpha;
+--:-:-:-:1      FMUL cs4, cx32y00, alpha;
+--:-:-:-:1      FMUL cs5, cx33y00, alpha;
+--:-:-:-:1      FMUL cs6, cx34y00, alpha;
+--:-:-:-:1      FMUL cs7, cx35y00, alpha;
+
+--:-:-:-:1      IADD Cy00, Cy00, -ldc1;
+--:-:-:-:1      IADD Cy04, Cy00, ldc4;
+--:-:-:-:1      IADD Cy08, Cy00, ldc8;
+--:-:-:-:0      IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering)
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+    # Builds the epilogue text: one "CAL STORE_C" for each $y in (0..3, 32..35), i.e. eight calls in total.
+    my $out;
+    foreach my $y (0..3, 32..35)
+    {
+        my ($wait, $comment) = $y == 32 ? ('--', '') : ('02',' // Wait Dep 2'); # at $y == 32 the "02" wait sits on the IADD Cy00 line instead
+        # The trailing "if $y == 32" governs the whole concatenation: cy00..cy12 += 28 and Cy00..Cy12 += ldc28 are emitted only once.
+        $out .=
+            "--:-:-:-:1      IADD cy00, cy00, 28;\n" .
+            "--:-:-:-:1      IADD cy04, cy04, 28;\n" .
+            "--:-:-:-:1      IADD cy08, cy08, 28;\n" .
+            "--:-:-:-:1      IADD cy12, cy12, 28;\n\n" .
+
+            "02:-:-:-:1      IADD Cy00, Cy00, ldc28; // Wait Dep 2\n" .
+            "--:-:-:-:1      IADD Cy04, Cy04, ldc28;\n" .
+            "--:-:-:-:1      IADD Cy08, Cy08, ldc28;\n" .
+            "--:-:-:-:1      IADD Cy12, Cy12, ldc28;\n\n"  if $y == 32;
+        # Not emitted for $y == 0 (the y00 FMULs appear in the code preceding this block); args fill %s, %02d, %s, then seven %02d.
+        $out .= sprintf(
+            "%s:-:-:-:1      FMUL cs0, cx00y%02d, alpha;%s\n" .
+            "--:-:-:-:1      FMUL cs1, cx01y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs2, cx02y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs3, cx03y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs4, cx32y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs5, cx33y%02d, alpha;\n" .
+            "--:-:-:-:1      FMUL cs6, cx34y%02d, alpha;\n" .
+            "--:-:-:-:0      FMUL cs7, cx35y%02d, alpha; // Dual Issue\n",
+            $wait, $y, $comment, ($y) x 7) if $y;
+        # Every iteration, including $y == 0, ends with a call to the STORE_C routine.
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+<SCHEDULE_BLOCK>
+// STORE_C: stage cs0-cs7 through [writeCs] / [readCs] (STS.128 then LDS), then store to Cy00/Cy04/Cy08/Cy12 under P0-P3.
+--:-:-:-:1      STS.128 [writeCs+4x<00>], cs0;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], cs4;
+// Reload cs0-cs7 from readCs at 4x<r*64 + c>, r = 0..3, c = 0 or 32; the last LDS carries "Set Dep 1".
+--:-:-:-:1      LDS cs0, [readCs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS cs1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:1      LDS cs2, [readCs + 4x<1*64 + 00>];
+--:-:-:-:1      LDS cs3, [readCs + 4x<1*64 + 32>];
+--:-:-:-:1      LDS cs4, [readCs + 4x<2*64 + 00>];
+--:-:-:-:1      LDS cs5, [readCs + 4x<2*64 + 32>];
+--:-:-:-:1      LDS cs6, [readCs + 4x<3*64 + 00>];
+--:-:1:-:1      LDS cs7, [readCs + 4x<3*64 + 32>]; // Set Dep 1
+// Step on every call: cy00..cy12 += 1 and Cy00..Cy12 += ldc1.
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:1      IADD cy12, cy12, 1;
+
+--:-:-:-:1      IADD Cy00, Cy00, ldc1;
+--:-:-:-:1      IADD Cy04, Cy04, ldc1;
+--:-:-:-:1      IADD Cy08, Cy08, ldc1;
+--:-:-:-:1      IADD Cy12, Cy12, ldc1;
+// P0-P3: cy00 / cy04 compared against c[0x0][0x148], ANDed with P5 / P6 (set earlier in the kernel).
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 32 < m
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 32 < m
+// The first store waits on Dep 1 (set by the last LDS above); each store is guarded by its predicate.
+01:-:-:-:1  @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1
+--:-:-:-:1  @P1 STG.CG [Cy00 + 4x<32>], cs1;
+--:-:-:-:1  @P2 STG.CG [Cy04 + 4x<00>], cs2;
+--:-:-:-:1  @P3 STG.CG [Cy04 + 4x<32>], cs3;
+// P0-P3 are recomputed for the cy08 / cy12 pair.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 32 < m
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx +  0 < m
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 32 < m
+// The last store carries "Set Dep 2", matching the "Wait Dep 2" lines emitted by the <CODE> block.
+--:-:-:-:1  @P0 STG.CG [Cy08 + 4x<00>], cs4;
+--:-:-:-:1  @P1 STG.CG [Cy08 + 4x<32>], cs5;
+--:-:-:-:1  @P2 STG.CG [Cy12 + 4x<00>], cs6;
+--:2:-:-:1  @P3 STG.CG [Cy12 + 4x<32>], cs7; // Set Dep 2
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
diff --git a/Assembler/PascalAs/sgemm/sgemm_final_128.sass b/Assembler/PascalAs/sgemm/sgemm_final_128.sass
new file mode 100644
index 0000000..ce7b0e7
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm_final_128.sass
@@ -0,0 +1,793 @@
+# Kernel: sgemm_kernel_128
+# Arch: sm_50
+# InsCnt: 770
+# RegCnt: 118
+# SharedSize: 16384
+# BarCnt: 1
+# Params(9):
+#	ord:addr:size:align
+#	0:0x140:4:0
+#	1:0x144:4:0
+#	2:0x148:4:0
+#	3:0x14c:4:0
+#	4:0x150:4:0
+#	5:0x154:4:0
+#	6:0x158:4:0
+#	7:0x15c:4:0
+#	8:0x160:4:0
+#
+# Instructions:
+
+--:-:1:-:1      S2R R112, SR_TID.X;
+--:-:2:-:1      S2R R113, SR_CTAID.X;
+--:-:3:-:1      S2R R114, SR_CTAID.Y;
+01:-:-:Y:1      ISETP.GE.AND P0, PT, R112.reuse, 0x80, PT;
+--:-:-:-:1      LOP.AND R117, R112.reuse, 0x1f;
+--:-:-:-:1      BFE.U32 R9, R112.reuse, 0x205;
+--:-:-:-:1      MOV R13, c[0x0][0x14c];
+--:-:-:-:1      BFE.U32 R4, R112.reuse, 0x301;
+--:-:-:-:1      LOP.AND R115, R112.reuse, 0x80;
+--:-:-:-:1      LOP.AND R107, R112.reuse, 0x70;
+--:-:-:-:1      SHL R16, R117, 0x4;
+--:-:-:-:1      LOP.AND R0, R112.reuse, 0x1;
+--:-:-:-:1      IADD R13, R13, -0x8;
+--:-:-:-:1      LOP.AND R80, R112.reuse, -0x20;
+--:-:-:-:1      SHR.U32 R106, R115, 0x4;
+--:-:-:-:1      LOP.AND R116, R112, 0x60;
+--:-:-:-:1      SHR.U32 R107, R107, 0x3;
+--:-:-:-:0 @!P0 MOV R1, c[0x0][0x150];
+--:-:-:-:1      STS.128 [R80+0x2000], RZ;
+--:-:-:-:1  @P0 MOV R1, c[0x0][0x154];
+--:-:-:-:1      ISCADD R111, R9, R16, 0x9;
+06:-:-:-:1      SEL R12, R114, R113, P0;
+--:-:-:-:1 @!P0 MOV32I R110, 0x80000001;
+--:-:-:-:1  @P0 MOV32I R110, 0x80000000;
+--:-:-:-:1      LOP.OR R106, R106, R4;
+--:-:-:-:1      SHR.U32 R8, R1.reuse, 0x2;
+--:-:-:-:1      LOP.OR R107, R107, R0;
+--:-:-:-:1      ISCADD R104, R12, R117, 0x5;
+--:-:-:-:1      IADD R109, R1, R1;
+--:-:-:-:1  @P0 IADD R111, R111, 0x1000;
+--:-:-:-:1      SHL R106, R106, 0x4;
+--:-:-:-:1      XMAD.MRG R5, R8.reuse, R9.H1.reuse, RZ;
+--:-:-:-:1      ISCADD R107, R107, 0x1000, 0x4;
+--:-:-:-:1      XMAD R104, R8.reuse, R9, R104;
+--:-:-:Y:5      XMAD.MRG R20, R13.reuse, R8.H1.reuse, RZ;
+--:-:-:-:2      XMAD.PSL.CBCC R104, R8.H1, R5.H1, R104;
+--:-:1:-:4      TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf;
+--:-:-:-:1      IADD R108, R104, R1;
+--:-:-:-:1      XMAD R105, R13.reuse, R8, R104;
+--:-:2:Y:5      TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf;
+--:-:-:-:1      XMAD.PSL.CBCC R105, R13.H1, R20.H1, R105;
+--:-:3:-:1      LDS.U.128 R0, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R4, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R8, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R12, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R16, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R20, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R24, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R28, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R32, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R36, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R40, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R44, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R48, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R52, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R56, [R80+0x2000];
+--:-:3:-:1      LDS.U.128 R60, [R80+0x2000];
+01:-:-:-:1      STS.128 [R111], R96;
+--:-:-:-:0      IADD R104, R104, R109.reuse;
+02:-:-:-:1      STS.128 [R111+0x800], R100;
+--:-:-:-:0      IADD R108, R108, R109;
+04:-:-:-:5      BAR.SYNC 0x0;
+--:-:-:-:0      LOP.XOR R111, R111, 0x2000;
+--:-:-:-:1      LDS.U.128 R64, [R106];
+--:-:-:-:1      LDS.U.128 R72, [R107];
+--:-:-:-:1      LDS.U.128 R68, [R106+0x100];
+--:-:1:-:1      LDS.U.128 R76, [R107+0x100];
+TARGET1:
+--:-:-:-:1      ISETP.LE.AND P0, PT, R104, R105, PT;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R106+0x200];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R107+0x200];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R106+0x300];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R107+0x300];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:0      FFMA R11, R64.reuse, R74, R11;
+--:-:2:-:1  @P0 TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:0      FFMA R16, R66, R77.reuse, R16;
+--:-:3:-:1  @P0 TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf;
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1      LDS.U.128 R64, [R106+0x400];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R107+0x400];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R106+0x500];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R107+0x500];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R106+0x600];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R107+0x600];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R106+0x700];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R107+0x700];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1      LDS.U.128 R64, [R106+0x800];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R107+0x800];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R106+0x900];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R107+0x900];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R106+0xa00];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R107+0xa00];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R106+0xb00];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R107+0xb00];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1      LDS.U.128 R64, [R106+0xc00];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R107+0xc00];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R106+0xd00];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R107+0xd00];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R106+0xe00];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R107+0xe00];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R106+0xf00];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R107+0xf00];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:0      FFMA R10, R64.reuse, R75, R10;
+02:-:-:-:1  @P0 STS.128 [R111], R96;
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:0      FFMA R18, R64.reuse, R77.reuse, R18;
+04:-:-:-:1  @P0 STS.128 [R111+0x800], R100;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:0      FFMA R26, R64.reuse, R79, R26;
+01:-:-:-:5      BAR.SYNC 0x0;
+--:-:-:-:1  @P0 LOP.XOR R106, R106, 0x2000;
+--:-:-:-:1  @P0 LOP.XOR R107, R107, 0x2000;
+--:-:-:-:1  @P0 LOP.XOR R111, R111, 0x2000;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+--:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1  @P0 LDS.U.128 R64, [R106];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1  @P0 LDS.U.128 R72, [R107];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1  @P0 LDS.U.128 R68, [R106+0x100];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1  @P0 LDS.U.128 R76, [R107+0x100];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+--:-:-:-:1  @P0 IADD R104, R104, R109.reuse;
+--:-:-:-:0  @P0 IADD R108, R108, R109;
+--:-:-:Y:5  @P0 BRA TARGET1;
+--:-:-:-:1      SHR.U32 R84, R115, 0x2;
+--:-:-:-:1      MOV R77, c[0x0][0x158];
+--:-:-:-:1      SHR.U32 R80, R116.reuse, 0x1;
+--:-:-:-:1      MOV R72, c[0x0][0x15c];
+--:-:-:-:1      SHL R89, R116, 0x4;
+--:-:-:-:1      LOP.AND R106, R106, 0xfff;
+--:-:-:-:1      LOP.OR R84, R117, R84;
+--:-:-:-:1      SHL R81, R77.reuse, 0x2;
+--:-:-:-:1      LOP.AND R107, R107, 0xfff;
+--:-:-:-:1      ISCADD R80, R114, R80, 0x7;
+--:-:-:-:1      FMUL R64, R3, R72.reuse;
+--:-:-:-:1      SHL R74, R77.reuse, 0x4;
+--:-:-:-:1      LOP.OR R89, R89, R84;
+--:-:-:-:1      ISCADD R84, R113, R84, 0x7;
+--:-:-:-:1      FMUL R65, R7, R72.reuse;
+--:-:-:-:1      SHL R88, R77, 0x5;
+--:-:-:-:1      XMAD.MRG R78, R80.reuse, R77.H1.reuse, RZ;
+--:-:-:-:1      ISCADD R90, R107, R106, 0x5;
+--:-:-:-:1      FMUL R66, R1, R72.reuse;
+--:-:-:-:1      SHL R89, R89, 0x2;
+--:-:-:-:1      XMAD R73, R80, R77, R84;
+--:-:-:-:1      ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT;
+--:-:-:-:1      IADD R84, R84, 0x40;
+--:-:-:-:1      ISCADD R85, R77, -R74, 0x8;
+--:-:-:-:1      FMUL R67, R5, R72.reuse;
+--:-:-:-:1      FMUL R68, R35, R72.reuse;
+--:-:-:-:1      XMAD.PSL.CBCC R73, R80.H1, R78.H1, R73;
+--:-:-:-:1      IADD R80, R80, -0x1;
+--:-:-:-:1      ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT;
+--:-:-:-:1      FMUL R69, R39, R72.reuse;
+--:-:-:-:1      FMUL R70, R33, R72.reuse;
+--:-:-:-:1      FMUL R71, R37, R72;
+--:-:-:-:1      ISCADD R76, R73, c[0x0][0x140], 0x2;
+--:-:-:-:1      IADD R83, R80.reuse, 0x4;
+--:-:-:-:1      IADD R86, R80.reuse, 0x8;
+--:-:-:-:3      IADD R87, R80, 0xc;
+--:-:-:Y:6      IADD R76, R76, -R81;
+--:-:-:-:1      IADD R75, R76.reuse, R74;
+--:-:-:Y:5      IADD R79, R76, R88.reuse;
+--:-:-:-:0      IADD R82, R75, R88;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R2, R72.reuse;
+--:-:-:-:1      FMUL R65, R6, R72.reuse;
+--:-:-:-:1      FMUL R66, R0, R72.reuse;
+--:-:-:-:1      FMUL R67, R4, R72.reuse;
+--:-:-:-:1      FMUL R68, R34, R72.reuse;
+--:-:-:-:1      FMUL R69, R38, R72.reuse;
+--:-:-:-:1      FMUL R70, R32, R72.reuse;
+--:-:-:-:0      FMUL R71, R36, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R11, R72.reuse;
+--:-:-:-:1      FMUL R65, R15, R72.reuse;
+--:-:-:-:1      FMUL R66, R9, R72.reuse;
+--:-:-:-:1      FMUL R67, R13, R72.reuse;
+--:-:-:-:1      FMUL R68, R43, R72.reuse;
+--:-:-:-:1      FMUL R69, R47, R72.reuse;
+--:-:-:-:1      FMUL R70, R41, R72.reuse;
+--:-:-:-:0      FMUL R71, R45, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R10, R72.reuse;
+--:-:-:-:1      FMUL R65, R14, R72.reuse;
+--:-:-:-:1      FMUL R66, R8, R72.reuse;
+--:-:-:-:1      FMUL R67, R12, R72.reuse;
+--:-:-:-:1      FMUL R68, R42, R72.reuse;
+--:-:-:-:1      FMUL R69, R46, R72.reuse;
+--:-:-:-:1      FMUL R70, R40, R72.reuse;
+--:-:-:-:0      FMUL R71, R44, R72;
+--:-:-:-:5      CAL TARGET2;
+--:-:-:-:1      IADD R80, R80, 0x3c;
+--:-:-:-:1      IADD R83, R83, 0x3c;
+--:-:-:-:1      IADD R86, R86, 0x3c;
+--:-:-:-:1      IADD R87, R87, 0x3c;
+02:-:-:-:1      IADD R76, R76, R85.reuse;
+--:-:-:-:1      IADD R75, R75, R85.reuse;
+--:-:-:-:1      IADD R79, R79, R85.reuse;
+--:-:-:-:1      IADD R82, R82, R85;
+--:-:-:-:1      FMUL R64, R19, R72.reuse;
+--:-:-:-:1      FMUL R65, R23, R72.reuse;
+--:-:-:-:1      FMUL R66, R17, R72.reuse;
+--:-:-:-:1      FMUL R67, R21, R72.reuse;
+--:-:-:-:1      FMUL R68, R51, R72.reuse;
+--:-:-:-:1      FMUL R69, R55, R72.reuse;
+--:-:-:-:1      FMUL R70, R49, R72.reuse;
+--:-:-:-:0      FMUL R71, R53, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R18, R72.reuse;
+--:-:-:-:1      FMUL R65, R22, R72.reuse;
+--:-:-:-:1      FMUL R66, R16, R72.reuse;
+--:-:-:-:1      FMUL R67, R20, R72.reuse;
+--:-:-:-:1      FMUL R68, R50, R72.reuse;
+--:-:-:-:1      FMUL R69, R54, R72.reuse;
+--:-:-:-:1      FMUL R70, R48, R72.reuse;
+--:-:-:-:0      FMUL R71, R52, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R27, R72.reuse;
+--:-:-:-:1      FMUL R65, R31, R72.reuse;
+--:-:-:-:1      FMUL R66, R25, R72.reuse;
+--:-:-:-:1      FMUL R67, R29, R72.reuse;
+--:-:-:-:1      FMUL R68, R59, R72.reuse;
+--:-:-:-:1      FMUL R69, R63, R72.reuse;
+--:-:-:-:1      FMUL R70, R57, R72.reuse;
+--:-:-:-:0      FMUL R71, R61, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R26, R72.reuse;
+--:-:-:-:1      FMUL R65, R30, R72.reuse;
+--:-:-:-:1      FMUL R66, R24, R72.reuse;
+--:-:-:-:1      FMUL R67, R28, R72.reuse;
+--:-:-:-:1      FMUL R68, R58, R72.reuse;
+--:-:-:-:1      FMUL R69, R62, R72.reuse;
+--:-:-:-:1      FMUL R70, R56, R72.reuse;
+--:-:-:-:0      FMUL R71, R60, R72;
+--:-:-:-:5      CAL TARGET2;
+--:-:-:-:5      EXIT;
+TARGET2:  # Subroutine entered via CAL: pass R64-R71 through shared memory, then issue eight predicated STG.CG stores.
+--:-:-:-:0      IADD R80, R80, 0x1;  # bump index R80; the caller subtracted 1 from R80 before the first CAL
+--:-:-:-:1      STS.128 [R90], R64;  # store the values the caller placed in R64.. to [R90]
+--:-:-:-:0      IADD R83, R83, 0x1;  # R83/R86/R87 were set by the caller to R80+0x4/+0x8/+0xc
+--:-:-:-:1      STS.128 [R90+0x100], R68;  # store the values the caller placed in R68.. to [R90+0x100]
+--:-:-:-:0      IADD R86, R86, 0x1;
+--:-:-:-:1      LDS R64, [R89];  # reload R64-R71 one word each from [R89 + k*0x100], k = 0..7
+--:-:-:-:0      IADD R87, R87, 0x1;
+--:-:-:-:1      LDS R65, [R89+0x100];  # NOTE(review): no BAR.SYNC between the STS and LDS here - presumably each warp reads only its own data; confirm
+--:-:-:-:0      IADD R76, R76, R81.reuse;  # advance store address R76 by R81; the caller subtracted R81 from R76 before the first CAL
+--:-:-:-:1      LDS R66, [R89+0x200];
+--:-:-:-:0      IADD R75, R75, R81.reuse;  # R75/R79/R82 are the other three store addresses, advanced by the same R81
+--:-:-:-:1      LDS R67, [R89+0x300];
+--:-:-:-:0      IADD R79, R79, R81.reuse;
+--:-:-:-:1      LDS R68, [R89+0x400];
+--:-:-:-:0      IADD R82, R82, R81;
+--:-:-:-:1      LDS R69, [R89+0x500];
+--:-:-:-:1      ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5;  # P0 = (R80 < c[0x0][0x148]) AND P5; P5 is set by the caller
+--:-:-:-:1      LDS R70, [R89+0x600];
+--:-:-:-:1      ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6;  # P1 = same R80 test AND P6; P6 is set by the caller
+--:-:1:-:1      LDS R71, [R89+0x700];  # control field presumably sets barrier 1 on the LDS results - confirm against the assembler's control-code notation
+--:-:-:-:2      ISETP.LT.AND P2, PT, R83.reuse, c[0x0][0x148], P5;  # P2/P3: same pair of tests for index R83
+--:-:-:Y:7      ISETP.LT.AND P3, PT, R83, c[0x0][0x148], P6;
+01:-:-:-:1  @P0 STG.CG [R76], R64;  # leading 01 presumably waits on barrier 1 (LDS data ready) - confirm
+--:-:-:-:1      ISETP.LT.AND P0, PT, R86.reuse, c[0x0][0x148], P5;  # P0 is recomputed for index R86 after the store that used it was issued
+--:-:-:-:1  @P1 STG.CG [R76+0x100], R65;
+--:-:-:-:1      ISETP.LT.AND P1, PT, R86, c[0x0][0x148], P6;  # P1 recomputed for index R86
+--:-:-:-:1  @P2 STG.CG [R75], R66;
+--:-:-:-:1      ISETP.LT.AND P2, PT, R87.reuse, c[0x0][0x148], P5;  # P2/P3 recomputed for index R87
+--:-:-:-:1  @P3 STG.CG [R75+0x100], R67;
+--:-:-:Y:7      ISETP.LT.AND P3, PT, R87, c[0x0][0x148], P6;
+--:-:-:-:2  @P0 STG.CG [R79], R68;  # the last four stores use the predicates recomputed for R86 and R87
+--:-:-:-:2  @P1 STG.CG [R79+0x100], R69;
+--:-:-:-:2  @P2 STG.CG [R82], R70;
+--:2:-:-:1  @P3 STG.CG [R82+0x100], R71;  # control field presumably sets barrier 2; the instruction after each CAL starts with 02 before overwriting R64 - confirm
+--:-:-:-:5      RET;  # return to the instruction after the CAL
diff --git a/Assembler/PascalAs/sgemm/sgemm_final_64.sass b/Assembler/PascalAs/sgemm/sgemm_final_64.sass
new file mode 100644
index 0000000..815ae5d
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm_final_64.sass
@@ -0,0 +1,802 @@
+# Kernel: sgemm_kernel_64
+# Arch: sm_50
+# InsCnt: 779
+# RegCnt: 127
+# SharedSize: 8192
+# BarCnt: 1
+# Params(9):
+#	ord:addr:size:align
+#	0:0x140:4:0
+#	1:0x144:4:0
+#	2:0x148:4:0
+#	3:0x14c:4:0
+#	4:0x150:4:0
+#	5:0x154:4:0
+#	6:0x158:4:0
+#	7:0x15c:4:0
+#	8:0x160:4:0
+#
+# Instructions:
+
+--:-:1:-:1      S2R R119, SR_TID.X;
+--:-:2:-:1      S2R R125, SR_CTAID.X;
+--:-:3:-:1      S2R R122, SR_CTAID.Y;
+01:-:-:-:1      ISETP.GE.AND P0, PT, R119.reuse, 0x20, PT;
+--:-:-:-:1      LOP.AND R9, R119.reuse, 0xf;
+--:-:-:-:1      BFE.U32 R4, R119.reuse, 0x104;
+--:-:-:-:1      MOV R12, c[0x0][0x14c];
+--:-:-:-:1      BFE.U32 R114, R119.reuse, 0x301;
+--:-:-:-:1      LOP.AND R115, R119.reuse, 0x30;
+--:-:-:-:1      LOP.AND R0, R119.reuse, 0x1;
+--:-:-:-:1      SHL R13, R9, 0x4;
+--:-:-:-:1      LOP.AND R80, R119.reuse, -0x20;
+--:-:-:-:1      IADD R12, R12, -0x8;
+--:-:-:-:1      SHL R114, R114, 0x4;
+--:-:-:-:1      LOP.AND R126, R119, 0x1f;
+--:-:-:-:1      SHR.U32 R115, R115, 0x3;
+--:-:-:-:0 @!P0 MOV R2, c[0x0][0x150];
+--:-:-:-:1      STS.128 [R80+0x1000], RZ;
+--:-:-:-:1  @P0 MOV R2, c[0x0][0x154];
+--:-:-:-:1      ISCADD R118, R4, R13, 0x8;
+06:-:-:-:1      SEL R8, R122, R125, P0;
+--:-:-:-:1 @!P0 MOV32I R113, 0x80000001;
+--:-:-:-:1  @P0 MOV32I R113, 0x80000000;
+--:-:-:-:1      LOP.OR R115, R115, R0;
+--:-:-:-:1      SHR.U32 R1, R2.reuse, 0x2;
+--:-:-:-:1      LOP.AND R123, R119, 0x20;
+--:-:-:-:1      ISCADD R112, R8, R9, 0x4;
+--:-:-:-:1      IADD R121, R2, R2;
+--:-:-:-:1  @P0 IADD R118, R118, 0x800;
+--:-:-:-:1      ISCADD R115, R115, 0x800, 0x4;
+--:-:-:-:1      XMAD.MRG R5, R1.reuse, R4.H1.reuse, RZ;
+--:-:-:-:1      XMAD.MRG R16, R12.reuse, R1.H1.reuse, RZ;
+--:-:-:Y:6      XMAD R112, R1.reuse, R4, R112;
+--:-:-:-:2      XMAD.PSL.CBCC R112, R1.H1, R5.H1, R112;
+--:-:1:-:4      TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf;
+--:-:-:-:1      IADD3 R116, R112.reuse, R1.reuse, R1;
+--:-:-:-:1      IADD R120, R112, R2.reuse;
+--:-:2:-:1      TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf;
+--:-:-:-:0      XMAD R117, R12.reuse, R1, R112;
+--:-:3:-:3      TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf;
+--:-:-:-:2      IADD R124, R116, R2;
+--:-:4:-:1      TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf;
+--:-:-:-:1      XMAD.PSL.CBCC R117, R12.H1, R16.H1, R117;
+--:-:5:-:1      LDS.U.128 R0, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R4, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R8, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R12, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R16, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R20, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R24, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R28, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R32, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R36, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R40, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R44, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R48, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R52, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R56, [R80+0x1000];
+--:-:5:-:1      LDS.U.128 R60, [R80+0x1000];
+01:-:-:-:1      STS.128 [R118], R96;
+--:-:-:-:0      IADD R112, R112, R121.reuse;
+02:-:-:-:1      STS.128 [R118+0x200], R100;
+--:-:-:-:0      IADD R116, R116, R121.reuse;
+04:-:-:-:1      STS.128 [R118+0x400], R104;
+--:-:-:-:0      IADD R120, R120, R121.reuse;
+08:-:-:-:1      STS.128 [R118+0x600], R108;
+--:-:-:-:0      IADD R124, R124, R121;
+10:-:-:-:5      BAR.SYNC 0x0;
+--:-:-:-:0      LOP.XOR R118, R118, 0x1000;
+--:-:-:-:1      LDS.U.128 R64, [R114];
+--:-:-:-:1      LDS.U.128 R72, [R115];
+--:-:-:-:1      LDS.U.128 R68, [R114+0x80];
+--:-:1:-:1      LDS.U.128 R76, [R115+0x80];
+TARGET1:
+--:-:-:-:1      ISETP.LE.AND P0, PT, R112, R117, PT;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R114+0x100];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R115+0x100];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R114+0x180];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R115+0x180];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:0      FFMA R11, R64.reuse, R74, R11;
+--:-:-:-:1  @P0 TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:0      FFMA R16, R66, R77.reuse, R16;
+--:-:2:-:1  @P0 TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf;
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1      LDS.U.128 R64, [R114+0x200];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R115+0x200];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R114+0x280];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R115+0x280];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:0      FFMA R11, R80.reuse, R90, R11;
+--:-:-:-:1  @P0 TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:0      FFMA R16, R82, R93.reuse, R16;
+--:-:3:-:1  @P0 TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R114+0x300];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R115+0x300];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R114+0x380];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R115+0x380];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1      LDS.U.128 R64, [R114+0x400];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R115+0x400];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R114+0x480];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R115+0x480];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R114+0x500];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R115+0x500];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R114+0x580];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R115+0x580];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1      LDS.U.128 R64, [R114+0x600];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1      LDS.U.128 R72, [R115+0x600];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1      LDS.U.128 R68, [R114+0x680];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1      LDS.U.128 R76, [R115+0x680];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:0      FFMA R10, R80.reuse, R91, R10;
+02:-:-:-:1  @P0 STS.128 [R118], R96;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:0      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1  @P0 STS.128 [R118+0x200], R100;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
+--:-:-:-:1      LDS.U.128 R80, [R114+0x700];
+--:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
+--:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
+--:-:-:-:1      LDS.U.128 R88, [R115+0x700];
+--:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
+--:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
+--:-:-:-:1      LDS.U.128 R84, [R114+0x780];
+--:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
+--:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
+--:-:1:-:1      LDS.U.128 R92, [R115+0x780];
+--:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
+--:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
+--:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
+--:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
+--:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
+--:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
+--:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
+--:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
+--:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
+--:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
+--:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
+--:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
+--:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
+--:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
+--:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
+--:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
+--:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
+--:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
+--:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
+--:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
+--:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
+--:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
+--:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
+--:-:-:-:0      FFMA R10, R64.reuse, R75, R10;
+04:-:-:-:1  @P0 STS.128 [R118+0x400], R104;
+--:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
+--:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
+--:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
+--:-:-:-:0      FFMA R18, R64.reuse, R77.reuse, R18;
+--:-:-:-:1  @P0 STS.128 [R118+0x600], R108;
+--:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
+--:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
+--:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
+--:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
+--:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
+--:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
+--:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
+--:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
+--:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
+--:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
+--:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
+--:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
+--:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
+--:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
+--:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
+--:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
+--:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
+--:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
+--:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
+--:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
+--:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
+--:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
+--:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
+--:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
+--:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
+--:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
+--:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
+--:-:-:-:0      FFMA R26, R64.reuse, R79, R26;
+01:-:-:-:5      BAR.SYNC 0x0;
+--:-:-:-:1  @P0 LOP.XOR R114, R114, 0x1000;
+--:-:-:-:1  @P0 LOP.XOR R115, R115, 0x1000;
+--:-:-:-:1  @P0 LOP.XOR R118, R118, 0x1000;
+--:-:-:-:1      FFMA R27, R64, R78, R27;
+--:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
+--:-:-:-:1  @P0 LDS.U.128 R64, [R114];
+--:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
+--:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
+--:-:-:-:1  @P0 LDS.U.128 R72, [R115];
+--:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
+--:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
+--:-:-:-:1  @P0 LDS.U.128 R68, [R114+0x80];
+--:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
+--:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
+--:-:1:-:1  @P0 LDS.U.128 R76, [R115+0x80];
+--:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
+--:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
+--:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
+--:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
+--:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
+--:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
+--:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
+--:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
+--:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
+--:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
+--:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
+--:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
+--:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
+--:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
+--:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
+--:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
+--:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
+--:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
+--:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
+--:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
+--:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
+--:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
+--:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
+--:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
+--:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
+--:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
+--:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
+--:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
+--:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
+--:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
+--:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
+--:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
+--:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
+--:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
+--:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
+--:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
+--:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
+--:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
+--:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
+--:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
+--:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
+--:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
+--:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
+--:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
+--:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
+--:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
+--:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
+--:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
+--:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
+--:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
+--:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
+--:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
+--:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
+--:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
+--:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
+--:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
+--:-:-:-:1      FFMA R27, R80, R94, R27;
+--:-:-:-:1  @P0 IADD R112, R112, R121.reuse;
+--:-:-:-:1  @P0 IADD R116, R116, R121.reuse;
+--:-:-:-:1  @P0 IADD R120, R120, R121.reuse;
+--:-:-:-:0  @P0 IADD R124, R124, R121;
+--:-:-:Y:5  @P0 BRA TARGET1;
+--:-:-:-:1      SHR.U32 R80, R123.reuse, 0x1; // Store epilogue setup: builds the addresses, counters and predicates that TARGET2 consumes
+--:-:-:-:1      MOV R81, c[0x0][0x158]; // presumably ldc -- TODO confirm against this kernel's parameter list
+--:-:-:-:1      ISCADD R84, R125, R126.reuse, 0x6;
+--:-:-:-:1      MOV R72, c[0x0][0x15c]; // R72 multiplies every accumulator below (presumably alpha -- TODO confirm)
+--:-:-:-:1      ISCADD R92, R123, R126, 0x3;
+--:-:-:-:1      LOP.AND R114, R114, 0x7ff;
+--:-:-:-:1      ISCADD R80, R122, R80, 0x6;
+--:-:-:-:1      LOP.AND R115, R115, 0x7ff;
+--:-:-:-:1      SHL R77, R81.reuse, 0x2; // R77 = R81 * 4: the step TARGET2 adds to each store pointer per call
+--:-:-:-:1      ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT; // P5 gates the TARGET2 stores at offset 0
+--:-:-:-:1      SHL R89, R81.reuse, 0x4;
+--:-:-:-:1      FMUL R64, R3, R72;
+--:-:-:-:1      SHL R91, R81.reuse, 0x5;
+--:-:-:-:1      XMAD.MRG R74, R80.reuse, R81.H1.reuse, RZ;
+--:-:-:-:1      ISCADD R93, R115, R114, 0x4; // R93: shared-memory address TARGET2 writes R64-R71 to
+--:-:-:-:1      XMAD R73, R80, R81, R84;
+--:-:-:-:1      SHL R92, R92, 0x2; // R92: shared-memory address TARGET2 reads R64-R71 back from
+--:-:-:-:1      IADD R84, R84, 0x20;
+--:-:-:-:1      ISCADD R85, R81, -R89, 0x7; // R85 = R81 * 128 - R81 * 16; added to the store pointers after every fourth TARGET2 call
+--:-:-:-:1      FMUL R65, R7, R72.reuse;
+--:-:-:-:1      FMUL R66, R1, R72.reuse;
+--:-:-:-:1      XMAD.PSL.CBCC R73, R80.H1, R74.H1, R73;
+--:-:-:-:1      IADD R80, R80, -0x1; // pre-decrement: TARGET2 adds 1 before it tests R80
+--:-:-:-:1      ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT; // P6 (R84 is now +0x20) gates the TARGET2 stores at offset 0x80
+--:-:-:-:1      FMUL R67, R5, R72.reuse;
+--:-:-:-:1      FMUL R68, R35, R72.reuse;
+--:-:-:-:1      FMUL R69, R39, R72.reuse;
+--:-:-:-:1      ISCADD R76, R73, c[0x0][0x140], 0x2; // R76 = c[0x0][0x140] + R73 * 4: first store pointer
+--:-:-:-:1      IADD R86, R80.reuse, 0x4;
+--:-:-:-:1      IADD R87, R80.reuse, 0x8;
+--:-:-:-:1      IADD R88, R80, 0xc;
+--:-:-:-:1      FMUL R70, R33, R72.reuse;
+--:-:-:-:1      FMUL R71, R37, R72;
+--:-:-:Y:6      IADD R76, R76, -R77; // pre-subtract one step: TARGET2 adds R77 before it stores
+--:-:-:-:1      IADD R75, R76.reuse, R89;
+--:-:-:Y:5      IADD R78, R76, R91.reuse;
+--:-:-:-:0      IADD R79, R75, R91;
+--:-:-:-:5      CAL TARGET2; // write out the eight values just scaled into R64-R71
+02:-:-:-:1      FMUL R64, R2, R72.reuse; // Wait Dep 2, then scale the next eight accumulators by R72 into R64-R71
+--:-:-:-:1      FMUL R65, R6, R72.reuse;
+--:-:-:-:1      FMUL R66, R0, R72.reuse;
+--:-:-:-:1      FMUL R67, R4, R72.reuse;
+--:-:-:-:1      FMUL R68, R34, R72.reuse;
+--:-:-:-:1      FMUL R69, R38, R72.reuse;
+--:-:-:-:1      FMUL R70, R32, R72.reuse;
+--:-:-:-:0      FMUL R71, R36, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R11, R72.reuse;
+--:-:-:-:1      FMUL R65, R15, R72.reuse;
+--:-:-:-:1      FMUL R66, R9, R72.reuse;
+--:-:-:-:1      FMUL R67, R13, R72.reuse;
+--:-:-:-:1      FMUL R68, R43, R72.reuse;
+--:-:-:-:1      FMUL R69, R47, R72.reuse;
+--:-:-:-:1      FMUL R70, R41, R72.reuse;
+--:-:-:-:0      FMUL R71, R45, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R10, R72.reuse;
+--:-:-:-:1      FMUL R65, R14, R72.reuse;
+--:-:-:-:1      FMUL R66, R8, R72.reuse;
+--:-:-:-:1      FMUL R67, R12, R72.reuse;
+--:-:-:-:1      FMUL R68, R42, R72.reuse;
+--:-:-:-:1      FMUL R69, R46, R72.reuse;
+--:-:-:-:1      FMUL R70, R40, R72.reuse;
+--:-:-:-:0      FMUL R71, R44, R72;
+--:-:-:-:5      CAL TARGET2;
+--:-:-:-:1      IADD R80, R80, 0x1c; // four TARGET2 calls already added 4 to each counter; +0x1c makes the net step 0x20
+--:-:-:-:1      IADD R86, R86, 0x1c;
+--:-:-:-:1      IADD R87, R87, 0x1c;
+--:-:-:-:1      IADD R88, R88, 0x1c;
+02:-:-:-:1      IADD R76, R76, R85.reuse; // Wait Dep 2; four calls already added 4 * R77, adding R85 makes the net pointer step 0x20 * R77
+--:-:-:-:1      IADD R75, R75, R85.reuse;
+--:-:-:-:1      IADD R78, R78, R85.reuse;
+--:-:-:-:1      IADD R79, R79, R85;
+--:-:-:-:1      FMUL R64, R19, R72.reuse;
+--:-:-:-:1      FMUL R65, R23, R72.reuse;
+--:-:-:-:1      FMUL R66, R17, R72.reuse;
+--:-:-:-:1      FMUL R67, R21, R72.reuse;
+--:-:-:-:1      FMUL R68, R51, R72.reuse;
+--:-:-:-:1      FMUL R69, R55, R72.reuse;
+--:-:-:-:1      FMUL R70, R49, R72.reuse;
+--:-:-:-:0      FMUL R71, R53, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R18, R72.reuse;
+--:-:-:-:1      FMUL R65, R22, R72.reuse;
+--:-:-:-:1      FMUL R66, R16, R72.reuse;
+--:-:-:-:1      FMUL R67, R20, R72.reuse;
+--:-:-:-:1      FMUL R68, R50, R72.reuse;
+--:-:-:-:1      FMUL R69, R54, R72.reuse;
+--:-:-:-:1      FMUL R70, R48, R72.reuse;
+--:-:-:-:0      FMUL R71, R52, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R27, R72.reuse;
+--:-:-:-:1      FMUL R65, R31, R72.reuse;
+--:-:-:-:1      FMUL R66, R25, R72.reuse;
+--:-:-:-:1      FMUL R67, R29, R72.reuse;
+--:-:-:-:1      FMUL R68, R59, R72.reuse;
+--:-:-:-:1      FMUL R69, R63, R72.reuse;
+--:-:-:-:1      FMUL R70, R57, R72.reuse;
+--:-:-:-:0      FMUL R71, R61, R72;
+--:-:-:-:5      CAL TARGET2;
+02:-:-:-:1      FMUL R64, R26, R72.reuse;
+--:-:-:-:1      FMUL R65, R30, R72.reuse;
+--:-:-:-:1      FMUL R66, R24, R72.reuse;
+--:-:-:-:1      FMUL R67, R28, R72.reuse;
+--:-:-:-:1      FMUL R68, R58, R72.reuse;
+--:-:-:-:1      FMUL R69, R62, R72.reuse;
+--:-:-:-:1      FMUL R70, R56, R72.reuse;
+--:-:-:-:0      FMUL R71, R60, R72;
+--:-:-:-:5      CAL TARGET2; // eighth and last TARGET2 call: R0-R63 have each been scaled and passed on exactly once
+--:-:-:-:5      EXIT;
+TARGET2:
+--:-:-:-:0      IADD R80, R80, 0x1; // TARGET2: pass R64-R71 through shared memory, then store them with STG.CG under bounds predicates
+--:-:-:-:1      STS.128 [R93], R64; // stage R64-R67 in shared memory
+--:-:-:-:0      IADD R86, R86, 0x1; // the four counters R80/R86/R87/R88 each advance by 1 per call
+--:-:-:-:1      STS.128 [R93+0x80], R68; // stage R68-R71 in shared memory
+--:-:-:-:0      IADD R87, R87, 0x1;
+--:-:-:-:1      LDS R64, [R92]; // read eight words back from [R92], offsets stepping by 0x80
+--:-:-:-:0      IADD R88, R88, 0x1;
+--:-:-:-:1      LDS R65, [R92+0x80];
+--:-:-:-:0      IADD R76, R76, R77.reuse; // the four store pointers R76/R75/R78/R79 each advance by R77 per call
+--:-:-:-:1      LDS R66, [R92+0x100];
+--:-:-:-:0      IADD R75, R75, R77.reuse;
+--:-:-:-:1      LDS R67, [R92+0x180];
+--:-:-:-:0      IADD R78, R78, R77.reuse;
+--:-:-:-:1      LDS R68, [R92+0x200];
+--:-:-:-:0      IADD R79, R79, R77;
+--:-:-:-:1      LDS R69, [R92+0x280];
+--:-:-:-:1      ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5; // P0-P3: counter < c[0x0][0x148], ANDed with P5 (offset 0 store) or P6 (offset 0x80 store)
+--:-:-:-:1      LDS R70, [R92+0x300];
+--:-:-:-:1      ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6;
+--:-:1:-:1      LDS R71, [R92+0x380]; // Set Dep 1
+--:-:-:-:2      ISETP.LT.AND P2, PT, R86.reuse, c[0x0][0x148], P5;
+--:-:-:Y:7      ISETP.LT.AND P3, PT, R86, c[0x0][0x148], P6;
+01:-:-:-:1  @P0 STG.CG [R76], R64; // Wait Dep 1
+--:-:-:-:1      ISETP.LT.AND P0, PT, R87.reuse, c[0x0][0x148], P5; // P0-P3 are recomputed for R87/R88 after the store that used the old value
+--:-:-:-:1  @P1 STG.CG [R76+0x80], R65;
+--:-:-:-:1      ISETP.LT.AND P1, PT, R87, c[0x0][0x148], P6;
+--:-:-:-:1  @P2 STG.CG [R75], R66;
+--:-:-:-:1      ISETP.LT.AND P2, PT, R88.reuse, c[0x0][0x148], P5;
+--:-:-:-:1  @P3 STG.CG [R75+0x80], R67;
+--:-:-:Y:7      ISETP.LT.AND P3, PT, R88, c[0x0][0x148], P6;
+--:-:-:-:2  @P0 STG.CG [R78], R68;
+--:-:-:-:2  @P1 STG.CG [R78+0x80], R69;
+--:-:-:-:2  @P2 STG.CG [R79], R70;
+--:2:-:-:1  @P3 STG.CG [R79+0x80], R71; // carries barrier 2; callers wait on 02 before rewriting R64-R71 (presumably a read barrier -- TODO confirm)
+--:-:-:-:5      RET;
diff --git a/Assembler/PascalAs/sgemm/sgemm_pre_128.sass b/Assembler/PascalAs/sgemm/sgemm_pre_128.sass
new file mode 100644
index 0000000..cde320e
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm_pre_128.sass
@@ -0,0 +1,924 @@
+# Kernel: sgemm_kernel_128
+#
+# SharedSize: 16384
+# Params(8):
+#   0:0x140:4:4 param_C,
+#   1:0x144:4:0 param_m,
+#   2:0x148:4:0 param_n,
+#   3:0x14c:4:0 param_k,
+#   4:0x150:4:0 param_lda,
+#   5:0x154:4:0 param_ldb,
+#   6:0x158:4:0 param_ldc
+#   7:0x15c:4:0 param_alpha
+#   8:0x160:4:4 param_D // for diagnostic printf output
+#
+# Globals:
+#   c[0x0][0x164]: texA (the value is 1)
+#   c[0x0][0x168]: texB (the value is 0)
+
+<REGISTER_MAPPING>
+
+    // Temporary registers to calculate the state registers. Reuse the C output registers.
+    // These can be dynamically allocated (~) in the available register space to eliminate any register bank conflicts.
+    0-63    ~ blk, ldx, ldx2, ldx4, k, tid1, tid4, tid7, tid31_4, xmad_t0, xmad_end, bxOrig, byOrig, loy
+
+    // Aliases for the C registers we use for initializing C (used as vectors)
+    0-63    : cz<00-63>
+
+    // The offset at which we store our zero value for initializing C. Reuse a register from the second blocking registers
+    80      : zOffset
+
+    // 64 C matrix output registers.
+    // Use special mapping to avoid register bank conflicts between these registers and the blocking registers.
+     3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67>
+     7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67>
+     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67>
+     5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67>
+    35,34,43,42,51,50,59,58 : cx64y<00-03|64-67>
+    39,38,47,46,55,54,63,62 : cx65y<00-03|64-67>
+    33,32,41,40,49,48,57,56 : cx66y<00-03|64-67>
+    37,36,45,44,53,52,61,60 : cx67y<00-03|64-67>
+
+    // Double buffered register blocking used in vector loads.
+    // Any bank conflicts that we can't avoid in these registers we can hide with .reuse flags
+    64-79   : j0Ax<00-03|64-67>, j0By<00-03|64-67>
+    80-95   : j1Ax<00-03|64-67>, j1By<00-03|64-67>
+
+    // Registers to load A or B
+    96-103  : loadX<0-7>
+
+    // Key global state registers for main loop and some we reuse for outputting C.
+    // Note, tweaking the register banks of track<0|4>, tex, writeS, readBs, readAs impacts performance because of
+    // delayed bank conflicts between memory operations and ffmas.
+    // The array index bracket notation can be used to request a bank in a dynamically allocated range.
+    104-127 ~ track<0|4>[0], tex[2], readAs[2], readBs[3], writeS[3], end, ldx8, tid, bx, by, tid31, tid96, tid128 //, clock, smId, nSMs
+
+    // Registers to store the results back to global memory. Reuse any register not needed after the main loop.
+    // Statically allocate cs0-7 because they're vector registers.
+    64-71   : cs<0-7>
+
+    // dynamically allocated C output registers(~)
+    72-103  ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc60, writeCs, readCs, cx, ci, alpha, xmad_ci //, xmad_D, D, blckDimX, gridDimX
+
+</REGISTER_MAPPING>
+
+// Note the absence of the loading of the stack pointer into R1.
+// No idea why ptxas does that anyway when it's not used for register spilling.
+// Such a waste of a perfectly good register.
+
+// Scheduler doesn't handle the dependency flags yet,
+// so move these first instructions outside the block that's auto scheduled
+//--:-:-:-:1      CS2R clock, SR_CLOCKLO;
+//--:-:-:-:1      S2R smId, SR_VIRTID;
+//--:-:-:-:1      S2R nSMs, SR_VIRTCFG;
+--:-:1:-:1      S2R tid, SR_TID.X;   // Set Dep 1
+--:-:2:-:1      S2R bx,  SR_CTAID.X; // Set Dep 2
+--:-:3:-:1      S2R by,  SR_CTAID.Y; // Set Dep 3
+
+// Instructions in a SCHEDULE_BLOCK are automatically reordered and appropriately stalled for simple dependencies
+// Memory dependencies are left up to the author to deal with manually for now.
+01:-:-:Y:1      ISETP.GE.AND P0, PT, tid, 128, PT; // Wait Dep 1; P0 = tid >= 128 (those threads use texB/ldb below, the rest texA/lda)
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      BFE.U32 tid4, tid, 0x205; // 2 bits at position 5
+--:-:-:-:1      MOV k, c[0x0][0x14c]; // param_k
+--:-:-:-:1      BFE.U32 tid7, tid, 0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.AND tid128, tid, 128;
+--:-:-:-:1      LOP.AND readBs, tid, 0x70;
+--:-:-:-:1      SHL tid31_4, tid31, 4;
+--:-:-:-:1      LOP.AND tid1, tid, 1;
+--:-:-:-:1      IADD k, k, -8;
+--:-:-:-:1      LOP.AND zOffset, tid, -32;
+--:-:-:-:1      SHR.U32 readAs, tid128, 4;
+--:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:0 @!P0 MOV ldx4, c[0x0][0x150]; // param_lda
+--:-:-:-:1      STS.128 [zOffset + 4x<16*128>], RZ; // write the zeros that the cz loads below read back
+--:-:-:-:1  @P0 MOV ldx4, c[0x0][0x154]; // param_ldb
+--:-:-:-:1      ISCADD writeS, tid4, tid31_4, 9;
+06:-:-:-:1      SEL blk, by, bx, P0;               // Wait Dep 2 & 3
+--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA
+--:-:-:-:1  @P0 MOV32I tex, 0x80000000; // texB
+--:-:-:-:1      LOP.OR readAs, readAs, tid7;
+--:-:-:-:1      SHR.U32 ldx, ldx4, 2;
+--:-:-:-:1      LOP.OR readBs, readBs, tid1;
+--:-:-:-:1      ISCADD track0, blk, tid31, 5;
+--:-:-:-:1      IADD ldx8, ldx4, ldx4;
+--:-:-:-:1  @P0 IADD writeS, writeS, 4x<8*128>;
+--:-:-:-:1      SHL readAs, readAs, 4;
+--:-:-:-:1      XMAD.MRG xmad_t0, ldx, tid4.H1, RZ; // XMAD.LO is a macro that is expanded out into the 3 XMADs
+--:-:-:-:1      ISCADD readBs, readBs, 4x<8*128>, 4;
+--:-:-:-:1      XMAD track0, ldx, tid4, track0;
+--:-:-:Y:5      XMAD.MRG xmad_end, k, ldx.H1, RZ;
+--:-:-:-:2      XMAD.PSL.CBCC track0, ldx.H1, xmad_t0.H1, track0;
+--:-:1:-:4      TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1
+--:-:-:-:1      IADD track4, track0, ldx4;
+--:-:-:-:1      XMAD end, k, ldx, track0;
+--:-:2:Y:5      TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 2
+--:-:-:-:1      XMAD.PSL.CBCC end, k.H1, xmad_end.H1, end;
+
+// Initialize C registers to zero
+// Using LDS.U.128 is a neat trick to save a few clock cycles
+// (when you have enough warps to hide the latency.)
+--:-:3:-:1      LDS.U.128 cz00, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz04, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz08, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz12, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz16, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz20, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz24, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz28, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz32, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz36, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz40, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz44, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz48, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz52, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz56, [zOffset + 4x<16*128>];
+--:-:3:-:1      LDS.U.128 cz60, [zOffset + 4x<16*128>]; // Set Dep 3 (as on each of the 16 loads)
+
+// These instructions need to occur after the texture loads so put them in a new block
+// that starts with a dependency barrier wait.
+01:-:-:-:1      STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 1
+--:-:-:-:0      IADD track0, track0, ldx8;
+02:-:-:-:1      STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 2
+--:-:-:-:0      IADD track4, track4, ldx8;
+04:-:-:-:5      BAR.SYNC 0; // Wait Dep 3 (the cz loads above)
+
+// The next store to shared goes to high area.
+// Having 2 shared buffers allows us to eliminate a bar.sync in the main loop.
+// This way we don't have to wait for all threads to arrive before writing fresh data to shared.
+// Other threads can continue reading from the last batch while the new data is being written.
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<16*128>;
+
+// Preload the first lines of A and B from shared
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1
+
+
+// The main loop
+// While calculating the first line, load in the next line from shared.
+// Shared memory stores enough to do this 8 times per loop.
+// Also pull in the next block of memory from global and store it to shared.
+
+// Efficiency:
+// ffma: 512
+// lds:  32 dual issued
+// sts:  2  dual issued
+// tex:  2  dual issued
+// add:  2
+// xor:  3
+// setp: 1
+// bar:  1  dual issued
+// bra:  1  dual issued
+// Total: 518 (512/518 = 98.8% FFMA)
+
+// Memory Throughput Upper Bound:
+// 2 * 4 * 4 bytes per thread per 518 clocks
+// 128 threads per SM
+// 16 SM's (GM204)
+// 1640Mhz (boost overclock)
+// .931 GiB/GB  (1000^3 / 1024^3)
+// 193 GiB/sec
+// Available: 224 GiB/sec (or 256 GiB/sec overclocked at 8GHz)
+
+LOOP:
+
+// Loop end condition
+--:-:-:-:1      ISETP.LE.AND P0, PT, track0, end, PT;
+
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<1*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<1*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax64, [readAs + 4x<1*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By64, [readBs + 4x<1*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j0Ax66, j0By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j0Ax66, j0By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j0Ax64, j0By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j0Ax64, j0By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j0Ax67, j0By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j0Ax67, j0By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j0Ax65, j0By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j0Ax65, j0By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j0Ax67, j0By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j0Ax67, j0By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j0Ax65, j0By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j0Ax65, j0By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j0Ax66, j0By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j0Ax66, j0By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j0Ax64, j0By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j0Ax64, j0By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:0      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:2:-:1  @P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 2
+--:-:-:Y:1      FFMA cx02y64, j0Ax02, j0By64, cx02y64;
+--:-:-:-:0      FFMA cx02y65, j0Ax02, j0By65, cx02y65;
+--:-:3:-:1  @P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3
+--:-:-:-:1      FFMA cx00y65, j0Ax00, j0By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j0Ax00, j0By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j0Ax03, j0By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j0Ax03, j0By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j0Ax01, j0By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j0Ax01, j0By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j0Ax66, j0By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j0Ax66, j0By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j0Ax64, j0By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j0Ax64, j0By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j0Ax67, j0By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j0Ax67, j0By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j0Ax65, j0By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j0Ax65, j0By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j0Ax67, j0By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j0Ax67, j0By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j0Ax65, j0By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j0Ax65, j0By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j0Ax66, j0By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j0Ax66, j0By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j0Ax64, j0By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j0Ax64, j0By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j0Ax03, j0By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j0Ax03, j0By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j0Ax01, j0By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j0Ax01, j0By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j0Ax02, j0By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j0Ax02, j0By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j0Ax00, j0By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j0Ax00, j0By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<2*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<2*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax64, [readAs + 4x<2*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By64, [readBs + 4x<2*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j1Ax66, j1By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j1Ax66, j1By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j1Ax64, j1By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j1Ax64, j1By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j1Ax67, j1By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j1Ax67, j1By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j1Ax65, j1By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j1Ax65, j1By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j1Ax67, j1By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j1Ax67, j1By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j1Ax65, j1By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j1Ax65, j1By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j1Ax66, j1By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j1Ax66, j1By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j1Ax64, j1By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j1Ax64, j1By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j1Ax02, j1By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j1Ax02, j1By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j1Ax00, j1By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j1Ax00, j1By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j1Ax03, j1By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j1Ax03, j1By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j1Ax01, j1By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j1Ax01, j1By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j1Ax66, j1By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j1Ax66, j1By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j1Ax64, j1By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j1Ax64, j1By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j1Ax67, j1By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j1Ax67, j1By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j1Ax65, j1By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j1Ax65, j1By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j1Ax67, j1By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j1Ax67, j1By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j1Ax65, j1By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j1Ax65, j1By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j1Ax66, j1By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j1Ax66, j1By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j1Ax64, j1By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j1Ax64, j1By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j1Ax03, j1By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j1Ax03, j1By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j1Ax01, j1By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j1Ax01, j1By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j1Ax02, j1By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j1Ax02, j1By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j1Ax00, j1By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j1Ax00, j1By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<3*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<3*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax64, [readAs + 4x<3*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By64, [readBs + 4x<3*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j0Ax66, j0By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j0Ax66, j0By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j0Ax64, j0By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j0Ax64, j0By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j0Ax67, j0By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j0Ax67, j0By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j0Ax65, j0By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j0Ax65, j0By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j0Ax67, j0By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j0Ax67, j0By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j0Ax65, j0By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j0Ax65, j0By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j0Ax66, j0By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j0Ax66, j0By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j0Ax64, j0By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j0Ax64, j0By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j0Ax02, j0By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j0Ax02, j0By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j0Ax00, j0By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j0Ax00, j0By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j0Ax03, j0By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j0Ax03, j0By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j0Ax01, j0By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j0Ax01, j0By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j0Ax66, j0By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j0Ax66, j0By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j0Ax64, j0By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j0Ax64, j0By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j0Ax67, j0By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j0Ax67, j0By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j0Ax65, j0By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j0Ax65, j0By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j0Ax67, j0By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j0Ax67, j0By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j0Ax65, j0By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j0Ax65, j0By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j0Ax66, j0By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j0Ax66, j0By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j0Ax64, j0By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j0Ax64, j0By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j0Ax03, j0By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j0Ax03, j0By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j0Ax01, j0By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j0Ax01, j0By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j0Ax02, j0By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j0Ax02, j0By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j0Ax00, j0By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j0Ax00, j0By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<4*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<4*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax64, [readAs + 4x<4*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By64, [readBs + 4x<4*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j1Ax66, j1By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j1Ax66, j1By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j1Ax64, j1By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j1Ax64, j1By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j1Ax67, j1By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j1Ax67, j1By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j1Ax65, j1By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j1Ax65, j1By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j1Ax67, j1By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j1Ax67, j1By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j1Ax65, j1By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j1Ax65, j1By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j1Ax66, j1By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j1Ax66, j1By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j1Ax64, j1By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j1Ax64, j1By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j1Ax02, j1By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j1Ax02, j1By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j1Ax00, j1By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j1Ax00, j1By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j1Ax03, j1By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j1Ax03, j1By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j1Ax01, j1By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j1Ax01, j1By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j1Ax66, j1By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j1Ax66, j1By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j1Ax64, j1By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j1Ax64, j1By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j1Ax67, j1By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j1Ax67, j1By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j1Ax65, j1By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j1Ax65, j1By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j1Ax67, j1By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j1Ax67, j1By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j1Ax65, j1By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j1Ax65, j1By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j1Ax66, j1By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j1Ax66, j1By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j1Ax64, j1By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j1Ax64, j1By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j1Ax03, j1By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j1Ax03, j1By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j1Ax01, j1By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j1Ax01, j1By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j1Ax02, j1By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j1Ax02, j1By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j1Ax00, j1By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j1Ax00, j1By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<5*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<5*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax64, [readAs + 4x<5*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By64, [readBs + 4x<5*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j0Ax66, j0By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j0Ax66, j0By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j0Ax64, j0By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j0Ax64, j0By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j0Ax67, j0By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j0Ax67, j0By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j0Ax65, j0By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j0Ax65, j0By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j0Ax67, j0By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j0Ax67, j0By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j0Ax65, j0By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j0Ax65, j0By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j0Ax66, j0By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j0Ax66, j0By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j0Ax64, j0By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j0Ax64, j0By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j0Ax02, j0By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j0Ax02, j0By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j0Ax00, j0By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j0Ax00, j0By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j0Ax03, j0By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j0Ax03, j0By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j0Ax01, j0By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j0Ax01, j0By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j0Ax66, j0By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j0Ax66, j0By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j0Ax64, j0By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j0Ax64, j0By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j0Ax67, j0By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j0Ax67, j0By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j0Ax65, j0By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j0Ax65, j0By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j0Ax67, j0By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j0Ax67, j0By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j0Ax65, j0By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j0Ax65, j0By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j0Ax66, j0By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j0Ax66, j0By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j0Ax64, j0By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j0Ax64, j0By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j0Ax03, j0By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j0Ax03, j0By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j0Ax01, j0By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j0Ax01, j0By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j0Ax02, j0By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j0Ax02, j0By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j0Ax00, j0By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j0Ax00, j0By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<6*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<6*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax64, [readAs + 4x<6*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By64, [readBs + 4x<6*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j1Ax66, j1By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j1Ax66, j1By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j1Ax64, j1By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j1Ax64, j1By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j1Ax67, j1By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j1Ax67, j1By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j1Ax65, j1By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j1Ax65, j1By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j1Ax67, j1By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j1Ax67, j1By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j1Ax65, j1By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j1Ax65, j1By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j1Ax66, j1By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j1Ax66, j1By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j1Ax64, j1By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j1Ax64, j1By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j1Ax02, j1By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j1Ax02, j1By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j1Ax00, j1By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j1Ax00, j1By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j1Ax03, j1By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j1Ax03, j1By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j1Ax01, j1By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j1Ax01, j1By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j1Ax66, j1By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j1Ax66, j1By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j1Ax64, j1By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j1Ax64, j1By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j1Ax67, j1By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j1Ax67, j1By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j1Ax65, j1By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j1Ax65, j1By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j1Ax67, j1By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j1Ax67, j1By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j1Ax65, j1By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j1Ax65, j1By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j1Ax66, j1By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j1Ax66, j1By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j1Ax64, j1By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j1Ax64, j1By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j1Ax03, j1By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j1Ax03, j1By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j1Ax01, j1By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j1Ax01, j1By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j1Ax02, j1By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j1Ax02, j1By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j1Ax00, j1By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j1Ax00, j1By66, cx00y66;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<7*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<7*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax64, [readAs + 4x<7*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By64, [readBs + 4x<7*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j0Ax66, j0By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j0Ax66, j0By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j0Ax64, j0By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j0Ax64, j0By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j0Ax67, j0By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j0Ax67, j0By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j0Ax65, j0By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j0Ax65, j0By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j0Ax67, j0By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j0Ax67, j0By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j0Ax65, j0By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j0Ax65, j0By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j0Ax66, j0By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j0Ax66, j0By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j0Ax64, j0By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j0Ax64, j0By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:0      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+02:-:-:-:1  @P0 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 2
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j0Ax02, j0By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j0Ax02, j0By65, cx02y65;
+--:-:-:-:0      FFMA cx00y65, j0Ax00, j0By65, cx00y65;
+04:-:-:-:1  @P0 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 3
+--:-:-:-:1      FFMA cx00y64, j0Ax00, j0By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j0Ax03, j0By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j0Ax03, j0By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j0Ax01, j0By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j0Ax01, j0By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j0Ax66, j0By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j0Ax66, j0By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j0Ax64, j0By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j0Ax64, j0By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j0Ax67, j0By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j0Ax67, j0By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j0Ax65, j0By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j0Ax65, j0By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j0Ax67, j0By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j0Ax67, j0By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j0Ax65, j0By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j0Ax65, j0By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j0Ax66, j0By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j0Ax66, j0By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j0Ax64, j0By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j0Ax64, j0By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j0Ax03, j0By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j0Ax03, j0By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j0Ax01, j0By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j0Ax01, j0By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j0Ax02, j0By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j0Ax02, j0By67, cx02y67;
+--:-:-:-:0      FFMA cx00y67, j0Ax00, j0By67, cx00y67;
+01:-:-:-:5      BAR.SYNC 0;                            // Wait Dep 1
+--:-:-:-:1  @P0 LOP.XOR readAs, readAs, 4x<16*128>;
+--:-:-:-:1  @P0 LOP.XOR readBs, readBs, 4x<16*128>;
+--:-:-:-:1  @P0 LOP.XOR writeS, writeS, 4x<16*128>;
+--:-:-:-:1      FFMA cx00y66, j0Ax00, j0By66, cx00y66;
+--:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00;
+--:-:-:-:1  @P0 LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1  @P0 LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1  @P0 LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1  @P0 LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx66y00, j1Ax66, j1By00, cx66y00;
+--:-:-:-:1      FFMA cx66y01, j1Ax66, j1By01, cx66y01;
+--:-:-:-:1      FFMA cx64y01, j1Ax64, j1By01, cx64y01;
+--:-:-:-:1      FFMA cx64y00, j1Ax64, j1By00, cx64y00;
+--:-:-:-:1      FFMA cx67y00, j1Ax67, j1By00, cx67y00;
+--:-:-:-:1      FFMA cx67y01, j1Ax67, j1By01, cx67y01;
+--:-:-:-:1      FFMA cx65y01, j1Ax65, j1By01, cx65y01;
+--:-:-:-:1      FFMA cx65y00, j1Ax65, j1By00, cx65y00;
+--:-:-:-:1      FFMA cx67y02, j1Ax67, j1By02, cx67y02;
+--:-:-:-:1      FFMA cx67y03, j1Ax67, j1By03, cx67y03;
+--:-:-:-:1      FFMA cx65y03, j1Ax65, j1By03, cx65y03;
+--:-:-:-:1      FFMA cx65y02, j1Ax65, j1By02, cx65y02;
+--:-:-:-:1      FFMA cx66y02, j1Ax66, j1By02, cx66y02;
+--:-:-:-:1      FFMA cx66y03, j1Ax66, j1By03, cx66y03;
+--:-:-:-:1      FFMA cx64y03, j1Ax64, j1By03, cx64y03;
+--:-:-:-:1      FFMA cx64y02, j1Ax64, j1By02, cx64y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y64, j1Ax02, j1By64, cx02y64;
+--:-:-:-:1      FFMA cx02y65, j1Ax02, j1By65, cx02y65;
+--:-:-:-:1      FFMA cx00y65, j1Ax00, j1By65, cx00y65;
+--:-:-:-:1      FFMA cx00y64, j1Ax00, j1By64, cx00y64;
+--:-:-:-:1      FFMA cx03y64, j1Ax03, j1By64, cx03y64;
+--:-:-:-:1      FFMA cx03y65, j1Ax03, j1By65, cx03y65;
+--:-:-:-:1      FFMA cx01y65, j1Ax01, j1By65, cx01y65;
+--:-:-:-:1      FFMA cx01y64, j1Ax01, j1By64, cx01y64;
+--:-:-:-:1      FFMA cx66y64, j1Ax66, j1By64, cx66y64;
+--:-:-:-:1      FFMA cx66y65, j1Ax66, j1By65, cx66y65;
+--:-:-:-:1      FFMA cx64y65, j1Ax64, j1By65, cx64y65;
+--:-:-:-:1      FFMA cx64y64, j1Ax64, j1By64, cx64y64;
+--:-:-:-:1      FFMA cx67y64, j1Ax67, j1By64, cx67y64;
+--:-:-:-:1      FFMA cx67y65, j1Ax67, j1By65, cx67y65;
+--:-:-:-:1      FFMA cx65y65, j1Ax65, j1By65, cx65y65;
+--:-:-:-:1      FFMA cx65y64, j1Ax65, j1By64, cx65y64;
+--:-:-:-:1      FFMA cx67y66, j1Ax67, j1By66, cx67y66;
+--:-:-:-:1      FFMA cx67y67, j1Ax67, j1By67, cx67y67;
+--:-:-:-:1      FFMA cx65y67, j1Ax65, j1By67, cx65y67;
+--:-:-:-:1      FFMA cx65y66, j1Ax65, j1By66, cx65y66;
+--:-:-:-:1      FFMA cx66y66, j1Ax66, j1By66, cx66y66;
+--:-:-:-:1      FFMA cx66y67, j1Ax66, j1By67, cx66y67;
+--:-:-:-:1      FFMA cx64y67, j1Ax64, j1By67, cx64y67;
+--:-:-:-:1      FFMA cx64y66, j1Ax64, j1By66, cx64y66;
+--:-:-:-:1      FFMA cx03y66, j1Ax03, j1By66, cx03y66;
+--:-:-:-:1      FFMA cx03y67, j1Ax03, j1By67, cx03y67;
+--:-:-:-:1      FFMA cx01y67, j1Ax01, j1By67, cx01y67;
+--:-:-:-:1      FFMA cx01y66, j1Ax01, j1By66, cx01y66;
+--:-:-:-:1      FFMA cx02y66, j1Ax02, j1By66, cx02y66;
+--:-:-:-:1      FFMA cx02y67, j1Ax02, j1By67, cx02y67;
+--:-:-:-:1      FFMA cx00y67, j1Ax00, j1By67, cx00y67;
+--:-:-:-:1      FFMA cx00y66, j1Ax00, j1By66, cx00y66;
+--:-:-:-:1  @P0 IADD track0, track0, ldx8;
+--:-:-:-:0  @P0 IADD track4, track4, ldx8;
+--:-:-:Y:5  @P0 BRA LOOP;
+
+// Main loop is done, time to write C to global memory.
+--:-:-:-:1      SHR.U32 cx, tid128, 2; // cx = tid128 >> 2
+--:-:-:-:1      MOV ldc, c[0x0][0x158]; // NOTE(review): presumably param_ldc - confirm against this kernel's Params header
+--:-:-:-:1      SHR.U32 cy00, tid96, 1; // cy00 = tid96 >> 1
+--:-:-:-:1      MOV alpha, c[0x0][0x15c]; // scale factor applied to every accumulator below
+--:-:-:-:1      SHL readCs, tid96, 4; // readCs = tid96 << 4
+--:-:-:-:1      LOP.AND readAs, readAs, 0xfff; // keep only the low 12 bits of the shared read offset
+--:-:-:-:1      LOP.OR cx, tid31, cx; // cx = tid31 | (tid128 >> 2)
+--:-:-:-:1      SHL ldc1, ldc, 2; // ldc1 = 4*ldc; STORE_C adds this to each Cy pointer per row
+--:-:-:-:1      LOP.AND readBs, readBs, 0xfff; // keep only the low 12 bits of the shared read offset
+--:-:-:-:1      ISCADD cy00, by, cy00, 7; // cy00 = (by << 7) + cy00
+--:-:-:-:1      FMUL cs0, cx00y00, alpha; // cs0-cs7 = alpha * accumulator row y00 (interleaved with the setup)
+--:-:-:-:1      SHL ldc4, ldc, 4; // ldc4 = 16*ldc = 4*ldc1
+--:-:-:-:1      LOP.OR readCs, readCs, cx; // uses cx before the block offset is added on the next line
+--:-:-:-:1      ISCADD cx, bx, cx, 7; // cx = (bx << 7) + cx
+--:-:-:-:1      FMUL cs1, cx01y00, alpha;
+--:-:-:-:1      SHL ldc8, ldc, 5; // ldc8 = 32*ldc = 8*ldc1
+--:-:-:-:1      XMAD.MRG xmad_ci, cy00, ldc.H1, RZ; // first of three XMADs; together they appear to form ci = cy00*ldc + cx
+--:-:-:-:1      ISCADD writeCs, readBs, readAs, 5; // writeCs = (readBs << 5) + readAs
+--:-:-:-:1      FMUL cs2, cx02y00, alpha;
+--:-:-:-:1      SHL readCs, readCs, 2; // readCs *= 4
+--:-:-:-:1      XMAD ci, cy00, ldc, cx;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx +  0 < m
+--:-:-:-:1      IADD cx, cx, 64; // move to the second column group for the P6 test
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8; // ldc60 = 256*ldc - 16*ldc = 240*ldc = 60*ldc1
+--:-:-:-:1      FMUL cs3, cx03y00, alpha;
+--:-:-:-:1      FMUL cs4, cx64y00, alpha;
+--:-:-:-:1      XMAD.PSL.CBCC ci, cy00.H1, xmad_ci.H1, ci; // last of the three XMADs producing ci
+--:-:-:-:1      IADD cy00, cy00, -1; // pre-decrement: STORE_C adds 1 to cy00 before comparing it
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 64 < m
+--:-:-:-:1      FMUL cs5, cx65y00, alpha;
+--:-:-:-:1      FMUL cs6, cx66y00, alpha;
+--:-:-:-:1      FMUL cs7, cx67y00, alpha;
+--:-:-:-:1      ISCADD Cy00, ci, c[0x0][0x140], 2; // Cy00 = (ci << 2) + c[0x0][0x140]
+--:-:-:-:1      IADD cy04, cy00, 4; // cy04/cy08/cy12 = cy00 + 4/8/12
+--:-:-:-:1      IADD cy08, cy00, 8;
+--:-:-:-:3      IADD cy12, cy00, 12;
+--:-:-:Y:6      IADD Cy00, Cy00, -ldc1; // pre-decrement: STORE_C adds ldc1 to Cy00 before storing
+--:-:-:-:1      IADD Cy04, Cy00, ldc4; // Cy04 = Cy00 + 4*ldc1
+--:-:-:Y:5      IADD Cy08, Cy00, ldc8; // Cy08 = Cy00 + 8*ldc1
+--:-:-:-:0      IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering)
+
+// There's nothing yet in place to handle dependencies with subroutines.
+// So don't schedule this block.
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y01, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y01, alpha;
+--:-:-:-:1      FMUL cs2, cx02y01, alpha;
+--:-:-:-:1      FMUL cs3, cx03y01, alpha;
+--:-:-:-:1      FMUL cs4, cx64y01, alpha;
+--:-:-:-:1      FMUL cs5, cx65y01, alpha;
+--:-:-:-:1      FMUL cs6, cx66y01, alpha;
+--:-:-:-:0      FMUL cs7, cx67y01, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y02, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y02, alpha;
+--:-:-:-:1      FMUL cs2, cx02y02, alpha;
+--:-:-:-:1      FMUL cs3, cx03y02, alpha;
+--:-:-:-:1      FMUL cs4, cx64y02, alpha;
+--:-:-:-:1      FMUL cs5, cx65y02, alpha;
+--:-:-:-:1      FMUL cs6, cx66y02, alpha;
+--:-:-:-:0      FMUL cs7, cx67y02, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y03, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y03, alpha;
+--:-:-:-:1      FMUL cs2, cx02y03, alpha;
+--:-:-:-:1      FMUL cs3, cx03y03, alpha;
+--:-:-:-:1      FMUL cs4, cx64y03, alpha;
+--:-:-:-:1      FMUL cs5, cx65y03, alpha;
+--:-:-:-:1      FMUL cs6, cx66y03, alpha;
+--:-:-:-:0      FMUL cs7, cx67y03, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+--:-:-:-:1      IADD cy00, cy00, 60; // the four STORE_C calls above added 4; +60 puts the row indices 64 past their start, for the y64-y67 accumulators
+--:-:-:-:1      IADD cy04, cy04, 60;
+--:-:-:-:1      IADD cy08, cy08, 60;
+--:-:-:-:1      IADD cy12, cy12, 60;
+
+02:-:-:-:1      IADD Cy00, Cy00, ldc60; // Wait Dep 2
+--:-:-:-:1      IADD Cy04, Cy04, ldc60; // ldc60 = 60*ldc1, the pointer step matching the +60 on the row indices
+--:-:-:-:1      IADD Cy08, Cy08, ldc60;
+--:-:-:-:1      IADD Cy12, Cy12, ldc60;
+
+--:-:-:-:1      FMUL cs0, cx00y64, alpha; // no barrier wait here: the IADD on Cy00 above already waited on Dep 2
+--:-:-:-:1      FMUL cs1, cx01y64, alpha;
+--:-:-:-:1      FMUL cs2, cx02y64, alpha;
+--:-:-:-:1      FMUL cs3, cx03y64, alpha;
+--:-:-:-:1      FMUL cs4, cx64y64, alpha;
+--:-:-:-:1      FMUL cs5, cx65y64, alpha;
+--:-:-:-:1      FMUL cs6, cx66y64, alpha;
+--:-:-:-:0      FMUL cs7, cx67y64, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y65, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y65, alpha;
+--:-:-:-:1      FMUL cs2, cx02y65, alpha;
+--:-:-:-:1      FMUL cs3, cx03y65, alpha;
+--:-:-:-:1      FMUL cs4, cx64y65, alpha;
+--:-:-:-:1      FMUL cs5, cx65y65, alpha;
+--:-:-:-:1      FMUL cs6, cx66y65, alpha;
+--:-:-:-:0      FMUL cs7, cx67y65, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y66, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y66, alpha;
+--:-:-:-:1      FMUL cs2, cx02y66, alpha;
+--:-:-:-:1      FMUL cs3, cx03y66, alpha;
+--:-:-:-:1      FMUL cs4, cx64y66, alpha;
+--:-:-:-:1      FMUL cs5, cx65y66, alpha;
+--:-:-:-:1      FMUL cs6, cx66y66, alpha;
+--:-:-:-:0      FMUL cs7, cx67y66, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+02:-:-:-:1      FMUL cs0, cx00y67, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y67, alpha;
+--:-:-:-:1      FMUL cs2, cx02y67, alpha;
+--:-:-:-:1      FMUL cs3, cx03y67, alpha;
+--:-:-:-:1      FMUL cs4, cx64y67, alpha;
+--:-:-:-:1      FMUL cs5, cx65y67, alpha;
+--:-:-:-:1      FMUL cs6, cx66y67, alpha;
+--:-:-:-:0      FMUL cs7, cx67y67, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+
+// And we're done.  The remainder is the STORE_C subroutine that's defined at the end of the kernel.
+--:-:-:-:5      EXIT;
+
+// This routine does warp synchronous shuffling of our output data so as to be able
+// to have coalesced writes to global memory.  This is actually faster because the shared
+// memory latencies can be hidden by other warps and we're only adding a few extra clocks
+// to this thread.  Global memory here is the bottleneck and being able to halve the needed
+// bandwidth at the expense of a few clocks is a modest win.  This also keeps power lower
+// and our chip running faster.
+
+// Note, the SHFL instruction doesn't help us here because we're swapping different registers
+// from different threads.
+STORE_C:
+// Per call: add 1 to cy00/04/08/12 and ldc1 to Cy00/04/08/12, pass cs0-cs7 through shared memory (writeCs -> readCs), then issue eight predicated STG.CG stores.
+--:-:-:-:0      IADD cy00, cy00, 1; // the row-index and pointer increments are interleaved with the shared-memory traffic
+--:-:-:-:1      STS.128 [writeCs+4x<00>], cs0; // 128-bit store starting at cs0 (presumably cs0-cs3)
+--:-:-:-:0      IADD cy04, cy04, 1;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], cs4; // 128-bit store starting at cs4 (presumably cs4-cs7)
+--:-:-:-:0      IADD cy08, cy08, 1;
+--:-:-:-:1      LDS cs0, [readCs + 4x<0*128 + 00>]; // reload cs0-cs7 one word at a time from readCs
+--:-:-:-:0      IADD cy12, cy12, 1;
+--:-:-:-:1      LDS cs1, [readCs + 4x<0*128 + 64>];
+--:-:-:-:0      IADD Cy00, Cy00, ldc1;
+--:-:-:-:1      LDS cs2, [readCs + 4x<1*128 + 00>];
+--:-:-:-:0      IADD Cy04, Cy04, ldc1;
+--:-:-:-:1      LDS cs3, [readCs + 4x<1*128 + 64>];
+--:-:-:-:0      IADD Cy08, Cy08, ldc1;
+--:-:-:-:1      LDS cs4, [readCs + 4x<2*128 + 00>];
+--:-:-:-:0      IADD Cy12, Cy12, ldc1;
+--:-:-:-:1      LDS cs5, [readCs + 4x<2*128 + 64>];
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx +  0 < m
+--:-:-:-:1      LDS cs6, [readCs + 4x<3*128 + 00>];
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 64 < m
+--:-:1:-:1      LDS cs7, [readCs + 4x<3*128 + 64>]; // Set Dep 1
+--:-:-:-:2      ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx +  0 < m
+--:-:-:Y:7      ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 64 < m
+01:-:-:-:1  @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx +  0 < m
+--:-:-:-:1  @P1 STG.CG [Cy00 + 4x<64>], cs1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 64 < m
+--:-:-:-:1  @P2 STG.CG [Cy04 + 4x<00>], cs2;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx +  0 < m
+--:-:-:-:1  @P3 STG.CG [Cy04 + 4x<64>], cs3;
+--:-:-:Y:7      ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 64 < m
+--:-:-:-:2  @P0 STG.CG [Cy08 + 4x<00>], cs4; // P0-P3 were recomputed above for the cy08/cy12 rows
+--:-:-:-:2  @P1 STG.CG [Cy08 + 4x<64>], cs5;
+--:-:-:-:2  @P2 STG.CG [Cy12 + 4x<00>], cs6;
+--:2:-:-:1  @P3 STG.CG [Cy12 + 4x<64>], cs7; // Set Dep 2
+
+--:-:-:-:5      RET; // callers wait on Dep 2 before overwriting cs0-cs7 or the Cy pointers
+
diff --git a/Assembler/PascalAs/sgemm/sgemm_pre_64.sass b/Assembler/PascalAs/sgemm/sgemm_pre_64.sass
new file mode 100644
index 0000000..aa2719e
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm_pre_64.sass
@@ -0,0 +1,867 @@
+# Kernel: sgemm_kernel_64
+#
+# SharedSize: 8192
+# Params(9):
+#   0:0x140:4:4 param_C,
+#   1:0x144:4:0 param_m,
+#   2:0x148:4:0 param_n,
+#   3:0x14c:4:0 param_k,
+#   4:0x150:4:0 param_lda,
+#   5:0x154:4:0 param_ldb,
+#   6:0x158:4:0 param_ldc
+#   7:0x15c:4:0 param_alpha
+#   8:0x160:4:4 param_D // for diagnostic printf output
+#
+# Globals:
+#   c[0x0][0x164]: texA (the value is 1)
+#   c[0x0][0x168]: texB (the value is 0)
+// Register map: cz<00-63> names the same registers 0-63 as the cx..y.. accumulators and the setup temporaries; cs<0-7> (64-71) and the 72-111 pool reuse the j0*/j1*/loadX* registers. Presumably ':' pins names in listed order and '~' lets the assembler choose within the range (maxas convention) -- TODO confirm for this fork.
+<REGISTER_MAPPING>
+
+    0-63    ~ blk, ldx, ldx4, k, tid1, tid2, tid15, tid15_4, xmad_t0, xmad_end
+
+    80      : zOffset
+    0-63    : cz<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx00y<00-03|32-35>
+     7, 6,15,14,23,22,31,30 : cx01y<00-03|32-35>
+     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|32-35>
+     5, 4,13,12,21,20,29,28 : cx03y<00-03|32-35>
+    35,34,43,42,51,50,59,58 : cx32y<00-03|32-35>
+    39,38,47,46,55,54,63,62 : cx33y<00-03|32-35>
+    33,32,41,40,49,48,57,56 : cx34y<00-03|32-35>
+    37,36,45,44,53,52,61,60 : cx35y<00-03|32-35>
+
+    64-79   : j0Ax<00-03|32-35>, j0By<00-03|32-35>
+    80-95   : j1Ax<00-03|32-35>, j1By<00-03|32-35>
+
+    64-71   : cs<0-7>
+
+    96-111  : loadX0<0-3>, loadX2<0-3>, loadX4<0-3>, loadX6<0-3>
+
+    112-127 ~ track<0|2|4|6>[0], tex[1], readAs[2], readBs[3], writeS[2], end, ldx8, tid, bx, by, tid31, tid32
+
+    72-111  ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc28, writeCs, readCs, cx, ci, xmad_ci, alpha, xmadD, D, blckDimX, gridDimX
+
+</REGISTER_MAPPING>
+// Prologue: derive per-thread indices, issue the first four texture loads, zero the accumulators, stage the loaded vectors in shared memory, and preload the j0 operands for LOOP.
+--:-:1:-:1      S2R tid, SR_TID.X;   // Set Dep 1
+--:-:2:-:1      S2R bx,  SR_CTAID.X; // Set Dep 2
+--:-:3:-:1      S2R by,  SR_CTAID.Y; // Set Dep 3
+// P0 = (tid >= 32). Threads with !P0 take lda and texA, threads with P0 take ldb and texB; blk is selected from by/bx on P0.
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT; // Wait Dep 1
+--:-:-:-:1      LOP.AND tid15, tid, 15;
+--:-:-:-:1      BFE.U32 tid2, tid, 0x104; // 1 bit at position 4
+--:-:-:-:1      MOV k, c[0x0][0x14c]; // param_k
+--:-:-:-:1      BFE.U32 readAs, tid, 0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.AND readBs, tid, 0x30;
+--:-:-:-:1      LOP.AND tid1, tid, 1;
+--:-:-:-:1      SHL tid15_4, tid15, 4;
+--:-:-:-:1      LOP.AND zOffset, tid, -32;
+--:-:-:-:1      IADD k, k, -8;
+--:-:-:-:1      SHL readAs, readAs, 4;
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:0 @!P0 MOV ldx4, c[0x0][0x150]; // param_lda
+--:-:-:-:1      STS.128 [zOffset + 4x<16*64>], RZ; // zero vector, read back below to clear cz<00-63>
+--:-:-:-:1  @P0 MOV ldx4, c[0x0][0x154]; // param_ldb
+--:-:-:-:1      ISCADD writeS, tid2, tid15_4, 8;
+06:-:-:-:1      SEL blk, by, bx, P0;              // Wait Dep 2 & 3
+--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA
+--:-:-:-:1  @P0 MOV32I tex, 0x80000000; // texB
+--:-:-:-:1      LOP.OR readBs, readBs, tid1;
+--:-:-:-:1      SHR.U32 ldx, ldx4, 2;
+--:-:-:-:1      LOP.AND tid32, tid, 32;
+--:-:-:-:1      ISCADD track0, blk, tid15, 4;
+--:-:-:-:1      IADD ldx8, ldx4, ldx4;
+--:-:-:-:1  @P0 IADD writeS, writeS, 4x<8*64>;
+--:-:-:-:1      ISCADD readBs, readBs, 4x<8*64>, 4;
+--:-:-:-:1      XMAD.MRG xmad_t0, ldx, tid2.H1, RZ;
+--:-:-:-:1      XMAD.MRG xmad_end, k, ldx.H1, RZ;
+--:-:-:Y:6      XMAD track0, ldx, tid2, track0;
+--:-:-:-:2      XMAD.PSL.CBCC track0, ldx.H1, xmad_t0.H1, track0;
+--:-:1:-:4      TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1
+--:-:-:-:1      IADD3 track2, track0, ldx, ldx;
+--:-:-:-:1      IADD track4, track0, ldx4;
+--:-:2:-:1      TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2
+--:-:-:-:0      XMAD end, k, ldx, track0;
+--:-:3:-:3      TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3
+--:-:-:-:2      IADD track6, track2, ldx4;
+--:-:4:-:1      TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 4
+--:-:-:-:1      XMAD.PSL.CBCC end, k.H1, xmad_end.H1, end;
+// Clear all 64 accumulator registers by reading back the RZ vector stored at [zOffset + 4x<16*64>]; every load here signals Dep 5.
+--:-:5:-:1      LDS.U.128 cz00, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz04, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz08, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz12, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz16, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz20, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz24, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz28, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz32, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz36, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz40, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz44, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz48, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz52, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz56, [zOffset + 4x<16*64>];
+--:-:5:-:1      LDS.U.128 cz60, [zOffset + 4x<16*64>];
+// Store each loaded vector to shared memory once its texture load has completed, and advance each track pointer by ldx8.
+01:-:-:-:1      STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 1
+--:-:-:-:0      IADD track0, track0, ldx8;
+02:-:-:-:1      STS.128 [writeS + 4x<2*64>], loadX2; // Wait Dep 2
+--:-:-:-:0      IADD track2, track2, ldx8;
+04:-:-:-:1      STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3
+--:-:-:-:0      IADD track4, track4, ldx8;
+08:-:-:-:1      STS.128 [writeS + 4x<6*64>], loadX6; // Wait Dep 4
+--:-:-:-:0      IADD track6, track6, ldx8;
+10:-:-:-:5      BAR.SYNC 0; // Wait Dep 5
+
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<16*64>; // flip writeS by 4x<16*64>; LOOP ends with the same XOR on readAs/readBs/writeS (presumably double buffering)
+// Preload the first A/B operand vectors (j0*) for LOOP; the loop's first FFMA waits on Dep 1.
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1
+
+// Efficiency:
+// ffma: 512
+// lds:  32 dual issued
+// sts:  4  dual issued
+// tex:  4  dual issued
+// add:  4
+// xor:  3
+// setp: 1
+// bar:  1  dual issued
+// bra:  1  dual issued
+// Total: 520 (512/520 = 98.5% FFMA)
+
+LOOP:
+
+// Loop end condition
+--:-:-:-:1      ISETP.LE.AND P0, PT, track0, end, PT;
+
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<1*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<1*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax32, [readAs + 4x<1*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By32, [readBs + 4x<1*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j0Ax34, j0By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j0Ax34, j0By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j0Ax32, j0By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j0Ax32, j0By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j0Ax35, j0By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j0Ax35, j0By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j0Ax33, j0By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j0Ax33, j0By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j0Ax35, j0By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j0Ax35, j0By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j0Ax33, j0By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j0Ax33, j0By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j0Ax34, j0By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j0Ax34, j0By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j0Ax32, j0By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j0Ax32, j0By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:0      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:-:1  @P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf;
+--:-:-:Y:1      FFMA cx02y32, j0Ax02, j0By32, cx02y32;
+--:-:-:-:0      FFMA cx02y33, j0Ax02, j0By33, cx02y33;
+--:-:2:-:1  @P0 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2
+--:-:-:-:1      FFMA cx00y33, j0Ax00, j0By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j0Ax00, j0By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j0Ax03, j0By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j0Ax03, j0By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j0Ax01, j0By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j0Ax01, j0By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j0Ax34, j0By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j0Ax34, j0By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j0Ax32, j0By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j0Ax32, j0By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j0Ax35, j0By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j0Ax35, j0By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j0Ax33, j0By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j0Ax33, j0By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j0Ax35, j0By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j0Ax35, j0By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j0Ax33, j0By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j0Ax33, j0By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j0Ax34, j0By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j0Ax34, j0By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j0Ax32, j0By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j0Ax32, j0By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j0Ax03, j0By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j0Ax03, j0By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j0Ax01, j0By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j0Ax01, j0By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j0Ax02, j0By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j0Ax02, j0By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j0Ax00, j0By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j0Ax00, j0By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<2*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<2*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<2*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<2*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j1Ax34, j1By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j1Ax34, j1By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j1Ax32, j1By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j1Ax32, j1By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j1Ax35, j1By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j1Ax35, j1By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j1Ax33, j1By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j1Ax33, j1By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j1Ax35, j1By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j1Ax35, j1By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j1Ax33, j1By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j1Ax33, j1By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j1Ax34, j1By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j1Ax34, j1By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j1Ax32, j1By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j1Ax32, j1By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:0      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:-:1  @P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf;
+--:-:-:Y:1      FFMA cx02y32, j1Ax02, j1By32, cx02y32;
+--:-:-:-:0      FFMA cx02y33, j1Ax02, j1By33, cx02y33;
+--:-:3:-:1  @P0 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 3
+--:-:-:-:1      FFMA cx00y33, j1Ax00, j1By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j1Ax00, j1By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j1Ax03, j1By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j1Ax03, j1By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j1Ax01, j1By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j1Ax01, j1By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j1Ax34, j1By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j1Ax34, j1By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j1Ax32, j1By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j1Ax32, j1By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j1Ax35, j1By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j1Ax35, j1By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j1Ax33, j1By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j1Ax33, j1By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j1Ax35, j1By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j1Ax35, j1By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j1Ax33, j1By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j1Ax33, j1By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j1Ax34, j1By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j1Ax34, j1By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j1Ax32, j1By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j1Ax32, j1By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j1Ax03, j1By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j1Ax03, j1By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j1Ax01, j1By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j1Ax01, j1By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j1Ax02, j1By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j1Ax02, j1By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j1Ax00, j1By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j1Ax00, j1By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<3*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<3*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax32, [readAs + 4x<3*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By32, [readBs + 4x<3*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j0Ax34, j0By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j0Ax34, j0By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j0Ax32, j0By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j0Ax32, j0By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j0Ax35, j0By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j0Ax35, j0By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j0Ax33, j0By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j0Ax33, j0By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j0Ax35, j0By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j0Ax35, j0By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j0Ax33, j0By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j0Ax33, j0By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j0Ax34, j0By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j0Ax34, j0By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j0Ax32, j0By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j0Ax32, j0By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j0Ax02, j0By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j0Ax02, j0By33, cx02y33;
+--:-:-:-:1      FFMA cx00y33, j0Ax00, j0By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j0Ax00, j0By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j0Ax03, j0By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j0Ax03, j0By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j0Ax01, j0By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j0Ax01, j0By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j0Ax34, j0By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j0Ax34, j0By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j0Ax32, j0By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j0Ax32, j0By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j0Ax35, j0By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j0Ax35, j0By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j0Ax33, j0By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j0Ax33, j0By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j0Ax35, j0By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j0Ax35, j0By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j0Ax33, j0By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j0Ax33, j0By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j0Ax34, j0By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j0Ax34, j0By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j0Ax32, j0By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j0Ax32, j0By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j0Ax03, j0By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j0Ax03, j0By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j0Ax01, j0By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j0Ax01, j0By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j0Ax02, j0By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j0Ax02, j0By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j0Ax00, j0By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j0Ax00, j0By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<4*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<4*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<4*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<4*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j1Ax34, j1By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j1Ax34, j1By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j1Ax32, j1By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j1Ax32, j1By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j1Ax35, j1By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j1Ax35, j1By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j1Ax33, j1By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j1Ax33, j1By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j1Ax35, j1By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j1Ax35, j1By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j1Ax33, j1By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j1Ax33, j1By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j1Ax34, j1By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j1Ax34, j1By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j1Ax32, j1By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j1Ax32, j1By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j1Ax02, j1By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j1Ax02, j1By33, cx02y33;
+--:-:-:-:1      FFMA cx00y33, j1Ax00, j1By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j1Ax00, j1By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j1Ax03, j1By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j1Ax03, j1By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j1Ax01, j1By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j1Ax01, j1By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j1Ax34, j1By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j1Ax34, j1By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j1Ax32, j1By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j1Ax32, j1By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j1Ax35, j1By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j1Ax35, j1By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j1Ax33, j1By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j1Ax33, j1By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j1Ax35, j1By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j1Ax35, j1By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j1Ax33, j1By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j1Ax33, j1By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j1Ax34, j1By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j1Ax34, j1By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j1Ax32, j1By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j1Ax32, j1By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j1Ax03, j1By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j1Ax03, j1By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j1Ax01, j1By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j1Ax01, j1By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j1Ax02, j1By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j1Ax02, j1By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j1Ax00, j1By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j1Ax00, j1By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<5*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<5*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax32, [readAs + 4x<5*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By32, [readBs + 4x<5*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j0Ax34, j0By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j0Ax34, j0By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j0Ax32, j0By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j0Ax32, j0By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j0Ax35, j0By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j0Ax35, j0By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j0Ax33, j0By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j0Ax33, j0By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j0Ax35, j0By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j0Ax35, j0By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j0Ax33, j0By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j0Ax33, j0By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j0Ax34, j0By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j0Ax34, j0By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j0Ax32, j0By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j0Ax32, j0By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j0Ax02, j0By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j0Ax02, j0By33, cx02y33;
+--:-:-:-:1      FFMA cx00y33, j0Ax00, j0By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j0Ax00, j0By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j0Ax03, j0By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j0Ax03, j0By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j0Ax01, j0By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j0Ax01, j0By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j0Ax34, j0By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j0Ax34, j0By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j0Ax32, j0By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j0Ax32, j0By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j0Ax35, j0By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j0Ax35, j0By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j0Ax33, j0By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j0Ax33, j0By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j0Ax35, j0By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j0Ax35, j0By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j0Ax33, j0By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j0Ax33, j0By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j0Ax34, j0By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j0Ax34, j0By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j0Ax32, j0By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j0Ax32, j0By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j0Ax03, j0By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j0Ax03, j0By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j0Ax01, j0By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j0Ax01, j0By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j0Ax02, j0By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j0Ax02, j0By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j0Ax00, j0By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j0Ax00, j0By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<6*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<6*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<6*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<6*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j1Ax34, j1By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j1Ax34, j1By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j1Ax32, j1By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j1Ax32, j1By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j1Ax35, j1By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j1Ax35, j1By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j1Ax33, j1By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j1Ax33, j1By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j1Ax35, j1By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j1Ax35, j1By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j1Ax33, j1By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j1Ax33, j1By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j1Ax34, j1By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j1Ax34, j1By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j1Ax32, j1By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j1Ax32, j1By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:0      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+02:-:-:-:1  @P0 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 2
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j1Ax02, j1By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j1Ax02, j1By33, cx02y33;
+--:-:-:-:0      FFMA cx00y33, j1Ax00, j1By33, cx00y33;
+--:-:-:-:1  @P0 STS.128 [writeS + 4x<2*64>], loadX2;
+--:-:-:-:1      FFMA cx00y32, j1Ax00, j1By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j1Ax03, j1By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j1Ax03, j1By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j1Ax01, j1By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j1Ax01, j1By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j1Ax34, j1By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j1Ax34, j1By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j1Ax32, j1By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j1Ax32, j1By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j1Ax35, j1By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j1Ax35, j1By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j1Ax33, j1By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j1Ax33, j1By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j1Ax35, j1By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j1Ax35, j1By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j1Ax33, j1By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j1Ax33, j1By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j1Ax34, j1By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j1Ax34, j1By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j1Ax32, j1By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j1Ax32, j1By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j1Ax03, j1By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j1Ax03, j1By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j1Ax01, j1By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j1Ax01, j1By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j1Ax02, j1By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j1Ax02, j1By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j1Ax00, j1By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j1Ax00, j1By34, cx00y34;
+01:-:-:-:0      FFMA cx02y00, j0Ax02, j0By00, cx02y00; // Wait Dep 1
+--:-:-:-:1      LDS.U.128 j1Ax00, [readAs + 4x<7*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j0Ax02, j0By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j0Ax00, j0By01, cx00y01;
+--:-:-:-:1      LDS.U.128 j1By00, [readBs + 4x<7*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j0Ax00, j0By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j0Ax03, j0By00, cx03y00;
+--:-:-:-:1      LDS.U.128 j1Ax32, [readAs + 4x<7*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j0Ax03, j0By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j0Ax01, j0By01, cx01y01;
+--:-:1:-:1      LDS.U.128 j1By32, [readBs + 4x<7*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j0Ax01, j0By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j0Ax34, j0By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j0Ax34, j0By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j0Ax32, j0By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j0Ax32, j0By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j0Ax35, j0By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j0Ax35, j0By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j0Ax33, j0By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j0Ax33, j0By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j0Ax35, j0By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j0Ax35, j0By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j0Ax33, j0By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j0Ax33, j0By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j0Ax34, j0By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j0Ax34, j0By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j0Ax32, j0By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j0Ax32, j0By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j0Ax03, j0By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j0Ax03, j0By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j0Ax01, j0By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j0Ax01, j0By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j0Ax02, j0By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j0Ax02, j0By03, cx02y03;
+--:-:-:-:0      FFMA cx00y03, j0Ax00, j0By03, cx00y03;
+04:-:-:-:1  @P0 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3
+--:-:-:-:1      FFMA cx00y02, j0Ax00, j0By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j0Ax02, j0By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j0Ax02, j0By33, cx02y33;
+--:-:-:-:0      FFMA cx00y33, j0Ax00, j0By33, cx00y33;
+--:-:-:-:1  @P0 STS.128 [writeS + 4x<6*64>], loadX6;
+--:-:-:-:1      FFMA cx00y32, j0Ax00, j0By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j0Ax03, j0By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j0Ax03, j0By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j0Ax01, j0By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j0Ax01, j0By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j0Ax34, j0By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j0Ax34, j0By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j0Ax32, j0By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j0Ax32, j0By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j0Ax35, j0By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j0Ax35, j0By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j0Ax33, j0By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j0Ax33, j0By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j0Ax35, j0By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j0Ax35, j0By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j0Ax33, j0By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j0Ax33, j0By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j0Ax34, j0By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j0Ax34, j0By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j0Ax32, j0By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j0Ax32, j0By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j0Ax03, j0By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j0Ax03, j0By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j0Ax01, j0By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j0Ax01, j0By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j0Ax02, j0By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j0Ax02, j0By35, cx02y35;
+--:-:-:-:0      FFMA cx00y35, j0Ax00, j0By35, cx00y35;
+01:-:-:-:5      BAR.SYNC 0;                            // Wait Dep 1
+--:-:-:-:1  @P0 LOP.XOR readAs, readAs, 4x<16*64>;
+--:-:-:-:1  @P0 LOP.XOR readBs, readBs, 4x<16*64>;
+--:-:-:-:1  @P0 LOP.XOR writeS, writeS, 4x<16*64>;
+--:-:-:-:1      FFMA cx00y34, j0Ax00, j0By34, cx00y34;
+--:-:-:-:0      FFMA cx02y00, j1Ax02, j1By00, cx02y00;
+--:-:-:-:1  @P0 LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>];
+--:-:-:-:1      FFMA cx02y01, j1Ax02, j1By01, cx02y01;
+--:-:-:-:0      FFMA cx00y01, j1Ax00, j1By01, cx00y01;
+--:-:-:-:1  @P0 LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      FFMA cx00y00, j1Ax00, j1By00, cx00y00;
+--:-:-:-:0      FFMA cx03y00, j1Ax03, j1By00, cx03y00;
+--:-:-:-:1  @P0 LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>];
+--:-:-:-:1      FFMA cx03y01, j1Ax03, j1By01, cx03y01;
+--:-:-:-:0      FFMA cx01y01, j1Ax01, j1By01, cx01y01;
+--:-:1:-:1  @P0 LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1
+--:-:-:-:1      FFMA cx01y00, j1Ax01, j1By00, cx01y00;
+--:-:-:-:1      FFMA cx34y00, j1Ax34, j1By00, cx34y00;
+--:-:-:-:1      FFMA cx34y01, j1Ax34, j1By01, cx34y01;
+--:-:-:-:1      FFMA cx32y01, j1Ax32, j1By01, cx32y01;
+--:-:-:-:1      FFMA cx32y00, j1Ax32, j1By00, cx32y00;
+--:-:-:-:1      FFMA cx35y00, j1Ax35, j1By00, cx35y00;
+--:-:-:-:1      FFMA cx35y01, j1Ax35, j1By01, cx35y01;
+--:-:-:-:1      FFMA cx33y01, j1Ax33, j1By01, cx33y01;
+--:-:-:-:1      FFMA cx33y00, j1Ax33, j1By00, cx33y00;
+--:-:-:-:1      FFMA cx35y02, j1Ax35, j1By02, cx35y02;
+--:-:-:-:1      FFMA cx35y03, j1Ax35, j1By03, cx35y03;
+--:-:-:-:1      FFMA cx33y03, j1Ax33, j1By03, cx33y03;
+--:-:-:-:1      FFMA cx33y02, j1Ax33, j1By02, cx33y02;
+--:-:-:-:1      FFMA cx34y02, j1Ax34, j1By02, cx34y02;
+--:-:-:-:1      FFMA cx34y03, j1Ax34, j1By03, cx34y03;
+--:-:-:-:1      FFMA cx32y03, j1Ax32, j1By03, cx32y03;
+--:-:-:-:1      FFMA cx32y02, j1Ax32, j1By02, cx32y02;
+--:-:-:-:1      FFMA cx03y02, j1Ax03, j1By02, cx03y02;
+--:-:-:-:1      FFMA cx03y03, j1Ax03, j1By03, cx03y03;
+--:-:-:-:1      FFMA cx01y03, j1Ax01, j1By03, cx01y03;
+--:-:-:-:1      FFMA cx01y02, j1Ax01, j1By02, cx01y02;
+--:-:-:-:1      FFMA cx02y02, j1Ax02, j1By02, cx02y02;
+--:-:-:-:1      FFMA cx02y03, j1Ax02, j1By03, cx02y03;
+--:-:-:-:1      FFMA cx00y03, j1Ax00, j1By03, cx00y03;
+--:-:-:-:1      FFMA cx00y02, j1Ax00, j1By02, cx00y02;
+--:-:-:Y:1      FFMA cx02y32, j1Ax02, j1By32, cx02y32;
+--:-:-:-:1      FFMA cx02y33, j1Ax02, j1By33, cx02y33;
+--:-:-:-:1      FFMA cx00y33, j1Ax00, j1By33, cx00y33;
+--:-:-:-:1      FFMA cx00y32, j1Ax00, j1By32, cx00y32;
+--:-:-:-:1      FFMA cx03y32, j1Ax03, j1By32, cx03y32;
+--:-:-:-:1      FFMA cx03y33, j1Ax03, j1By33, cx03y33;
+--:-:-:-:1      FFMA cx01y33, j1Ax01, j1By33, cx01y33;
+--:-:-:-:1      FFMA cx01y32, j1Ax01, j1By32, cx01y32;
+--:-:-:-:1      FFMA cx34y32, j1Ax34, j1By32, cx34y32;
+--:-:-:-:1      FFMA cx34y33, j1Ax34, j1By33, cx34y33;
+--:-:-:-:1      FFMA cx32y33, j1Ax32, j1By33, cx32y33;
+--:-:-:-:1      FFMA cx32y32, j1Ax32, j1By32, cx32y32;
+--:-:-:-:1      FFMA cx35y32, j1Ax35, j1By32, cx35y32;
+--:-:-:-:1      FFMA cx35y33, j1Ax35, j1By33, cx35y33;
+--:-:-:-:1      FFMA cx33y33, j1Ax33, j1By33, cx33y33;
+--:-:-:-:1      FFMA cx33y32, j1Ax33, j1By32, cx33y32;
+--:-:-:-:1      FFMA cx35y34, j1Ax35, j1By34, cx35y34;
+--:-:-:-:1      FFMA cx35y35, j1Ax35, j1By35, cx35y35;
+--:-:-:-:1      FFMA cx33y35, j1Ax33, j1By35, cx33y35;
+--:-:-:-:1      FFMA cx33y34, j1Ax33, j1By34, cx33y34;
+--:-:-:-:1      FFMA cx34y34, j1Ax34, j1By34, cx34y34;
+--:-:-:-:1      FFMA cx34y35, j1Ax34, j1By35, cx34y35;
+--:-:-:-:1      FFMA cx32y35, j1Ax32, j1By35, cx32y35;
+--:-:-:-:1      FFMA cx32y34, j1Ax32, j1By34, cx32y34;
+--:-:-:-:1      FFMA cx03y34, j1Ax03, j1By34, cx03y34;
+--:-:-:-:1      FFMA cx03y35, j1Ax03, j1By35, cx03y35;
+--:-:-:-:1      FFMA cx01y35, j1Ax01, j1By35, cx01y35;
+--:-:-:-:1      FFMA cx01y34, j1Ax01, j1By34, cx01y34;
+--:-:-:-:1      FFMA cx02y34, j1Ax02, j1By34, cx02y34;
+--:-:-:-:1      FFMA cx02y35, j1Ax02, j1By35, cx02y35;
+--:-:-:-:1      FFMA cx00y35, j1Ax00, j1By35, cx00y35;
+--:-:-:-:1      FFMA cx00y34, j1Ax00, j1By34, cx00y34;
+--:-:-:-:1  @P0 IADD track0, track0, ldx8;
+--:-:-:-:1  @P0 IADD track2, track2, ldx8;
+--:-:-:-:1  @P0 IADD track4, track4, ldx8;
+--:-:-:-:0  @P0 IADD track6, track6, ldx8;
+--:-:-:Y:5  @P0 BRA LOOP;
+// Epilogue setup: compute C-tile indices, byte strides, bounds predicates and output pointers, scale the y00 accumulators by alpha, then call STORE_C.
+--:-:-:-:1      SHR.U32 cy00, tid32, 1; // cy00 = tid32 / 2
+--:-:-:-:1      MOV ldc, c[0x0][0x158]; // ldc from constant bank 0, offset 0x158 (presumably the ldc kernel argument -- confirm)
+--:-:-:-:1      ISCADD cx, bx, tid31, 6; // cx = bx*64 + tid31
+--:-:-:-:1      MOV alpha, c[0x0][0x15c]; // alpha from constant bank 0, offset 0x15c
+--:-:-:-:1      ISCADD readCs, tid32, tid31, 3; // readCs = tid32*8 + tid31 (multiplied by 4 below)
+--:-:-:-:1      LOP.AND readAs, readAs, 0x7ff; // keep only the low 11 bits of readAs
+--:-:-:-:1      ISCADD cy00, by, cy00, 6; // cy00 = by*64 + cy00
+--:-:-:-:1      LOP.AND readBs, readBs, 0x7ff; // keep only the low 11 bits of readBs
+--:-:-:-:1      SHL ldc1, ldc, 2; // ldc1 = ldc*4
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx +  0 < m
+--:-:-:-:1      SHL ldc4, ldc, 4; // ldc4 = ldc*16
+--:-:-:-:1      FMUL cs0, cx00y00, alpha; // cs0..cs7 = alpha * y00 accumulators, interleaved with the integer setup
+--:-:-:-:1      SHL ldc8, ldc, 5; // ldc8 = ldc*32
+--:-:-:-:1      XMAD.MRG xmad_ci, cy00, ldc.H1, RZ; // with the two XMADs below: presumably the usual 16-bit-piece multiply giving ci = cy00*ldc + cx -- confirm
+--:-:-:-:1      ISCADD writeCs, readBs, readAs, 4; // writeCs = readBs*16 + readAs
+--:-:-:-:1      XMAD ci, cy00, ldc, cx;
+--:-:-:-:1      SHL readCs, readCs, 2; // readCs *= 4
+--:-:-:-:1      IADD cx, cx, 32; // move cx on by 32 for the second bounds check
+--:-:-:-:1      ISCADD ldc28, ldc, -ldc4, 7; // ldc28 = ldc*128 - ldc4 = 28 * ldc1
+--:-:-:-:1      FMUL cs1, cx01y00, alpha;
+--:-:-:-:1      FMUL cs2, cx02y00, alpha;
+--:-:-:-:1      XMAD.PSL.CBCC ci, cy00.H1, xmad_ci.H1, ci;
+--:-:-:-:1      IADD cy00, cy00, -1; // pre-decrement: STORE_C adds 1 to cy00 before testing it
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 32 < m
+--:-:-:-:1      FMUL cs3, cx03y00, alpha;
+--:-:-:-:1      FMUL cs4, cx32y00, alpha;
+--:-:-:-:1      FMUL cs5, cx33y00, alpha;
+--:-:-:-:1      ISCADD Cy00, ci, c[0x0][0x140], 2; // Cy00 = ci*4 + c[0x0][0x140] (presumably the C base address -- confirm)
+--:-:-:-:1      IADD cy04, cy00, 4; // cy04/cy08/cy12 inherit the -1 bias from cy00
+--:-:-:-:1      IADD cy08, cy00, 8;
+--:-:-:-:1      IADD cy12, cy00, 12;
+--:-:-:-:1      FMUL cs6, cx34y00, alpha;
+--:-:-:-:1      FMUL cs7, cx35y00, alpha;
+--:-:-:Y:6      IADD Cy00, Cy00, -ldc1; // pre-subtract ldc1: STORE_C adds ldc1 to Cy00 before storing
+--:-:-:-:1      IADD Cy04, Cy00, ldc4; // Cy04 = Cy00 + 4 * ldc1
+--:-:-:Y:5      IADD Cy08, Cy00, ldc8; // Cy08 = Cy00 + 8 * ldc1
+--:-:-:-:0      IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering)
+
+--:-:-:-:5      CAL STORE_C;
+// Remaining stores: for each y index, scale its eight accumulators by alpha into cs0..cs7 and call STORE_C. This group handles y01.
+02:-:-:-:1      FMUL cs0, cx00y01, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y01, alpha;
+--:-:-:-:1      FMUL cs2, cx02y01, alpha;
+--:-:-:-:1      FMUL cs3, cx03y01, alpha;
+--:-:-:-:1      FMUL cs4, cx32y01, alpha;
+--:-:-:-:1      FMUL cs5, cx33y01, alpha;
+--:-:-:-:1      FMUL cs6, cx34y01, alpha;
+--:-:-:-:0      FMUL cs7, cx35y01, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+// y02 accumulators. Dep 2 is set by the last STG in STORE_C; presumably the wait lets those stores read cs0..cs7 before they are overwritten.
+02:-:-:-:1      FMUL cs0, cx00y02, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y02, alpha;
+--:-:-:-:1      FMUL cs2, cx02y02, alpha;
+--:-:-:-:1      FMUL cs3, cx03y02, alpha;
+--:-:-:-:1      FMUL cs4, cx32y02, alpha;
+--:-:-:-:1      FMUL cs5, cx33y02, alpha;
+--:-:-:-:1      FMUL cs6, cx34y02, alpha;
+--:-:-:-:0      FMUL cs7, cx35y02, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+// y03 accumulators.
+02:-:-:-:1      FMUL cs0, cx00y03, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y03, alpha;
+--:-:-:-:1      FMUL cs2, cx02y03, alpha;
+--:-:-:-:1      FMUL cs3, cx03y03, alpha;
+--:-:-:-:1      FMUL cs4, cx32y03, alpha;
+--:-:-:-:1      FMUL cs5, cx33y03, alpha;
+--:-:-:-:1      FMUL cs6, cx34y03, alpha;
+--:-:-:-:0      FMUL cs7, cx35y03, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+// Four STORE_C calls have advanced each cy* by 4 and each Cy* by 4 * ldc1; add 28 more steps so the y32..y35 stores start 32 steps from the initial position.
+--:-:-:-:1      IADD cy00, cy00, 28;
+--:-:-:-:1      IADD cy04, cy04, 28;
+--:-:-:-:1      IADD cy08, cy08, 28;
+--:-:-:-:1      IADD cy12, cy12, 28;
+// ldc28 was computed as ldc*128 - ldc4, which equals 28 * ldc1.
+02:-:-:-:1      IADD Cy00, Cy00, ldc28; // Wait Dep 2
+--:-:-:-:1      IADD Cy04, Cy04, ldc28;
+--:-:-:-:1      IADD Cy08, Cy08, ldc28;
+--:-:-:-:1      IADD Cy12, Cy12, ldc28;
+// y32 accumulators. No Dep 2 wait on this FMUL: the IADD Cy00 above already waited on it.
+--:-:-:-:1      FMUL cs0, cx00y32, alpha;
+--:-:-:-:1      FMUL cs1, cx01y32, alpha;
+--:-:-:-:1      FMUL cs2, cx02y32, alpha;
+--:-:-:-:1      FMUL cs3, cx03y32, alpha;
+--:-:-:-:1      FMUL cs4, cx32y32, alpha;
+--:-:-:-:1      FMUL cs5, cx33y32, alpha;
+--:-:-:-:1      FMUL cs6, cx34y32, alpha;
+--:-:-:-:0      FMUL cs7, cx35y32, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+// y33 accumulators.
+02:-:-:-:1      FMUL cs0, cx00y33, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y33, alpha;
+--:-:-:-:1      FMUL cs2, cx02y33, alpha;
+--:-:-:-:1      FMUL cs3, cx03y33, alpha;
+--:-:-:-:1      FMUL cs4, cx32y33, alpha;
+--:-:-:-:1      FMUL cs5, cx33y33, alpha;
+--:-:-:-:1      FMUL cs6, cx34y33, alpha;
+--:-:-:-:0      FMUL cs7, cx35y33, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+// y34 accumulators.
+02:-:-:-:1      FMUL cs0, cx00y34, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y34, alpha;
+--:-:-:-:1      FMUL cs2, cx02y34, alpha;
+--:-:-:-:1      FMUL cs3, cx03y34, alpha;
+--:-:-:-:1      FMUL cs4, cx32y34, alpha;
+--:-:-:-:1      FMUL cs5, cx33y34, alpha;
+--:-:-:-:1      FMUL cs6, cx34y34, alpha;
+--:-:-:-:0      FMUL cs7, cx35y34, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+// y35 accumulators (last group before EXIT).
+02:-:-:-:1      FMUL cs0, cx00y35, alpha; // Wait Dep 2
+--:-:-:-:1      FMUL cs1, cx01y35, alpha;
+--:-:-:-:1      FMUL cs2, cx02y35, alpha;
+--:-:-:-:1      FMUL cs3, cx03y35, alpha;
+--:-:-:-:1      FMUL cs4, cx32y35, alpha;
+--:-:-:-:1      FMUL cs5, cx33y35, alpha;
+--:-:-:-:1      FMUL cs6, cx34y35, alpha;
+--:-:-:-:0      FMUL cs7, cx35y35, alpha; // Dual Issue
+--:-:-:-:5      CAL STORE_C;
+
+
+--:-:-:-:5      EXIT;
+// STORE_C: write cs0..cs7 to shared memory at writeCs, read eight values back via readCs, then store them to global memory under bounds predicates.
+STORE_C:
+// Each call first steps cy00..cy12 by 1 and Cy00..Cy12 by ldc1. NOTE(review): no barrier separates the STS and LDS below -- presumably each warp reads back only its own data; confirm.
+--:-:-:-:0      IADD cy00, cy00, 1; // callers pre-bias cy00 by -1 so the first call tests the unbiased index
+--:-:-:-:1      STS.128 [writeCs+4x<00>], cs0; // 128-bit store starting at cs0 (presumably cs0..cs3)
+--:-:-:-:0      IADD cy04, cy04, 1;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], cs4; // 128-bit store starting at cs4 (presumably cs4..cs7)
+--:-:-:-:0      IADD cy08, cy08, 1;
+--:-:-:-:1      LDS cs0, [readCs + 4x<0*64 + 00>]; // eight 32-bit loads refill cs0..cs7
+--:-:-:-:0      IADD cy12, cy12, 1;
+--:-:-:-:1      LDS cs1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:0      IADD Cy00, Cy00, ldc1; // step all four output pointers by ldc1
+--:-:-:-:1      LDS cs2, [readCs + 4x<1*64 + 00>];
+--:-:-:-:0      IADD Cy04, Cy04, ldc1;
+--:-:-:-:1      LDS cs3, [readCs + 4x<1*64 + 32>];
+--:-:-:-:0      IADD Cy08, Cy08, ldc1;
+--:-:-:-:1      LDS cs4, [readCs + 4x<2*64 + 00>];
+--:-:-:-:0      IADD Cy12, Cy12, ldc1;
+--:-:-:-:1      LDS cs5, [readCs + 4x<2*64 + 32>];
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx +  0 < m
+--:-:-:-:1      LDS cs6, [readCs + 4x<3*64 + 00>];
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 32 < m
+--:-:1:-:1      LDS cs7, [readCs + 4x<3*64 + 32>]; // Set Dep 1
+--:-:-:-:2      ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx +  0 < m
+--:-:-:Y:7      ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 32 < m
+01:-:-:-:1  @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx +  0 < m; P0..P3 are reused for the cy08/cy12 stores
+--:-:-:-:1  @P1 STG.CG [Cy00 + 4x<32>], cs1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 32 < m
+--:-:-:-:1  @P2 STG.CG [Cy04 + 4x<00>], cs2;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx +  0 < m
+--:-:-:-:1  @P3 STG.CG [Cy04 + 4x<32>], cs3;
+--:-:-:Y:7      ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 32 < m
+--:-:-:-:2  @P0 STG.CG [Cy08 + 4x<00>], cs4;
+--:-:-:-:2  @P1 STG.CG [Cy08 + 4x<32>], cs5;
+--:-:-:-:2  @P2 STG.CG [Cy12 + 4x<00>], cs6;
+--:2:-:-:1  @P3 STG.CG [Cy12 + 4x<32>], cs7; // Set Dep 2
+// Callers wait on Dep 2 before their next write to cs0.
+--:-:-:-:5      RET;
+
diff --git a/Assembler/PascalAs/sgemm/sgemm_sm52_64.cubin b/Assembler/PascalAs/sgemm/sgemm_sm52_64.cubin
new file mode 100644
index 0000000..0c7825f
Binary files /dev/null and b/Assembler/PascalAs/sgemm/sgemm_sm52_64.cubin differ
diff --git a/Assembler/PascalAs/sgemm/sgemm_sm52_64_dump.sass b/Assembler/PascalAs/sgemm/sgemm_sm52_64_dump.sass
new file mode 100644
index 0000000..552d95b
--- /dev/null
+++ b/Assembler/PascalAs/sgemm/sgemm_sm52_64_dump.sass
@@ -0,0 +1,1100 @@
+
+	code for sm_52
+		Function : sgemm_kernel_128
+	.headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
+                                                                                        /* 0x001ffc00e22007f6 */ /* Word with no instruction text heading the next three instruction slots (pattern repeats every 0x20 bytes). Summary of this function: each thread loads four registers via TLD indexed by its thread id, stores them to shared memory at tid*16, and after a barrier writes the 32-bit word found at (0xff - tid)*16 to global address c[0x0][0x140] + tid*4. */
+        /*0008*/                   MOV R1, c[0x0][0x20];                                /* 0x4c98078000870001 */ /* R1 = constant bank 0, offset 0x20; R1 is not read again in this function (presumably the usual stack-pointer setup - confirm) */
+        /*0010*/                   S2R R8, SR_TID.X;                                    /* 0xf0c8000002170008 */ /* R8 = thread index in x (tid) */
+        /*0018*/                   SSY 0x90;                                            /* 0xe290000007000000 */ /* both SYNCs below resume at 0x90 */
+                                                                                        /* 0x001fc400ffa00fed */
+        /*0028*/                   ISETP.GT.AND P0, PT, R8, 0x7f, PT;                   /* 0x3669038007f70807 */ /* P0 = (tid > 0x7f) */
+        /*0030*/              @!P0 BRA 0x60;                                            /* 0xe24000000288000f */ /* tid <= 0x7f: skip to the group at 0x60 (its first instruction is at 0x68) */
+        /*0038*/                   MOV R0, c[0x0][0x170];                               /* 0x4c98078005c70000 */ /* tid > 0x7f path: R0 = constant at offset 0x170 */
+                                                                                        /* 0x001ff400fe0007f5 */
+        /*0048*/                   MOV32I R3, 0x20000000;                               /* 0x010200000007f003 */ /* R3 = 0x20000000 */
+        /*0050*/         {         LOP32I.OR R2, R0, 0x80000000;                        /* 0x0428000000070002 */ /* R2 = R0 | 0x80000000; R2 is the third TLD operand at 0x90 */
+        /*0058*/                   SYNC;        }                                       /* 0xf0f800000007000f */ /* rejoin at the SSY target */
+                                                                                        /* 0x001fc000fea007f1 */
+        /*0068*/                   MOV R0, c[0x0][0x174];                               /* 0x4c98078005d70000 */ /* tid <= 0x7f path: R0 = constant at offset 0x174 */
+        /*0070*/                   MOV32I R3, 0x20000000;                               /* 0x010200000007f003 */ /* R3 = 0x20000000 (same as the other path) */
+        /*0078*/         {         LOP32I.OR R2, R0, 0x80000000;                        /* 0x0428000000070002 */ /* R2 = R0 | 0x80000000 */
+        /*0088*/                   SYNC;        }                                       /* 0x001fd0800e2007fd */ /* NOTE: the hex on this line is the word at 0x80; the SYNC encoding (identical to the one at 0x58) is printed on the next line */
+                                                                                        /* 0xf0f800000007000f */
+        /*0090*/                   TLD.B.LZ.NODEP.P R4, R8, R2, 0x0, 1D, 0xf;           /* 0xdd3a000780270804 */ /* 1D texture load indexed by tid (R8) using R2, component mask 0xf; result starts at R4 (presumably R4..R7, matching the STS.128 below) */
+        /*0098*/                   SHL R0, R8, 0x4;                                     /* 0x3848000000470800 */ /* R0 = tid * 16, the byte offset of a 128-bit slot */
+                                                                                        /* 0x081fc403ffe041f2 */
+        /*00a8*/                   STS.128 [R0], R4;                                    /* 0xef5e000000070004 */ /* store 128 bits starting at R4 to shared offset tid*16 */
+        /*00b0*/                   BAR.SYNC 0x0;                                        /* 0xf0a81b8000070000 */ /* barrier 0, placed before the load below reads a slot written by a different thread */
+        /*00b8*/                   IADD32I R0, -R8.reuse, 0xff;                         /* 0x1d0000000ff70800 */ /* R0 = 0xff - tid */
+                                                                                        /* 0x001fc000fe8207f5 */
+        /*00c8*/                   SHL R2, R8.reuse, 0x2;                               /* 0x3848000000270802 */ /* R2 = tid * 4, the byte offset of a 32-bit element */
+        /*00d0*/                   SHL R0, R0, 0x4;                                     /* 0x3848000000470000 */ /* R0 = (0xff - tid) * 16 */
+        /*00d8*/         {         IADD R4.CC, R2, c[0x0][0x140];                       /* 0x4c10800005070204 */ /* low address word: c[0x0][0x140] + tid*4, carry recorded in CC */
+        /*00e8*/                   LDS.U.32 R0, [R0];        }                          /* 0x001fc400fec00711 */ /* R0 = 32-bit word at shared offset (0xff - tid)*16. NOTE: the hex on this line is the word at 0xe0; the LDS encoding is on the next line */
+                                                                                        /* 0xef4c100000070000 */
+        /*00f0*/                   SHR R2, R8, 0x1e;                                    /* 0x3829000001e70802 */ /* R2 = tid >> 30, the bits of tid*4 that overflow the low word */
+        /*00f8*/                   IADD.X R3, R2, c[0x0][0x144];                        /* 0x4c10080005170203 */ /* high address word: c[0x0][0x144] + R2 + carry from the IADD at 0xd8 */
+                                                                                        /* 0x001ffc011e2007ff */
+        /*0108*/                   MOV R2, R4;                                          /* 0x5c98078000470002 */ /* R2 = low address word, so R2/R3 hold the low/high halves */
+        /*0110*/                   STG.E [R2], R0;                                      /* 0xeedc200000070200 */ /* global store of R0 at the address in R2 (.E form; presumably the 64-bit pair R2:R3 - confirm) */
+        /*0118*/                   EXIT;                                                /* 0xe30000000007000f */ /* unpredicated end of the thread */
+                                                                                        /* 0x001f8000fc0007ff */
+        /*0128*/                   BRA 0x120;                                           /* 0xe2400fffff07000f */ /* follows the unpredicated EXIT; branch target is this instruction's own group */
+        /*0130*/                   NOP;                                                 /* 0x50b0000000070f00 */ /* filler to the end of the group */
+        /*0138*/                   NOP;                                                 /* 0x50b0000000070f00 */ /* filler to the end of the group */
+		.................................
+
+
+		Function : sgemm_kernel_64
+	.headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
+                                                                                                /* 0x001d4400e6200711 */
+        /*0008*/                   S2R R119, SR_TID.X;                                          /* 0xf0c8000002170077 */
+        /*0010*/                   S2R R125, SR_CTAID.X;                                        /* 0xf0c800000257007d */
+        /*0018*/                   S2R R122, SR_CTAID.Y;                                        /* 0xf0c800000267007a */
+                                                                                                /* 0x081fc440fe220ff1 */
+        /*0028*/                   ISETP.GE.AND P0, PT, R119.reuse, 0x20, PT;                   /* 0x366d038002077707 */
+        /*0030*/                   LOP.AND R9, R119.reuse, 0xf;                                 /* 0x3847000000f77709 */
+        /*0038*/                   BFE.U32 R4, R119.reuse, 0x104;                               /* 0x3800000010477704 */
+                                                                                                /* 0x081fc440fe2007f1 */
+        /*0048*/                   MOV R12, c[0x0][0x14c];                                      /* 0x4c9807800537000c */
+        /*0050*/                   BFE.U32 R114, R119.reuse, 0x301;                             /* 0x3800000030177772 */
+        /*0058*/                   LOP.AND R115, R119.reuse, 0x30;                              /* 0x3847000003077773 */
+                                                                                                /* 0x081fc400fe2207f1 */
+        /*0068*/                   LOP.AND R0, R119.reuse, 0x1;                                 /* 0x3847000000177700 */
+        /*0070*/                   SHL R13, R9, 0x4;                                            /* 0x384800000047090d */
+        /*0078*/                   LOP.AND R80, R119.reuse, -0x20;                              /* 0x3947007ffe077750 */
+                                                                                                /* 0x081fc400fe2007f1 */
+        /*0088*/                   IADD R12, R12, -0x8;                                         /* 0x3910007fff870c0c */
+        /*0090*/                   SHL R114, R114, 0x4;                                         /* 0x3848000000477272 */
+        /*0098*/                   LOP.AND R126, R119.reuse, 0x1f;                              /* 0x3847000001f7777e */
+                                                                                                /* 0x001fc400fe2007f0 */
+        /*00a8*/         {         SHR.U32 R115, R115, 0x3;                                     /* 0x3828000000377373 */
+        /*00b0*/                   STS.128 [R80+0x1000], RZ;        }                           /* 0xef5e0001000750ff */
+        /*00b8*/              @!P0 MOV R2, c[0x0][0x150];                                       /* 0x4c98078005480002 */
+                                                                                                /* 0x00dfc400fe2007f1 */
+        /*00c8*/                   ISCADD R118, R4, R13, 0x8;                                   /* 0x5c18040000d70476 */
+        /*00d0*/               @P0 MOV R2, c[0x0][0x154];                                       /* 0x4c98078005500002 */
+        /*00d8*/                   SEL R8, R122, R125, P0;                                      /* 0x5ca0000007d77a08 */
+                                                                                                /* 0x001fc400fe2007f1 */
+        /*00e8*/              @!P0 MOV32I R113, 0x80000001;                                     /* 0x010800000018f071 */
+        /*00f0*/               @P0 MOV32I R113, 0x80000000;                                     /* 0x010800000000f071 */
+        /*00f8*/                   LOP.OR R115, R115, R0;                                       /* 0x5c47020000077373 */
+                                                                                                /* 0x001fc440fe2007f1 */
+        /*0108*/                   LOP.AND R123, R119, 0x20;                                    /* 0x384700000207777b */
+        /*0110*/                   SHR.U32 R1, R2.reuse, 0x2;                                   /* 0x3828000000270201 */
+        /*0118*/                   IADD R121, R2, R2;                                           /* 0x5c10000000270279 */
+                                                                                                /* 0x001fc800fe2007f1 */
+        /*0128*/                   ISCADD R112, R8, R9, 0x4;                                    /* 0x5c18020000970870 */
+        /*0130*/               @P0 IADD R118, R118, 0x800;                                      /* 0x3810000080007676 */
+        /*0138*/                   ISCADD R115, R115, 0x800, 0x4;                               /* 0x3818020080077373 */
+                                                                                                /* 0x081f98c0fe2607f1 */
+        /*0148*/                   XMAD.MRG R5, R1.reuse, R4.H1.reuse, RZ;                      /* 0x5b007fa800470105 */
+        /*0150*/                   XMAD.MRG R16, R12.reuse, R1.H1.reuse, RZ;                    /* 0x5b007fa800170c10 */
+        /*0158*/                   XMAD R112, R1.reuse, R4, R112;                               /* 0x5b00380000470170 */
+                                                                                                /* 0x181fc480e28007f2 */
+        /*0168*/                   XMAD.PSL.CBCC R112, R1.H1, R5.H1, R112;                      /* 0x5b30381800570170 */
+        /*0170*/                   TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf;                    /* 0xdd38000787177060 */
+        /*0178*/                   IADD3 R116, R112.reuse, R1.reuse, R1;                        /* 0x5cc0008000177074 */
+                                                                                                /* 0x081fc080e62407f1 */
+        /*0188*/                   IADD R120, R112, R2.reuse;                                   /* 0x5c10000000277078 */
+        /*0190*/                   TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf;                   /* 0xdd38000787177464 */
+        /*0198*/         {         XMAD R117, R12.reuse, R1, R112;                              /* 0x5b00380000170c75 */
+        /*01a8*/                   TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf;        }          /* 0x101dc400fe440753 */
+                                                                                                /* 0xdd38000787177868 */
+        /*01b0*/                   IADD R124, R116, R2;                                         /* 0x5c1000000027747c */
+        /*01b8*/                   TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf;                   /* 0xdd38000787177c6c */
+                                                                                                /* 0x001e4400f22007f1 */
+        /*01c8*/                   XMAD.PSL.CBCC R117, R12.H1, R16.H1, R117;                    /* 0x5b303a9801070c75 */
+        /*01d0*/                   LDS.U.128 R0, [R80+0x1000];                                  /* 0xef4e100100075000 */
+        /*01d8*/                   LDS.U.128 R4, [R80+0x1000];                                  /* 0xef4e100100075004 */
+                                                                                                /* 0x001e4400f2200791 */
+        /*01e8*/                   LDS.U.128 R8, [R80+0x1000];                                  /* 0xef4e100100075008 */
+        /*01f0*/                   LDS.U.128 R12, [R80+0x1000];                                 /* 0xef4e10010007500c */
+        /*01f8*/                   LDS.U.128 R16, [R80+0x1000];                                 /* 0xef4e100100075010 */
+                                                                                                /* 0x001e4400f2200791 */
+        /*0208*/                   LDS.U.128 R20, [R80+0x1000];                                 /* 0xef4e100100075014 */
+        /*0210*/                   LDS.U.128 R24, [R80+0x1000];                                 /* 0xef4e100100075018 */
+        /*0218*/                   LDS.U.128 R28, [R80+0x1000];                                 /* 0xef4e10010007501c */
+                                                                                                /* 0x001e4400f2200791 */
+        /*0228*/                   LDS.U.128 R32, [R80+0x1000];                                 /* 0xef4e100100075020 */
+        /*0230*/                   LDS.U.128 R36, [R80+0x1000];                                 /* 0xef4e100100075024 */
+        /*0238*/                   LDS.U.128 R40, [R80+0x1000];                                 /* 0xef4e100100075028 */
+                                                                                                /* 0x001e4400f2200791 */
+        /*0248*/                   LDS.U.128 R44, [R80+0x1000];                                 /* 0xef4e10010007502c */
+        /*0250*/                   LDS.U.128 R48, [R80+0x1000];                                 /* 0xef4e100100075030 */
+        /*0258*/                   LDS.U.128 R52, [R80+0x1000];                                 /* 0xef4e100100075034 */
+                                                                                                /* 0x003fc400f2200791 */
+        /*0268*/                   LDS.U.128 R56, [R80+0x1000];                                 /* 0xef4e100100075038 */
+        /*0270*/                   LDS.U.128 R60, [R80+0x1000];                                 /* 0xef4e10010007503c */
+        /*0278*/                   STS.128 [R118], R96;                                         /* 0xef5e000000077660 */
+                                                                                                /* 0x101fc002fe2407f0 */
+        /*0288*/         {         IADD R112, R112, R121.reuse;                                 /* 0x5c10000007977070 */
+        /*0290*/                   STS.128 [R118+0x200], R100;        }                         /* 0xef5e000020077664 */
+        /*0298*/         {         IADD R116, R116, R121.reuse;                                 /* 0x5c10000007977474 */
+        /*02a8*/                   STS.128 [R118+0x400], R104;        }                         /* 0x011fc480fe0027f1 */
+                                                                                                /* 0xef5e000040077668 */
+        /*02b0*/         {         IADD R120, R120, R121.reuse;                                 /* 0x5c10000007977878 */
+        /*02b8*/                   STS.128 [R118+0x600], R108;        }                         /* 0xef5e00006007766c */
+                                                                                                /* 0x001fc010fea007f0 */
+        /*02c8*/         {         IADD R124, R124, R121;                                       /* 0x5c10000007977c7c */
+        /*02d0*/                   BAR.SYNC 0x0;        }                                       /* 0xf0a81b8000070000 */
+        /*02d8*/         {         LOP.XOR R118, R118, 0x1000;                                  /* 0x3847040100077676 */
+        /*02e8*/                   LDS.U.128 R64, [R114];        }                              /* 0x001fc400fe2007f1 */
+                                                                                                /* 0xef4e100000077240 */
+        /*02f0*/                   LDS.U.128 R72, [R115];                                       /* 0xef4e100000077348 */
+        /*02f8*/                   LDS.U.128 R68, [R114+0x80];                                  /* 0xef4e100008077244 */
+                                                                                                /* 0x183fc000fe200711 */
+        /*0308*/                   LDS.U.128 R76, [R115+0x80];                                  /* 0xef4e10000807734c */
+        /*0310*/                   ISETP.LE.AND P0, PT, R112, R117, PT;                         /* 0x5b67038007577007 */
+        /*0318*/         {         FFMA R1, R66.reuse, R72.reuse, R1;                           /* 0x5980008004874201 */
+        /*0328*/                   LDS.U.128 R80, [R114+0x100];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100010077250 */
+        /*0330*/                   FFMA R0, R66, R73.reuse, R0;                                 /* 0x5980000004974200 */
+        /*0338*/         {         FFMA R2, R64.reuse, R73.reuse, R2;                           /* 0x5980010004974002 */
+        /*0348*/                   LDS.U.128 R88, [R115+0x100];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100010077358 */
+        /*0350*/                   FFMA R3, R64, R72.reuse, R3;                                 /* 0x5980018004874003 */
+        /*0358*/         {         FFMA R5, R67.reuse, R72.reuse, R5;                           /* 0x5980028004874305 */
+        /*0368*/                   LDS.U.128 R84, [R114+0x180];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100018077254 */
+        /*0370*/                   FFMA R4, R67, R73.reuse, R4;                                 /* 0x5980020004974304 */
+        /*0378*/         {         FFMA R6, R65.reuse, R73.reuse, R6;                           /* 0x5980030004974106 */
+        /*0388*/                   LDS.U.128 R92, [R115+0x180];        }                        /* 0x181fc480fe200711 */
+                                                                                                /* 0xef4e10001807735c */
+        /*0390*/                   FFMA R7, R65, R72.reuse, R7;                                 /* 0x5980038004874107 */
+        /*0398*/                   FFMA R33, R70.reuse, R72.reuse, R33;                         /* 0x5980108004874621 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*03a8*/                   FFMA R32, R70, R73.reuse, R32;                               /* 0x5980100004974620 */
+        /*03b0*/                   FFMA R34, R68.reuse, R73.reuse, R34;                         /* 0x5980110004974422 */
+        /*03b8*/                   FFMA R35, R68, R72.reuse, R35;                               /* 0x5980118004874423 */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*03c8*/                   FFMA R37, R71.reuse, R72.reuse, R37;                         /* 0x5980128004874725 */
+        /*03d0*/                   FFMA R36, R71.reuse, R73.reuse, R36;                         /* 0x5980120004974724 */
+        /*03d8*/                   FFMA R38, R69.reuse, R73, R38;                               /* 0x5980130004974526 */
+                                                                                                /* 0x101fc4c0fe2207f1 */
+        /*03e8*/                   FFMA R39, R69.reuse, R72, R39;                               /* 0x5980138004874527 */
+        /*03f0*/                   FFMA R45, R71.reuse, R74.reuse, R45;                         /* 0x5980168004a7472d */
+        /*03f8*/                   FFMA R44, R71, R75.reuse, R44;                               /* 0x5980160004b7472c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0408*/                   FFMA R46, R69.reuse, R75.reuse, R46;                         /* 0x5980170004b7452e */
+        /*0410*/                   FFMA R47, R69, R74.reuse, R47;                               /* 0x5980178004a7452f */
+        /*0418*/                   FFMA R41, R70.reuse, R74.reuse, R41;                         /* 0x5980148004a74629 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0428*/                   FFMA R40, R70, R75.reuse, R40;                               /* 0x5980140004b74628 */
+        /*0430*/                   FFMA R42, R68.reuse, R75.reuse, R42;                         /* 0x5980150004b7442a */
+        /*0438*/                   FFMA R43, R68, R74.reuse, R43;                               /* 0x5980158004a7442b */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0448*/                   FFMA R13, R67.reuse, R74.reuse, R13;                         /* 0x5980068004a7430d */
+        /*0450*/                   FFMA R12, R67, R75.reuse, R12;                               /* 0x5980060004b7430c */
+        /*0458*/                   FFMA R14, R65.reuse, R75.reuse, R14;                         /* 0x5980070004b7410e */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*0468*/                   FFMA R15, R65, R74.reuse, R15;                               /* 0x5980078004a7410f */
+        /*0470*/                   FFMA R9, R66.reuse, R74.reuse, R9;                           /* 0x5980048004a74209 */
+        /*0478*/                   FFMA R8, R66.reuse, R75.reuse, R8;                           /* 0x5980040004b74208 */
+                                                                                                /* 0x101fc440fe0207f1 */
+        /*0488*/                   FFMA R10, R64.reuse, R75, R10;                               /* 0x5980050004b7400a */
+        /*0490*/         {         FFMA R11, R64.reuse, R74, R11;                               /* 0x5980058004a7400b */
+        /*0498*/               @P0 TLD.B.LZ.P R96, R112, R113, 0x0, 1D, 0xf;        }           /* 0xdd38000787107060 */
+                                                                                                /* 0x101cc480fe0607e1 */
+        /*04a8*/                   FFMA R17, R66.reuse, R76.reuse, R17;                         /* 0x5980088004c74211 */
+        /*04b0*/         {         FFMA R16, R66, R77.reuse, R16;                               /* 0x5980080004d74210 */
+        /*04b8*/               @P0 TLD.B.LZ.P R100, R116, R113, 0x0, 1D, 0xf;        }          /* 0xdd38000787107464 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*04c8*/                   FFMA R18, R64.reuse, R77.reuse, R18;                         /* 0x5980090004d74012 */
+        /*04d0*/                   FFMA R19, R64, R76.reuse, R19;                               /* 0x5980098004c74013 */
+        /*04d8*/                   FFMA R21, R67.reuse, R76.reuse, R21;                         /* 0x59800a8004c74315 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*04e8*/                   FFMA R20, R67, R77.reuse, R20;                               /* 0x59800a0004d74314 */
+        /*04f0*/                   FFMA R22, R65.reuse, R77.reuse, R22;                         /* 0x59800b0004d74116 */
+        /*04f8*/                   FFMA R23, R65, R76.reuse, R23;                               /* 0x59800b8004c74117 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0508*/                   FFMA R49, R70.reuse, R76.reuse, R49;                         /* 0x5980188004c74631 */
+        /*0510*/                   FFMA R48, R70, R77.reuse, R48;                               /* 0x5980180004d74630 */
+        /*0518*/                   FFMA R50, R68.reuse, R77.reuse, R50;                         /* 0x5980190004d74432 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*0528*/                   FFMA R51, R68, R76.reuse, R51;                               /* 0x5980198004c74433 */
+        /*0530*/                   FFMA R53, R71.reuse, R76.reuse, R53;                         /* 0x59801a8004c74735 */
+        /*0538*/                   FFMA R52, R71.reuse, R77.reuse, R52;                         /* 0x59801a0004d74734 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*0548*/                   FFMA R54, R69.reuse, R77, R54;                               /* 0x59801b0004d74536 */
+        /*0550*/                   FFMA R55, R69.reuse, R76, R55;                               /* 0x59801b8004c74537 */
+        /*0558*/                   FFMA R61, R71.reuse, R78.reuse, R61;                         /* 0x59801e8004e7473d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0568*/                   FFMA R60, R71, R79.reuse, R60;                               /* 0x59801e0004f7473c */
+        /*0570*/                   FFMA R62, R69.reuse, R79.reuse, R62;                         /* 0x59801f0004f7453e */
+        /*0578*/                   FFMA R63, R69, R78.reuse, R63;                               /* 0x59801f8004e7453f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0588*/                   FFMA R57, R70.reuse, R78.reuse, R57;                         /* 0x59801c8004e74639 */
+        /*0590*/                   FFMA R56, R70, R79.reuse, R56;                               /* 0x59801c0004f74638 */
+        /*0598*/                   FFMA R58, R68.reuse, R79.reuse, R58;                         /* 0x59801d0004f7443a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*05a8*/                   FFMA R59, R68, R78.reuse, R59;                               /* 0x59801d8004e7443b */
+        /*05b0*/                   FFMA R29, R67.reuse, R78.reuse, R29;                         /* 0x59800e8004e7431d */
+        /*05b8*/                   FFMA R28, R67, R79.reuse, R28;                               /* 0x59800e0004f7431c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*05c8*/                   FFMA R30, R65.reuse, R79.reuse, R30;                         /* 0x59800f0004f7411e */
+        /*05d0*/                   FFMA R31, R65, R78.reuse, R31;                               /* 0x59800f8004e7411f */
+        /*05d8*/                   FFMA R25, R66.reuse, R78.reuse, R25;                         /* 0x59800c8004e74219 */
+                                                                                                /* 0x001fc440fe2407f1 */
+        /*05e8*/                   FFMA R24, R66, R79.reuse, R24;                               /* 0x59800c0004f74218 */
+        /*05f0*/                   FFMA R26, R64.reuse, R79, R26;                               /* 0x59800d0004f7401a */
+        /*05f8*/                   FFMA R27, R64, R78, R27;                                     /* 0x59800d8004e7401b */
+                                                                                                /* 0x101fc400fe260ff0 */
+        /*0608*/         {         FFMA R1, R82.reuse, R88.reuse, R1;                           /* 0x5980008005875201 */
+        /*0610*/                   LDS.U.128 R64, [R114+0x200];        }                        /* 0xef4e100020077240 */
+        /*0618*/                   FFMA R0, R82, R89.reuse, R0;                                 /* 0x5980000005975200 */
+                                                                                                /* 0x101fc400fe2607f0 */
+        /*0628*/         {         FFMA R2, R80.reuse, R89.reuse, R2;                           /* 0x5980010005975002 */
+        /*0630*/                   LDS.U.128 R72, [R115+0x200];        }                        /* 0xef4e100020077348 */
+        /*0638*/                   FFMA R3, R80, R88.reuse, R3;                                 /* 0x5980018005875003 */
+                                                                                                /* 0x101fc400fe2607f0 */
+        /*0648*/         {         FFMA R5, R83.reuse, R88.reuse, R5;                           /* 0x5980028005875305 */
+        /*0650*/                   LDS.U.128 R68, [R114+0x280];        }                        /* 0xef4e100028077244 */
+        /*0658*/                   FFMA R4, R83, R89.reuse, R4;                                 /* 0x5980020005975304 */
+                                                                                                /* 0x101fc400e22607f0 */
+        /*0668*/         {         FFMA R6, R81.reuse, R89.reuse, R6;                           /* 0x5980030005975106 */
+        /*0670*/                   LDS.U.128 R76, [R115+0x280];        }                        /* 0xef4e10002807734c */
+        /*0678*/                   FFMA R7, R81, R88.reuse, R7;                                 /* 0x5980038005875107 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0688*/                   FFMA R33, R86.reuse, R88.reuse, R33;                         /* 0x5980108005875621 */
+        /*0690*/                   FFMA R32, R86, R89.reuse, R32;                               /* 0x5980100005975620 */
+        /*0698*/                   FFMA R34, R84.reuse, R89.reuse, R34;                         /* 0x5980110005975422 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*06a8*/                   FFMA R35, R84, R88.reuse, R35;                               /* 0x5980118005875423 */
+        /*06b0*/                   FFMA R37, R87.reuse, R88.reuse, R37;                         /* 0x5980128005875725 */
+        /*06b8*/                   FFMA R36, R87.reuse, R89.reuse, R36;                         /* 0x5980120005975724 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*06c8*/                   FFMA R38, R85.reuse, R89, R38;                               /* 0x5980130005975526 */
+        /*06d0*/                   FFMA R39, R85.reuse, R88, R39;                               /* 0x5980138005875527 */
+        /*06d8*/                   FFMA R45, R87.reuse, R90.reuse, R45;                         /* 0x5980168005a7572d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*06e8*/                   FFMA R44, R87, R91.reuse, R44;                               /* 0x5980160005b7572c */
+        /*06f0*/                   FFMA R46, R85.reuse, R91.reuse, R46;                         /* 0x5980170005b7552e */
+        /*06f8*/                   FFMA R47, R85, R90.reuse, R47;                               /* 0x5980178005a7552f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0708*/                   FFMA R41, R86.reuse, R90.reuse, R41;                         /* 0x5980148005a75629 */
+        /*0710*/                   FFMA R40, R86, R91.reuse, R40;                               /* 0x5980140005b75628 */
+        /*0718*/                   FFMA R42, R84.reuse, R91.reuse, R42;                         /* 0x5980150005b7542a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0728*/                   FFMA R43, R84, R90.reuse, R43;                               /* 0x5980158005a7542b */
+        /*0730*/                   FFMA R13, R83.reuse, R90.reuse, R13;                         /* 0x5980068005a7530d */
+        /*0738*/                   FFMA R12, R83, R91.reuse, R12;                               /* 0x5980060005b7530c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0748*/                   FFMA R14, R81.reuse, R91.reuse, R14;                         /* 0x5980070005b7510e */
+        /*0750*/                   FFMA R15, R81, R90.reuse, R15;                               /* 0x5980078005a7510f */
+        /*0758*/                   FFMA R9, R82.reuse, R90.reuse, R9;                           /* 0x5980048005a75209 */
+                                                                                                /* 0x081fc040fe2607f1 */
+        /*0768*/                   FFMA R8, R82.reuse, R91.reuse, R8;                           /* 0x5980040005b75208 */
+        /*0770*/                   FFMA R10, R80.reuse, R91, R10;                               /* 0x5980050005b7500a */
+        /*0778*/         {         FFMA R11, R80.reuse, R90, R11;                               /* 0x5980058005a7500b */
+        /*0788*/               @P0 TLD.B.LZ.P R104, R120, R113, 0x0, 1D, 0xf;        }          /* 0x101fc0c0fc2407f1 */
+                                                                                                /* 0xdd38000787107868 */
+        /*0790*/                   FFMA R17, R82.reuse, R92.reuse, R17;                         /* 0x5980088005c75211 */
+        /*0798*/         {         FFMA R16, R82, R93.reuse, R16;                               /* 0x5980080005d75210 */
+        /*07a8*/               @P0 TLD.B.LZ.P R108, R124, R113, 0x0, 1D, 0xf;        }          /* 0x101fc4c0fe240751 */
+                                                                                                /* 0xdd38000787107c6c */
+        /*07b0*/                   FFMA R18, R80.reuse, R93.reuse, R18;                         /* 0x5980090005d75012 */
+        /*07b8*/                   FFMA R19, R80, R92.reuse, R19;                               /* 0x5980098005c75013 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*07c8*/                   FFMA R21, R83.reuse, R92.reuse, R21;                         /* 0x59800a8005c75315 */
+        /*07d0*/                   FFMA R20, R83, R93.reuse, R20;                               /* 0x59800a0005d75314 */
+        /*07d8*/                   FFMA R22, R81.reuse, R93.reuse, R22;                         /* 0x59800b0005d75116 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*07e8*/                   FFMA R23, R81, R92.reuse, R23;                               /* 0x59800b8005c75117 */
+        /*07f0*/                   FFMA R49, R86.reuse, R92.reuse, R49;                         /* 0x5980188005c75631 */
+        /*07f8*/                   FFMA R48, R86, R93.reuse, R48;                               /* 0x5980180005d75630 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0808*/                   FFMA R50, R84.reuse, R93.reuse, R50;                         /* 0x5980190005d75432 */
+        /*0810*/                   FFMA R51, R84, R92.reuse, R51;                               /* 0x5980198005c75433 */
+        /*0818*/                   FFMA R53, R87.reuse, R92.reuse, R53;                         /* 0x59801a8005c75735 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*0828*/                   FFMA R52, R87.reuse, R93.reuse, R52;                         /* 0x59801a0005d75734 */
+        /*0830*/                   FFMA R54, R85.reuse, R93, R54;                               /* 0x59801b0005d75536 */
+        /*0838*/                   FFMA R55, R85.reuse, R92, R55;                               /* 0x59801b8005c75537 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0848*/                   FFMA R61, R87.reuse, R94.reuse, R61;                         /* 0x59801e8005e7573d */
+        /*0850*/                   FFMA R60, R87, R95.reuse, R60;                               /* 0x59801e0005f7573c */
+        /*0858*/                   FFMA R62, R85.reuse, R95.reuse, R62;                         /* 0x59801f0005f7553e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0868*/                   FFMA R63, R85, R94.reuse, R63;                               /* 0x59801f8005e7553f */
+        /*0870*/                   FFMA R57, R86.reuse, R94.reuse, R57;                         /* 0x59801c8005e75639 */
+        /*0878*/                   FFMA R56, R86, R95.reuse, R56;                               /* 0x59801c0005f75638 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0888*/                   FFMA R58, R84.reuse, R95.reuse, R58;                         /* 0x59801d0005f7543a */
+        /*0890*/                   FFMA R59, R84, R94.reuse, R59;                               /* 0x59801d8005e7543b */
+        /*0898*/                   FFMA R29, R83.reuse, R94.reuse, R29;                         /* 0x59800e8005e7531d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*08a8*/                   FFMA R28, R83, R95.reuse, R28;                               /* 0x59800e0005f7531c */
+        /*08b0*/                   FFMA R30, R81.reuse, R95.reuse, R30;                         /* 0x59800f0005f7511e */
+        /*08b8*/                   FFMA R31, R81, R94.reuse, R31;                               /* 0x59800f8005e7511f */
+                                                                                                /* 0x081fc480fe2607f1 */
+        /*08c8*/                   FFMA R25, R82.reuse, R94.reuse, R25;                         /* 0x59800c8005e75219 */
+        /*08d0*/                   FFMA R24, R82, R95.reuse, R24;                               /* 0x59800c0005f75218 */
+        /*08d8*/                   FFMA R26, R80.reuse, R95, R26;                               /* 0x59800d0005f7501a */
+                                                                                                /* 0x001fc4c1fe0007f1 */
+        /*08e8*/                   FFMA R27, R80, R94, R27;                                     /* 0x59800d8005e7501b */
+        /*08f0*/         {         FFMA R1, R66.reuse, R72.reuse, R1;                           /* 0x5980008004874201 */
+        /*08f8*/                   LDS.U.128 R80, [R114+0x300];        }                        /* 0xef4e100030077250 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*0908*/                   FFMA R0, R66, R73.reuse, R0;                                 /* 0x5980000004974200 */
+        /*0910*/         {         FFMA R2, R64.reuse, R73.reuse, R2;                           /* 0x5980010004974002 */
+        /*0918*/                   LDS.U.128 R88, [R115+0x300];        }                        /* 0xef4e100030077358 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*0928*/                   FFMA R3, R64, R72.reuse, R3;                                 /* 0x5980018004874003 */
+        /*0930*/         {         FFMA R5, R67.reuse, R72.reuse, R5;                           /* 0x5980028004874305 */
+        /*0938*/                   LDS.U.128 R84, [R114+0x380];        }                        /* 0xef4e100038077254 */
+                                                                                                /* 0x001c44c0fe0407f1 */
+        /*0948*/                   FFMA R4, R67, R73.reuse, R4;                                 /* 0x5980020004974304 */
+        /*0950*/         {         FFMA R6, R65.reuse, R73.reuse, R6;                           /* 0x5980030004974106 */
+        /*0958*/                   LDS.U.128 R92, [R115+0x380];        }                        /* 0xef4e10003807735c */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0968*/                   FFMA R7, R65, R72.reuse, R7;                                 /* 0x5980038004874107 */
+        /*0970*/                   FFMA R33, R70.reuse, R72.reuse, R33;                         /* 0x5980108004874621 */
+        /*0978*/                   FFMA R32, R70, R73.reuse, R32;                               /* 0x5980100004974620 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0988*/                   FFMA R34, R68.reuse, R73.reuse, R34;                         /* 0x5980110004974422 */
+        /*0990*/                   FFMA R35, R68, R72.reuse, R35;                               /* 0x5980118004874423 */
+        /*0998*/                   FFMA R37, R71.reuse, R72.reuse, R37;                         /* 0x5980128004874725 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*09a8*/                   FFMA R36, R71.reuse, R73.reuse, R36;                         /* 0x5980120004974724 */
+        /*09b0*/                   FFMA R38, R69.reuse, R73, R38;                               /* 0x5980130004974526 */
+        /*09b8*/                   FFMA R39, R69.reuse, R72, R39;                               /* 0x5980138004874527 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*09c8*/                   FFMA R45, R71.reuse, R74.reuse, R45;                         /* 0x5980168004a7472d */
+        /*09d0*/                   FFMA R44, R71, R75.reuse, R44;                               /* 0x5980160004b7472c */
+        /*09d8*/                   FFMA R46, R69.reuse, R75.reuse, R46;                         /* 0x5980170004b7452e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*09e8*/                   FFMA R47, R69, R74.reuse, R47;                               /* 0x5980178004a7452f */
+        /*09f0*/                   FFMA R41, R70.reuse, R74.reuse, R41;                         /* 0x5980148004a74629 */
+        /*09f8*/                   FFMA R40, R70, R75.reuse, R40;                               /* 0x5980140004b74628 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0a08*/                   FFMA R42, R68.reuse, R75.reuse, R42;                         /* 0x5980150004b7442a */
+        /*0a10*/                   FFMA R43, R68, R74.reuse, R43;                               /* 0x5980158004a7442b */
+        /*0a18*/                   FFMA R13, R67.reuse, R74.reuse, R13;                         /* 0x5980068004a7430d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0a28*/                   FFMA R12, R67, R75.reuse, R12;                               /* 0x5980060004b7430c */
+        /*0a30*/                   FFMA R14, R65.reuse, R75.reuse, R14;                         /* 0x5980070004b7410e */
+        /*0a38*/                   FFMA R15, R65, R74.reuse, R15;                               /* 0x5980078004a7410f */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*0a48*/                   FFMA R9, R66.reuse, R74.reuse, R9;                           /* 0x5980048004a74209 */
+        /*0a50*/                   FFMA R8, R66.reuse, R75.reuse, R8;                           /* 0x5980040004b74208 */
+        /*0a58*/                   FFMA R10, R64.reuse, R75, R10;                               /* 0x5980050004b7400a */
+                                                                                                /* 0x101fc4c0fc2207f1 */
+        /*0a68*/                   FFMA R11, R64.reuse, R74, R11;                               /* 0x5980058004a7400b */
+        /*0a70*/                   FFMA R17, R66.reuse, R76.reuse, R17;                         /* 0x5980088004c74211 */
+        /*0a78*/                   FFMA R16, R66, R77.reuse, R16;                               /* 0x5980080004d74210 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0a88*/                   FFMA R18, R64.reuse, R77.reuse, R18;                         /* 0x5980090004d74012 */
+        /*0a90*/                   FFMA R19, R64, R76.reuse, R19;                               /* 0x5980098004c74013 */
+        /*0a98*/                   FFMA R21, R67.reuse, R76.reuse, R21;                         /* 0x59800a8004c74315 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0aa8*/                   FFMA R20, R67, R77.reuse, R20;                               /* 0x59800a0004d74314 */
+        /*0ab0*/                   FFMA R22, R65.reuse, R77.reuse, R22;                         /* 0x59800b0004d74116 */
+        /*0ab8*/                   FFMA R23, R65, R76.reuse, R23;                               /* 0x59800b8004c74117 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0ac8*/                   FFMA R49, R70.reuse, R76.reuse, R49;                         /* 0x5980188004c74631 */
+        /*0ad0*/                   FFMA R48, R70, R77.reuse, R48;                               /* 0x5980180004d74630 */
+        /*0ad8*/                   FFMA R50, R68.reuse, R77.reuse, R50;                         /* 0x5980190004d74432 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*0ae8*/                   FFMA R51, R68, R76.reuse, R51;                               /* 0x5980198004c74433 */
+        /*0af0*/                   FFMA R53, R71.reuse, R76.reuse, R53;                         /* 0x59801a8004c74735 */
+        /*0af8*/                   FFMA R52, R71.reuse, R77.reuse, R52;                         /* 0x59801a0004d74734 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*0b08*/                   FFMA R54, R69.reuse, R77, R54;                               /* 0x59801b0004d74536 */
+        /*0b10*/                   FFMA R55, R69.reuse, R76, R55;                               /* 0x59801b8004c74537 */
+        /*0b18*/                   FFMA R61, R71.reuse, R78.reuse, R61;                         /* 0x59801e8004e7473d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0b28*/                   FFMA R60, R71, R79.reuse, R60;                               /* 0x59801e0004f7473c */
+        /*0b30*/                   FFMA R62, R69.reuse, R79.reuse, R62;                         /* 0x59801f0004f7453e */
+        /*0b38*/                   FFMA R63, R69, R78.reuse, R63;                               /* 0x59801f8004e7453f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0b48*/                   FFMA R57, R70.reuse, R78.reuse, R57;                         /* 0x59801c8004e74639 */
+        /*0b50*/                   FFMA R56, R70, R79.reuse, R56;                               /* 0x59801c0004f74638 */
+        /*0b58*/                   FFMA R58, R68.reuse, R79.reuse, R58;                         /* 0x59801d0004f7443a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0b68*/                   FFMA R59, R68, R78.reuse, R59;                               /* 0x59801d8004e7443b */
+        /*0b70*/                   FFMA R29, R67.reuse, R78.reuse, R29;                         /* 0x59800e8004e7431d */
+        /*0b78*/                   FFMA R28, R67, R79.reuse, R28;                               /* 0x59800e0004f7431c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0b88*/                   FFMA R30, R65.reuse, R79.reuse, R30;                         /* 0x59800f0004f7411e */
+        /*0b90*/                   FFMA R31, R65, R78.reuse, R31;                               /* 0x59800f8004e7411f */
+        /*0b98*/                   FFMA R25, R66.reuse, R78.reuse, R25;                         /* 0x59800c8004e74219 */
+                                                                                                /* 0x001fc440fe2407f1 */
+        /*0ba8*/                   FFMA R24, R66, R79.reuse, R24;                               /* 0x59800c0004f74218 */
+        /*0bb0*/                   FFMA R26, R64.reuse, R79, R26;                               /* 0x59800d0004f7401a */
+        /*0bb8*/                   FFMA R27, R64, R78, R27;                                     /* 0x59800d8004e7401b */
+                                                                                                /* 0x101fc400fe260ff0 */
+        /*0bc8*/         {         FFMA R1, R82.reuse, R88.reuse, R1;                           /* 0x5980008005875201 */
+        /*0bd0*/                   LDS.U.128 R64, [R114+0x400];        }                        /* 0xef4e100040077240 */
+        /*0bd8*/                   FFMA R0, R82, R89.reuse, R0;                                 /* 0x5980000005975200 */
+                                                                                                /* 0x101fc400fe2607f0 */
+        /*0be8*/         {         FFMA R2, R80.reuse, R89.reuse, R2;                           /* 0x5980010005975002 */
+        /*0bf0*/                   LDS.U.128 R72, [R115+0x400];        }                        /* 0xef4e100040077348 */
+        /*0bf8*/                   FFMA R3, R80, R88.reuse, R3;                                 /* 0x5980018005875003 */
+                                                                                                /* 0x101fc400fe2607f0 */
+        /*0c08*/         {         FFMA R5, R83.reuse, R88.reuse, R5;                           /* 0x5980028005875305 */
+        /*0c10*/                   LDS.U.128 R68, [R114+0x480];        }                        /* 0xef4e100048077244 */
+        /*0c18*/                   FFMA R4, R83, R89.reuse, R4;                                 /* 0x5980020005975304 */
+                                                                                                /* 0x101fc400e22607f0 */
+        /*0c28*/         {         FFMA R6, R81.reuse, R89.reuse, R6;                           /* 0x5980030005975106 */
+        /*0c30*/                   LDS.U.128 R76, [R115+0x480];        }                        /* 0xef4e10004807734c */
+        /*0c38*/                   FFMA R7, R81, R88.reuse, R7;                                 /* 0x5980038005875107 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0c48*/                   FFMA R33, R86.reuse, R88.reuse, R33;                         /* 0x5980108005875621 */
+        /*0c50*/                   FFMA R32, R86, R89.reuse, R32;                               /* 0x5980100005975620 */
+        /*0c58*/                   FFMA R34, R84.reuse, R89.reuse, R34;                         /* 0x5980110005975422 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*0c68*/                   FFMA R35, R84, R88.reuse, R35;                               /* 0x5980118005875423 */
+        /*0c70*/                   FFMA R37, R87.reuse, R88.reuse, R37;                         /* 0x5980128005875725 */
+        /*0c78*/                   FFMA R36, R87.reuse, R89.reuse, R36;                         /* 0x5980120005975724 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*0c88*/                   FFMA R38, R85.reuse, R89, R38;                               /* 0x5980130005975526 */
+        /*0c90*/                   FFMA R39, R85.reuse, R88, R39;                               /* 0x5980138005875527 */
+        /*0c98*/                   FFMA R45, R87.reuse, R90.reuse, R45;                         /* 0x5980168005a7572d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0ca8*/                   FFMA R44, R87, R91.reuse, R44;                               /* 0x5980160005b7572c */
+        /*0cb0*/                   FFMA R46, R85.reuse, R91.reuse, R46;                         /* 0x5980170005b7552e */
+        /*0cb8*/                   FFMA R47, R85, R90.reuse, R47;                               /* 0x5980178005a7552f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0cc8*/                   FFMA R41, R86.reuse, R90.reuse, R41;                         /* 0x5980148005a75629 */
+        /*0cd0*/                   FFMA R40, R86, R91.reuse, R40;                               /* 0x5980140005b75628 */
+        /*0cd8*/                   FFMA R42, R84.reuse, R91.reuse, R42;                         /* 0x5980150005b7542a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0ce8*/                   FFMA R43, R84, R90.reuse, R43;                               /* 0x5980158005a7542b */
+        /*0cf0*/                   FFMA R13, R83.reuse, R90.reuse, R13;                         /* 0x5980068005a7530d */
+        /*0cf8*/                   FFMA R12, R83, R91.reuse, R12;                               /* 0x5980060005b7530c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0d08*/                   FFMA R14, R81.reuse, R91.reuse, R14;                         /* 0x5980070005b7510e */
+        /*0d10*/                   FFMA R15, R81, R90.reuse, R15;                               /* 0x5980078005a7510f */
+        /*0d18*/                   FFMA R9, R82.reuse, R90.reuse, R9;                           /* 0x5980048005a75209 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*0d28*/                   FFMA R8, R82.reuse, R91.reuse, R8;                           /* 0x5980040005b75208 */
+        /*0d30*/                   FFMA R10, R80.reuse, R91, R10;                               /* 0x5980050005b7500a */
+        /*0d38*/                   FFMA R11, R80.reuse, R90, R11;                               /* 0x5980058005a7500b */
+                                                                                                /* 0x181fc480fe2607e1 */
+        /*0d48*/                   FFMA R17, R82.reuse, R92.reuse, R17;                         /* 0x5980088005c75211 */
+        /*0d50*/                   FFMA R16, R82, R93.reuse, R16;                               /* 0x5980080005d75210 */
+        /*0d58*/                   FFMA R18, R80.reuse, R93.reuse, R18;                         /* 0x5980090005d75012 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0d68*/                   FFMA R19, R80, R92.reuse, R19;                               /* 0x5980098005c75013 */
+        /*0d70*/                   FFMA R21, R83.reuse, R92.reuse, R21;                         /* 0x59800a8005c75315 */
+        /*0d78*/                   FFMA R20, R83, R93.reuse, R20;                               /* 0x59800a0005d75314 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0d88*/                   FFMA R22, R81.reuse, R93.reuse, R22;                         /* 0x59800b0005d75116 */
+        /*0d90*/                   FFMA R23, R81, R92.reuse, R23;                               /* 0x59800b8005c75117 */
+        /*0d98*/                   FFMA R49, R86.reuse, R92.reuse, R49;                         /* 0x5980188005c75631 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0da8*/                   FFMA R48, R86, R93.reuse, R48;                               /* 0x5980180005d75630 */
+        /*0db0*/                   FFMA R50, R84.reuse, R93.reuse, R50;                         /* 0x5980190005d75432 */
+        /*0db8*/                   FFMA R51, R84, R92.reuse, R51;                               /* 0x5980198005c75433 */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*0dc8*/                   FFMA R53, R87.reuse, R92.reuse, R53;                         /* 0x59801a8005c75735 */
+        /*0dd0*/                   FFMA R52, R87.reuse, R93.reuse, R52;                         /* 0x59801a0005d75734 */
+        /*0dd8*/                   FFMA R54, R85.reuse, R93, R54;                               /* 0x59801b0005d75536 */
+                                                                                                /* 0x101fc4c0fe2207f1 */
+        /*0de8*/                   FFMA R55, R85.reuse, R92, R55;                               /* 0x59801b8005c75537 */
+        /*0df0*/                   FFMA R61, R87.reuse, R94.reuse, R61;                         /* 0x59801e8005e7573d */
+        /*0df8*/                   FFMA R60, R87, R95.reuse, R60;                               /* 0x59801e0005f7573c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0e08*/                   FFMA R62, R85.reuse, R95.reuse, R62;                         /* 0x59801f0005f7553e */
+        /*0e10*/                   FFMA R63, R85, R94.reuse, R63;                               /* 0x59801f8005e7553f */
+        /*0e18*/                   FFMA R57, R86.reuse, R94.reuse, R57;                         /* 0x59801c8005e75639 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0e28*/                   FFMA R56, R86, R95.reuse, R56;                               /* 0x59801c0005f75638 */
+        /*0e30*/                   FFMA R58, R84.reuse, R95.reuse, R58;                         /* 0x59801d0005f7543a */
+        /*0e38*/                   FFMA R59, R84, R94.reuse, R59;                               /* 0x59801d8005e7543b */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0e48*/                   FFMA R29, R83.reuse, R94.reuse, R29;                         /* 0x59800e8005e7531d */
+        /*0e50*/                   FFMA R28, R83, R95.reuse, R28;                               /* 0x59800e0005f7531c */
+        /*0e58*/                   FFMA R30, R81.reuse, R95.reuse, R30;                         /* 0x59800f0005f7511e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0e68*/                   FFMA R31, R81, R94.reuse, R31;                               /* 0x59800f8005e7511f */
+        /*0e70*/                   FFMA R25, R82.reuse, R94.reuse, R25;                         /* 0x59800c8005e75219 */
+        /*0e78*/                   FFMA R24, R82, R95.reuse, R24;                               /* 0x59800c0005f75218 */
+                                                                                                /* 0x183fc000fe2207f1 */
+        /*0e88*/                   FFMA R26, R80.reuse, R95, R26;                               /* 0x59800d0005f7501a */
+        /*0e90*/                   FFMA R27, R80, R94, R27;                                     /* 0x59800d8005e7501b */
+        /*0e98*/         {         FFMA R1, R66.reuse, R72.reuse, R1;                           /* 0x5980008004874201 */
+        /*0ea8*/                   LDS.U.128 R80, [R114+0x500];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100050077250 */
+        /*0eb0*/                   FFMA R0, R66, R73.reuse, R0;                                 /* 0x5980000004974200 */
+        /*0eb8*/         {         FFMA R2, R64.reuse, R73.reuse, R2;                           /* 0x5980010004974002 */
+        /*0ec8*/                   LDS.U.128 R88, [R115+0x500];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100050077358 */
+        /*0ed0*/                   FFMA R3, R64, R72.reuse, R3;                                 /* 0x5980018004874003 */
+        /*0ed8*/         {         FFMA R5, R67.reuse, R72.reuse, R5;                           /* 0x5980028004874305 */
+        /*0ee8*/                   LDS.U.128 R84, [R114+0x580];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100058077254 */
+        /*0ef0*/                   FFMA R4, R67, R73.reuse, R4;                                 /* 0x5980020004974304 */
+        /*0ef8*/         {         FFMA R6, R65.reuse, R73.reuse, R6;                           /* 0x5980030004974106 */
+        /*0f08*/                   LDS.U.128 R92, [R115+0x580];        }                        /* 0x181fc480fe200711 */
+                                                                                                /* 0xef4e10005807735c */
+        /*0f10*/                   FFMA R7, R65, R72.reuse, R7;                                 /* 0x5980038004874107 */
+        /*0f18*/                   FFMA R33, R70.reuse, R72.reuse, R33;                         /* 0x5980108004874621 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0f28*/                   FFMA R32, R70, R73.reuse, R32;                               /* 0x5980100004974620 */
+        /*0f30*/                   FFMA R34, R68.reuse, R73.reuse, R34;                         /* 0x5980110004974422 */
+        /*0f38*/                   FFMA R35, R68, R72.reuse, R35;                               /* 0x5980118004874423 */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*0f48*/                   FFMA R37, R71.reuse, R72.reuse, R37;                         /* 0x5980128004874725 */
+        /*0f50*/                   FFMA R36, R71.reuse, R73.reuse, R36;                         /* 0x5980120004974724 */
+        /*0f58*/                   FFMA R38, R69.reuse, R73, R38;                               /* 0x5980130004974526 */
+                                                                                                /* 0x101fc4c0fe2207f1 */
+        /*0f68*/                   FFMA R39, R69.reuse, R72, R39;                               /* 0x5980138004874527 */
+        /*0f70*/                   FFMA R45, R71.reuse, R74.reuse, R45;                         /* 0x5980168004a7472d */
+        /*0f78*/                   FFMA R44, R71, R75.reuse, R44;                               /* 0x5980160004b7472c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0f88*/                   FFMA R46, R69.reuse, R75.reuse, R46;                         /* 0x5980170004b7452e */
+        /*0f90*/                   FFMA R47, R69, R74.reuse, R47;                               /* 0x5980178004a7452f */
+        /*0f98*/                   FFMA R41, R70.reuse, R74.reuse, R41;                         /* 0x5980148004a74629 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*0fa8*/                   FFMA R40, R70, R75.reuse, R40;                               /* 0x5980140004b74628 */
+        /*0fb0*/                   FFMA R42, R68.reuse, R75.reuse, R42;                         /* 0x5980150004b7442a */
+        /*0fb8*/                   FFMA R43, R68, R74.reuse, R43;                               /* 0x5980158004a7442b */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*0fc8*/                   FFMA R13, R67.reuse, R74.reuse, R13;                         /* 0x5980068004a7430d */
+        /*0fd0*/                   FFMA R12, R67, R75.reuse, R12;                               /* 0x5980060004b7430c */
+        /*0fd8*/                   FFMA R14, R65.reuse, R75.reuse, R14;                         /* 0x5980070004b7410e */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*0fe8*/                   FFMA R15, R65, R74.reuse, R15;                               /* 0x5980078004a7410f */
+        /*0ff0*/                   FFMA R9, R66.reuse, R74.reuse, R9;                           /* 0x5980048004a74209 */
+        /*0ff8*/                   FFMA R8, R66.reuse, R75.reuse, R8;                           /* 0x5980040004b74208 */
+                                                                                                /* 0x181f8440fe2207f1 */
+        /*1008*/                   FFMA R10, R64.reuse, R75, R10;                               /* 0x5980050004b7400a */
+        /*1010*/                   FFMA R11, R64.reuse, R74, R11;                               /* 0x5980058004a7400b */
+        /*1018*/                   FFMA R17, R66.reuse, R76.reuse, R17;                         /* 0x5980088004c74211 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1028*/                   FFMA R16, R66, R77.reuse, R16;                               /* 0x5980080004d74210 */
+        /*1030*/                   FFMA R18, R64.reuse, R77.reuse, R18;                         /* 0x5980090004d74012 */
+        /*1038*/                   FFMA R19, R64, R76.reuse, R19;                               /* 0x5980098004c74013 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1048*/                   FFMA R21, R67.reuse, R76.reuse, R21;                         /* 0x59800a8004c74315 */
+        /*1050*/                   FFMA R20, R67, R77.reuse, R20;                               /* 0x59800a0004d74314 */
+        /*1058*/                   FFMA R22, R65.reuse, R77.reuse, R22;                         /* 0x59800b0004d74116 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1068*/                   FFMA R23, R65, R76.reuse, R23;                               /* 0x59800b8004c74117 */
+        /*1070*/                   FFMA R49, R70.reuse, R76.reuse, R49;                         /* 0x5980188004c74631 */
+        /*1078*/                   FFMA R48, R70, R77.reuse, R48;                               /* 0x5980180004d74630 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1088*/                   FFMA R50, R68.reuse, R77.reuse, R50;                         /* 0x5980190004d74432 */
+        /*1090*/                   FFMA R51, R68, R76.reuse, R51;                               /* 0x5980198004c74433 */
+        /*1098*/                   FFMA R53, R71.reuse, R76.reuse, R53;                         /* 0x59801a8004c74735 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*10a8*/                   FFMA R52, R71.reuse, R77.reuse, R52;                         /* 0x59801a0004d74734 */
+        /*10b0*/                   FFMA R54, R69.reuse, R77, R54;                               /* 0x59801b0004d74536 */
+        /*10b8*/                   FFMA R55, R69.reuse, R76, R55;                               /* 0x59801b8004c74537 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*10c8*/                   FFMA R61, R71.reuse, R78.reuse, R61;                         /* 0x59801e8004e7473d */
+        /*10d0*/                   FFMA R60, R71, R79.reuse, R60;                               /* 0x59801e0004f7473c */
+        /*10d8*/                   FFMA R62, R69.reuse, R79.reuse, R62;                         /* 0x59801f0004f7453e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*10e8*/                   FFMA R63, R69, R78.reuse, R63;                               /* 0x59801f8004e7453f */
+        /*10f0*/                   FFMA R57, R70.reuse, R78.reuse, R57;                         /* 0x59801c8004e74639 */
+        /*10f8*/                   FFMA R56, R70, R79.reuse, R56;                               /* 0x59801c0004f74638 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1108*/                   FFMA R58, R68.reuse, R79.reuse, R58;                         /* 0x59801d0004f7443a */
+        /*1110*/                   FFMA R59, R68, R78.reuse, R59;                               /* 0x59801d8004e7443b */
+        /*1118*/                   FFMA R29, R67.reuse, R78.reuse, R29;                         /* 0x59800e8004e7431d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1128*/                   FFMA R28, R67, R79.reuse, R28;                               /* 0x59800e0004f7431c */
+        /*1130*/                   FFMA R30, R65.reuse, R79.reuse, R30;                         /* 0x59800f0004f7411e */
+        /*1138*/                   FFMA R31, R65, R78.reuse, R31;                               /* 0x59800f8004e7411f */
+                                                                                                /* 0x081fc480fe2607f1 */
+        /*1148*/                   FFMA R25, R66.reuse, R78.reuse, R25;                         /* 0x59800c8004e74219 */
+        /*1150*/                   FFMA R24, R66, R79.reuse, R24;                               /* 0x59800c0004f74218 */
+        /*1158*/                   FFMA R26, R64.reuse, R79, R26;                               /* 0x59800d0004f7401a */
+                                                                                                /* 0x001fc4c1fe0007f1 */
+        /*1168*/                   FFMA R27, R64, R78, R27;                                     /* 0x59800d8004e7401b */
+        /*1170*/         {         FFMA R1, R82.reuse, R88.reuse, R1;                           /* 0x5980008005875201 */
+        /*1178*/                   LDS.U.128 R64, [R114+0x600];        }                        /* 0xef4e100060077240 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*1188*/                   FFMA R0, R82, R89.reuse, R0;                                 /* 0x5980000005975200 */
+        /*1190*/         {         FFMA R2, R80.reuse, R89.reuse, R2;                           /* 0x5980010005975002 */
+        /*1198*/                   LDS.U.128 R72, [R115+0x600];        }                        /* 0xef4e100060077348 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*11a8*/                   FFMA R3, R80, R88.reuse, R3;                                 /* 0x5980018005875003 */
+        /*11b0*/         {         FFMA R5, R83.reuse, R88.reuse, R5;                           /* 0x5980028005875305 */
+        /*11b8*/                   LDS.U.128 R68, [R114+0x680];        }                        /* 0xef4e100068077244 */
+                                                                                                /* 0x001c44c0fe0407f1 */
+        /*11c8*/                   FFMA R4, R83, R89.reuse, R4;                                 /* 0x5980020005975304 */
+        /*11d0*/         {         FFMA R6, R81.reuse, R89.reuse, R6;                           /* 0x5980030005975106 */
+        /*11d8*/                   LDS.U.128 R76, [R115+0x680];        }                        /* 0xef4e10006807734c */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*11e8*/                   FFMA R7, R81, R88.reuse, R7;                                 /* 0x5980038005875107 */
+        /*11f0*/                   FFMA R33, R86.reuse, R88.reuse, R33;                         /* 0x5980108005875621 */
+        /*11f8*/                   FFMA R32, R86, R89.reuse, R32;                               /* 0x5980100005975620 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1208*/                   FFMA R34, R84.reuse, R89.reuse, R34;                         /* 0x5980110005975422 */
+        /*1210*/                   FFMA R35, R84, R88.reuse, R35;                               /* 0x5980118005875423 */
+        /*1218*/                   FFMA R37, R87.reuse, R88.reuse, R37;                         /* 0x5980128005875725 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*1228*/                   FFMA R36, R87.reuse, R89.reuse, R36;                         /* 0x5980120005975724 */
+        /*1230*/                   FFMA R38, R85.reuse, R89, R38;                               /* 0x5980130005975526 */
+        /*1238*/                   FFMA R39, R85.reuse, R88, R39;                               /* 0x5980138005875527 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1248*/                   FFMA R45, R87.reuse, R90.reuse, R45;                         /* 0x5980168005a7572d */
+        /*1250*/                   FFMA R44, R87, R91.reuse, R44;                               /* 0x5980160005b7572c */
+        /*1258*/                   FFMA R46, R85.reuse, R91.reuse, R46;                         /* 0x5980170005b7552e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1268*/                   FFMA R47, R85, R90.reuse, R47;                               /* 0x5980178005a7552f */
+        /*1270*/                   FFMA R41, R86.reuse, R90.reuse, R41;                         /* 0x5980148005a75629 */
+        /*1278*/                   FFMA R40, R86, R91.reuse, R40;                               /* 0x5980140005b75628 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1288*/                   FFMA R42, R84.reuse, R91.reuse, R42;                         /* 0x5980150005b7542a */
+        /*1290*/                   FFMA R43, R84, R90.reuse, R43;                               /* 0x5980158005a7542b */
+        /*1298*/                   FFMA R13, R83.reuse, R90.reuse, R13;                         /* 0x5980068005a7530d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*12a8*/                   FFMA R12, R83, R91.reuse, R12;                               /* 0x5980060005b7530c */
+        /*12b0*/                   FFMA R14, R81.reuse, R91.reuse, R14;                         /* 0x5980070005b7510e */
+        /*12b8*/                   FFMA R15, R81, R90.reuse, R15;                               /* 0x5980078005a7510f */
+                                                                                                /* 0x081fc0c0fe2607f1 */
+        /*12c8*/                   FFMA R9, R82.reuse, R90.reuse, R9;                           /* 0x5980048005a75209 */
+        /*12d0*/                   FFMA R8, R82.reuse, R91.reuse, R8;                           /* 0x5980040005b75208 */
+        /*12d8*/         {         FFMA R10, R80.reuse, R91, R10;                               /* 0x5980050005b7500a */
+        /*12e8*/               @P0 STS.128 [R118], R96;        }                                /* 0x181f8440fe2017f1 */
+                                                                                                /* 0xef5e000000007660 */
+        /*12f0*/                   FFMA R11, R80.reuse, R90, R11;                               /* 0x5980058005a7500b */
+        /*12f8*/                   FFMA R17, R82.reuse, R92.reuse, R17;                         /* 0x5980088005c75211 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*1308*/                   FFMA R16, R82, R93.reuse, R16;                               /* 0x5980080005d75210 */
+        /*1310*/         {         FFMA R18, R80.reuse, R93.reuse, R18;                         /* 0x5980090005d75012 */
+        /*1318*/               @P0 STS.128 [R118+0x200], R100;        }                         /* 0xef5e000020007664 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1328*/                   FFMA R19, R80, R92.reuse, R19;                               /* 0x5980098005c75013 */
+        /*1330*/                   FFMA R21, R83.reuse, R92.reuse, R21;                         /* 0x59800a8005c75315 */
+        /*1338*/                   FFMA R20, R83, R93.reuse, R20;                               /* 0x59800a0005d75314 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1348*/                   FFMA R22, R81.reuse, R93.reuse, R22;                         /* 0x59800b0005d75116 */
+        /*1350*/                   FFMA R23, R81, R92.reuse, R23;                               /* 0x59800b8005c75117 */
+        /*1358*/                   FFMA R49, R86.reuse, R92.reuse, R49;                         /* 0x5980188005c75631 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1368*/                   FFMA R48, R86, R93.reuse, R48;                               /* 0x5980180005d75630 */
+        /*1370*/                   FFMA R50, R84.reuse, R93.reuse, R50;                         /* 0x5980190005d75432 */
+        /*1378*/                   FFMA R51, R84, R92.reuse, R51;                               /* 0x5980198005c75433 */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*1388*/                   FFMA R53, R87.reuse, R92.reuse, R53;                         /* 0x59801a8005c75735 */
+        /*1390*/                   FFMA R52, R87.reuse, R93.reuse, R52;                         /* 0x59801a0005d75734 */
+        /*1398*/                   FFMA R54, R85.reuse, R93, R54;                               /* 0x59801b0005d75536 */
+                                                                                                /* 0x101fc4c0fe2207f1 */
+        /*13a8*/                   FFMA R55, R85.reuse, R92, R55;                               /* 0x59801b8005c75537 */
+        /*13b0*/                   FFMA R61, R87.reuse, R94.reuse, R61;                         /* 0x59801e8005e7573d */
+        /*13b8*/                   FFMA R60, R87, R95.reuse, R60;                               /* 0x59801e0005f7573c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*13c8*/                   FFMA R62, R85.reuse, R95.reuse, R62;                         /* 0x59801f0005f7553e */
+        /*13d0*/                   FFMA R63, R85, R94.reuse, R63;                               /* 0x59801f8005e7553f */
+        /*13d8*/                   FFMA R57, R86.reuse, R94.reuse, R57;                         /* 0x59801c8005e75639 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*13e8*/                   FFMA R56, R86, R95.reuse, R56;                               /* 0x59801c0005f75638 */
+        /*13f0*/                   FFMA R58, R84.reuse, R95.reuse, R58;                         /* 0x59801d0005f7543a */
+        /*13f8*/                   FFMA R59, R84, R94.reuse, R59;                               /* 0x59801d8005e7543b */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1408*/                   FFMA R29, R83.reuse, R94.reuse, R29;                         /* 0x59800e8005e7531d */
+        /*1410*/                   FFMA R28, R83, R95.reuse, R28;                               /* 0x59800e0005f7531c */
+        /*1418*/                   FFMA R30, R81.reuse, R95.reuse, R30;                         /* 0x59800f0005f7511e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1428*/                   FFMA R31, R81, R94.reuse, R31;                               /* 0x59800f8005e7511f */
+        /*1430*/                   FFMA R25, R82.reuse, R94.reuse, R25;                         /* 0x59800c8005e75219 */
+        /*1438*/                   FFMA R24, R82, R95.reuse, R24;                               /* 0x59800c0005f75218 */
+                                                                                                /* 0x183fc000fe2207f1 */
+        /*1448*/                   FFMA R26, R80.reuse, R95, R26;                               /* 0x59800d0005f7501a */
+        /*1450*/                   FFMA R27, R80, R94, R27;                                     /* 0x59800d8005e7501b */
+        /*1458*/         {         FFMA R1, R66.reuse, R72.reuse, R1;                           /* 0x5980008004874201 */
+        /*1468*/                   LDS.U.128 R80, [R114+0x700];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100070077250 */
+        /*1470*/                   FFMA R0, R66, R73.reuse, R0;                                 /* 0x5980000004974200 */
+        /*1478*/         {         FFMA R2, R64.reuse, R73.reuse, R2;                           /* 0x5980010004974002 */
+        /*1488*/                   LDS.U.128 R88, [R115+0x700];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100070077358 */
+        /*1490*/                   FFMA R3, R64, R72.reuse, R3;                                 /* 0x5980018004874003 */
+        /*1498*/         {         FFMA R5, R67.reuse, R72.reuse, R5;                           /* 0x5980028004874305 */
+        /*14a8*/                   LDS.U.128 R84, [R114+0x780];        }                        /* 0x181fc080fe2007f1 */
+                                                                                                /* 0xef4e100078077254 */
+        /*14b0*/                   FFMA R4, R67, R73.reuse, R4;                                 /* 0x5980020004974304 */
+        /*14b8*/         {         FFMA R6, R65.reuse, R73.reuse, R6;                           /* 0x5980030004974106 */
+        /*14c8*/                   LDS.U.128 R92, [R115+0x780];        }                        /* 0x181fc480fe200711 */
+                                                                                                /* 0xef4e10007807735c */
+        /*14d0*/                   FFMA R7, R65, R72.reuse, R7;                                 /* 0x5980038004874107 */
+        /*14d8*/                   FFMA R33, R70.reuse, R72.reuse, R33;                         /* 0x5980108004874621 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*14e8*/                   FFMA R32, R70, R73.reuse, R32;                               /* 0x5980100004974620 */
+        /*14f0*/                   FFMA R34, R68.reuse, R73.reuse, R34;                         /* 0x5980110004974422 */
+        /*14f8*/                   FFMA R35, R68, R72.reuse, R35;                               /* 0x5980118004874423 */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*1508*/                   FFMA R37, R71.reuse, R72.reuse, R37;                         /* 0x5980128004874725 */
+        /*1510*/                   FFMA R36, R71.reuse, R73.reuse, R36;                         /* 0x5980120004974724 */
+        /*1518*/                   FFMA R38, R69.reuse, R73, R38;                               /* 0x5980130004974526 */
+                                                                                                /* 0x101fc4c0fe2207f1 */
+        /*1528*/                   FFMA R39, R69.reuse, R72, R39;                               /* 0x5980138004874527 */
+        /*1530*/                   FFMA R45, R71.reuse, R74.reuse, R45;                         /* 0x5980168004a7472d */
+        /*1538*/                   FFMA R44, R71, R75.reuse, R44;                               /* 0x5980160004b7472c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1548*/                   FFMA R46, R69.reuse, R75.reuse, R46;                         /* 0x5980170004b7452e */
+        /*1550*/                   FFMA R47, R69, R74.reuse, R47;                               /* 0x5980178004a7452f */
+        /*1558*/                   FFMA R41, R70.reuse, R74.reuse, R41;                         /* 0x5980148004a74629 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1568*/                   FFMA R40, R70, R75.reuse, R40;                               /* 0x5980140004b74628 */
+        /*1570*/                   FFMA R42, R68.reuse, R75.reuse, R42;                         /* 0x5980150004b7442a */
+        /*1578*/                   FFMA R43, R68, R74.reuse, R43;                               /* 0x5980158004a7442b */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1588*/                   FFMA R13, R67.reuse, R74.reuse, R13;                         /* 0x5980068004a7430d */
+        /*1590*/                   FFMA R12, R67, R75.reuse, R12;                               /* 0x5980060004b7430c */
+        /*1598*/                   FFMA R14, R65.reuse, R75.reuse, R14;                         /* 0x5980070004b7410e */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*15a8*/                   FFMA R15, R65, R74.reuse, R15;                               /* 0x5980078004a7410f */
+        /*15b0*/                   FFMA R9, R66.reuse, R74.reuse, R9;                           /* 0x5980048004a74209 */
+        /*15b8*/                   FFMA R8, R66.reuse, R75.reuse, R8;                           /* 0x5980040004b74208 */
+                                                                                                /* 0x081fc404fe2207f0 */
+        /*15c8*/         {         FFMA R10, R64.reuse, R75, R10;                               /* 0x5980050004b7400a */
+        /*15d0*/               @P0 STS.128 [R118+0x400], R104;        }                         /* 0xef5e000040007668 */
+        /*15d8*/                   FFMA R11, R64.reuse, R74, R11;                               /* 0x5980058004a7400b */
+                                                                                                /* 0x181fc080fe2607e1 */
+        /*15e8*/                   FFMA R17, R66.reuse, R76.reuse, R17;                         /* 0x5980088004c74211 */
+        /*15f0*/                   FFMA R16, R66, R77.reuse, R16;                               /* 0x5980080004d74210 */
+        /*15f8*/         {         FFMA R18, R64.reuse, R77.reuse, R18;                         /* 0x5980090004d74012 */
+        /*1608*/               @P0 STS.128 [R118+0x600], R108;        }                         /* 0x181fc480fe2007f1 */
+                                                                                                /* 0xef5e00006000766c */
+        /*1610*/                   FFMA R19, R64, R76.reuse, R19;                               /* 0x5980098004c74013 */
+        /*1618*/                   FFMA R21, R67.reuse, R76.reuse, R21;                         /* 0x59800a8004c74315 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1628*/                   FFMA R20, R67, R77.reuse, R20;                               /* 0x59800a0004d74314 */
+        /*1630*/                   FFMA R22, R65.reuse, R77.reuse, R22;                         /* 0x59800b0004d74116 */
+        /*1638*/                   FFMA R23, R65, R76.reuse, R23;                               /* 0x59800b8004c74117 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1648*/                   FFMA R49, R70.reuse, R76.reuse, R49;                         /* 0x5980188004c74631 */
+        /*1650*/                   FFMA R48, R70, R77.reuse, R48;                               /* 0x5980180004d74630 */
+        /*1658*/                   FFMA R50, R68.reuse, R77.reuse, R50;                         /* 0x5980190004d74432 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*1668*/                   FFMA R51, R68, R76.reuse, R51;                               /* 0x5980198004c74433 */
+        /*1670*/                   FFMA R53, R71.reuse, R76.reuse, R53;                         /* 0x59801a8004c74735 */
+        /*1678*/                   FFMA R52, R71.reuse, R77.reuse, R52;                         /* 0x59801a0004d74734 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*1688*/                   FFMA R54, R69.reuse, R77, R54;                               /* 0x59801b0004d74536 */
+        /*1690*/                   FFMA R55, R69.reuse, R76, R55;                               /* 0x59801b8004c74537 */
+        /*1698*/                   FFMA R61, R71.reuse, R78.reuse, R61;                         /* 0x59801e8004e7473d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*16a8*/                   FFMA R60, R71, R79.reuse, R60;                               /* 0x59801e0004f7473c */
+        /*16b0*/                   FFMA R62, R69.reuse, R79.reuse, R62;                         /* 0x59801f0004f7453e */
+        /*16b8*/                   FFMA R63, R69, R78.reuse, R63;                               /* 0x59801f8004e7453f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*16c8*/                   FFMA R57, R70.reuse, R78.reuse, R57;                         /* 0x59801c8004e74639 */
+        /*16d0*/                   FFMA R56, R70, R79.reuse, R56;                               /* 0x59801c0004f74638 */
+        /*16d8*/                   FFMA R58, R68.reuse, R79.reuse, R58;                         /* 0x59801d0004f7443a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*16e8*/                   FFMA R59, R68, R78.reuse, R59;                               /* 0x59801d8004e7443b */
+        /*16f0*/                   FFMA R29, R67.reuse, R78.reuse, R29;                         /* 0x59800e8004e7431d */
+        /*16f8*/                   FFMA R28, R67, R79.reuse, R28;                               /* 0x59800e0004f7431c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1708*/                   FFMA R30, R65.reuse, R79.reuse, R30;                         /* 0x59800f0004f7411e */
+        /*1710*/                   FFMA R31, R65, R78.reuse, R31;                               /* 0x59800f8004e7411f */
+        /*1718*/                   FFMA R25, R66.reuse, R78.reuse, R25;                         /* 0x59800c8004e74219 */
+                                                                                                /* 0x003fd440fe0407f1 */
+        /*1728*/                   FFMA R24, R66, R79.reuse, R24;                               /* 0x59800c0004f74218 */
+        /*1730*/         {         FFMA R26, R64.reuse, R79, R26;                               /* 0x59800d0004f7401a */
+        /*1738*/                   BAR.SYNC 0x0;        }                                       /* 0xf0a81b8000070000 */
+                                                                                                /* 0x001fc400fe2007f1 */
+        /*1748*/               @P0 LOP.XOR R114, R114, 0x1000;                                  /* 0x3847040100007272 */
+        /*1750*/               @P0 LOP.XOR R115, R115, 0x1000;                                  /* 0x3847040100007373 */
+        /*1758*/               @P0 LOP.XOR R118, R118, 0x1000;                                  /* 0x3847040100007676 */
+                                                                                                /* 0x001fc4c0fe0007f1 */
+        /*1768*/                   FFMA R27, R64, R78, R27;                                     /* 0x59800d8004e7401b */
+        /*1770*/         {         FFMA R1, R82.reuse, R88.reuse, R1;                           /* 0x5980008005875201 */
+        /*1778*/               @P0 LDS.U.128 R64, [R114];        }                              /* 0xef4e100000007240 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*1788*/                   FFMA R0, R82, R89.reuse, R0;                                 /* 0x5980000005975200 */
+        /*1790*/         {         FFMA R2, R80.reuse, R89.reuse, R2;                           /* 0x5980010005975002 */
+        /*1798*/               @P0 LDS.U.128 R72, [R115];        }                              /* 0xef4e100000007348 */
+                                                                                                /* 0x001fc4c0fe0407f1 */
+        /*17a8*/                   FFMA R3, R80, R88.reuse, R3;                                 /* 0x5980018005875003 */
+        /*17b0*/         {         FFMA R5, R83.reuse, R88.reuse, R5;                           /* 0x5980028005875305 */
+        /*17b8*/               @P0 LDS.U.128 R68, [R114+0x80];        }                         /* 0xef4e100008007244 */
+                                                                                                /* 0x001c44c0fe0407f1 */
+        /*17c8*/                   FFMA R4, R83, R89.reuse, R4;                                 /* 0x5980020005975304 */
+        /*17d0*/         {         FFMA R6, R81.reuse, R89.reuse, R6;                           /* 0x5980030005975106 */
+        /*17d8*/               @P0 LDS.U.128 R76, [R115+0x80];        }                         /* 0xef4e10000800734c */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*17e8*/                   FFMA R7, R81, R88.reuse, R7;                                 /* 0x5980038005875107 */
+        /*17f0*/                   FFMA R33, R86.reuse, R88.reuse, R33;                         /* 0x5980108005875621 */
+        /*17f8*/                   FFMA R32, R86, R89.reuse, R32;                               /* 0x5980100005975620 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1808*/                   FFMA R34, R84.reuse, R89.reuse, R34;                         /* 0x5980110005975422 */
+        /*1810*/                   FFMA R35, R84, R88.reuse, R35;                               /* 0x5980118005875423 */
+        /*1818*/                   FFMA R37, R87.reuse, R88.reuse, R37;                         /* 0x5980128005875725 */
+                                                                                                /* 0x081fc440fe2607f1 */
+        /*1828*/                   FFMA R36, R87.reuse, R89.reuse, R36;                         /* 0x5980120005975724 */
+        /*1830*/                   FFMA R38, R85.reuse, R89, R38;                               /* 0x5980130005975526 */
+        /*1838*/                   FFMA R39, R85.reuse, R88, R39;                               /* 0x5980138005875527 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1848*/                   FFMA R45, R87.reuse, R90.reuse, R45;                         /* 0x5980168005a7572d */
+        /*1850*/                   FFMA R44, R87, R91.reuse, R44;                               /* 0x5980160005b7572c */
+        /*1858*/                   FFMA R46, R85.reuse, R91.reuse, R46;                         /* 0x5980170005b7552e */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1868*/                   FFMA R47, R85, R90.reuse, R47;                               /* 0x5980178005a7552f */
+        /*1870*/                   FFMA R41, R86.reuse, R90.reuse, R41;                         /* 0x5980148005a75629 */
+        /*1878*/                   FFMA R40, R86, R91.reuse, R40;                               /* 0x5980140005b75628 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1888*/                   FFMA R42, R84.reuse, R91.reuse, R42;                         /* 0x5980150005b7542a */
+        /*1890*/                   FFMA R43, R84, R90.reuse, R43;                               /* 0x5980158005a7542b */
+        /*1898*/                   FFMA R13, R83.reuse, R90.reuse, R13;                         /* 0x5980068005a7530d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*18a8*/                   FFMA R12, R83, R91.reuse, R12;                               /* 0x5980060005b7530c */
+        /*18b0*/                   FFMA R14, R81.reuse, R91.reuse, R14;                         /* 0x5980070005b7510e */
+        /*18b8*/                   FFMA R15, R81, R90.reuse, R15;                               /* 0x5980078005a7510f */
+                                                                                                /* 0x081fc4c0fe2607f1 */
+        /*18c8*/                   FFMA R9, R82.reuse, R90.reuse, R9;                           /* 0x5980048005a75209 */
+        /*18d0*/                   FFMA R8, R82.reuse, R91.reuse, R8;                           /* 0x5980040005b75208 */
+        /*18d8*/                   FFMA R10, R80.reuse, R91, R10;                               /* 0x5980050005b7500a */
+                                                                                                /* 0x101fc4c0fc2207f1 */
+        /*18e8*/                   FFMA R11, R80.reuse, R90, R11;                               /* 0x5980058005a7500b */
+        /*18f0*/                   FFMA R17, R82.reuse, R92.reuse, R17;                         /* 0x5980088005c75211 */
+        /*18f8*/                   FFMA R16, R82, R93.reuse, R16;                               /* 0x5980080005d75210 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1908*/                   FFMA R18, R80.reuse, R93.reuse, R18;                         /* 0x5980090005d75012 */
+        /*1910*/                   FFMA R19, R80, R92.reuse, R19;                               /* 0x5980098005c75013 */
+        /*1918*/                   FFMA R21, R83.reuse, R92.reuse, R21;                         /* 0x59800a8005c75315 */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*1928*/                   FFMA R20, R83, R93.reuse, R20;                               /* 0x59800a0005d75314 */
+        /*1930*/                   FFMA R22, R81.reuse, R93.reuse, R22;                         /* 0x59800b0005d75116 */
+        /*1938*/                   FFMA R23, R81, R92.reuse, R23;                               /* 0x59800b8005c75117 */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1948*/                   FFMA R49, R86.reuse, R92.reuse, R49;                         /* 0x5980188005c75631 */
+        /*1950*/                   FFMA R48, R86, R93.reuse, R48;                               /* 0x5980180005d75630 */
+        /*1958*/                   FFMA R50, R84.reuse, R93.reuse, R50;                         /* 0x5980190005d75432 */
+                                                                                                /* 0x181fc4c0fe2407f1 */
+        /*1968*/                   FFMA R51, R84, R92.reuse, R51;                               /* 0x5980198005c75433 */
+        /*1970*/                   FFMA R53, R87.reuse, R92.reuse, R53;                         /* 0x59801a8005c75735 */
+        /*1978*/                   FFMA R52, R87.reuse, R93.reuse, R52;                         /* 0x59801a0005d75734 */
+                                                                                                /* 0x181fc440fe2207f1 */
+        /*1988*/                   FFMA R54, R85.reuse, R93, R54;                               /* 0x59801b0005d75536 */
+        /*1990*/                   FFMA R55, R85.reuse, R92, R55;                               /* 0x59801b8005c75537 */
+        /*1998*/                   FFMA R61, R87.reuse, R94.reuse, R61;                         /* 0x59801e8005e7573d */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*19a8*/                   FFMA R60, R87, R95.reuse, R60;                               /* 0x59801e0005f7573c */
+        /*19b0*/                   FFMA R62, R85.reuse, R95.reuse, R62;                         /* 0x59801f0005f7553e */
+        /*19b8*/                   FFMA R63, R85, R94.reuse, R63;                               /* 0x59801f8005e7553f */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*19c8*/                   FFMA R57, R86.reuse, R94.reuse, R57;                         /* 0x59801c8005e75639 */
+        /*19d0*/                   FFMA R56, R86, R95.reuse, R56;                               /* 0x59801c0005f75638 */
+        /*19d8*/                   FFMA R58, R84.reuse, R95.reuse, R58;                         /* 0x59801d0005f7543a */
+                                                                                                /* 0x101fc4c0fe2407f1 */
+        /*19e8*/                   FFMA R59, R84, R94.reuse, R59;                               /* 0x59801d8005e7543b */
+        /*19f0*/                   FFMA R29, R83.reuse, R94.reuse, R29;                         /* 0x59800e8005e7531d */
+        /*19f8*/                   FFMA R28, R83, R95.reuse, R28;                               /* 0x59800e0005f7531c */
+                                                                                                /* 0x181fc480fe2607f1 */
+        /*1a08*/                   FFMA R30, R81.reuse, R95.reuse, R30;                         /* 0x59800f0005f7511e */
+        /*1a10*/                   FFMA R31, R81, R94.reuse, R31;                               /* 0x59800f8005e7511f */
+        /*1a18*/                   FFMA R25, R82.reuse, R94.reuse, R25;                         /* 0x59800c8005e75219 */
+                                                                                                /* 0x001fc440fe2407f1 */
+        /*1a28*/                   FFMA R24, R82, R95.reuse, R24;                               /* 0x59800c0005f75218 */
+        /*1a30*/                   FFMA R26, R80.reuse, R95, R26;                               /* 0x59800d0005f7501a */
+        /*1a38*/                   FFMA R27, R80, R94, R27;                                     /* 0x59800d8005e7501b */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1a48*/               @P0 IADD R112, R112, R121.reuse;                                 /* 0x5c10000007907070 */
+        /*1a50*/               @P0 IADD R116, R116, R121.reuse;                                 /* 0x5c10000007907474 */
+        /*1a58*/               @P0 IADD R120, R120, R121.reuse;                                 /* 0x5c10000007907878 */
+                                                                                                /* 0x081fc400fca007f0 */
+        /*1a68*/         {     @P0 IADD R124, R124, R121;                                       /* 0x5c10000007907c7c */
+        /*1a70*/               @P0 BRA 0x310;        }                                          /* 0xe2400ffe8980000f */
+        /*1a78*/                   SHR.U32 R80, R123.reuse, 0x1;                                /* 0x3828000000177b50 */
+                                                                                                /* 0x001fc480fe2007f1 */
+        /*1a88*/                   MOV R81, c[0x0][0x158];                                      /* 0x4c98078005670051 */
+        /*1a90*/                   ISCADD R84, R125, R126.reuse, 0x6;                           /* 0x5c18030007e77d54 */
+        /*1a98*/                   MOV R72, c[0x0][0x15c];                                      /* 0x4c98078005770048 */
+                                                                                                /* 0x001fc400fe2007f1 */
+        /*1aa8*/                   ISCADD R92, R123, R126, 0x3;                                 /* 0x5c18018007e77b5c */
+        /*1ab0*/                   LOP.AND R114, R114, 0x7ff;                                   /* 0x384700007ff77272 */
+        /*1ab8*/                   ISCADD R80, R122, R80, 0x6;                                  /* 0x5c18030005077a50 */
+                                                                                                /* 0x001fc440fe2007f1 */
+        /*1ac8*/                   LOP.AND R115, R115, 0x7ff;                                   /* 0x384700007ff77373 */
+        /*1ad0*/                   SHL R77, R81.reuse, 0x2;                                     /* 0x384800000027514d */
+        /*1ad8*/                   ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT;                 /* 0x4b6303800517542f */
+                                                                                                /* 0x081fc400fe2207f1 */
+        /*1ae8*/                   SHL R89, R81.reuse, 0x4;                                     /* 0x3848000000475159 */
+        /*1af0*/                   FMUL R64, R3, R72;                                           /* 0x5c68000004870340 */
+        /*1af8*/                   SHL R91, R81.reuse, 0x5;                                     /* 0x384800000057515b */
+                                                                                                /* 0x001fc400fe2607f1 */
+        /*1b08*/                   XMAD.MRG R74, R80.reuse, R81.H1.reuse, RZ;                   /* 0x5b007fa80517504a */
+        /*1b10*/                   ISCADD R93, R115, R114, 0x4;                                 /* 0x5c1802000727735d */
+        /*1b18*/                   XMAD R73, R80, R81, R84;                                     /* 0x5b002a0005175049 */
+                                                                                                /* 0x001fc400fe2007f1 */
+        /*1b28*/                   SHL R92, R92, 0x2;                                           /* 0x3848000000275c5c */
+        /*1b30*/                   IADD R84, R84, 0x20;                                         /* 0x3810000002075454 */
+        /*1b38*/                   ISCADD R85, R81, -R89, 0x7;                                  /* 0x5c19038005975155 */
+                                                                                                /* 0x001fc480fe2407f1 */
+        /*1b48*/                   FMUL R65, R7, R72.reuse;                                     /* 0x5c68000004870741 */
+        /*1b50*/                   FMUL R66, R1, R72.reuse;                                     /* 0x5c68000004870142 */
+        /*1b58*/                   XMAD.PSL.CBCC R73, R80.H1, R74.H1, R73;                      /* 0x5b30249804a75049 */
+                                                                                                /* 0x101fc400fe2007f1 */
+        /*1b68*/                   IADD R80, R80, -0x1;                                         /* 0x3910007ffff75050 */
+        /*1b70*/                   ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT;                 /* 0x4b63038005175437 */
+        /*1b78*/                   FMUL R67, R5, R72.reuse;                                     /* 0x5c68000004870543 */
+                                                                                                /* 0x001fc480fe2407f1 */
+        /*1b88*/                   FMUL R68, R35, R72.reuse;                                    /* 0x5c68000004872344 */
+        /*1b90*/                   FMUL R69, R39, R72.reuse;                                    /* 0x5c68000004872745 */
+        /*1b98*/                   ISCADD R76, R73, c[0x0][0x140], 0x2;                         /* 0x4c1801000507494c */
+                                                                                                /* 0x001fc440fe2207f1 */
+        /*1ba8*/                   IADD R86, R80.reuse, 0x4;                                    /* 0x3810000000475056 */
+        /*1bb0*/                   IADD R87, R80.reuse, 0x8;                                    /* 0x3810000000875057 */
+        /*1bb8*/                   IADD R88, R80, 0xc;                                          /* 0x3810000000c75058 */
+                                                                                                /* 0x001f9800fe2407f1 */
+        /*1bc8*/                   FMUL R70, R33, R72.reuse;                                    /* 0x5c68000004872146 */
+        /*1bd0*/                   FMUL R71, R37, R72;                                          /* 0x5c68000004872547 */
+        /*1bd8*/                   IADD R76, R76, -R77;                                         /* 0x5c11000004d74c4c */
+                                                                                                /* 0x001fc080fca207f1 */
+        /*1be8*/                   IADD R75, R76.reuse, R89;                                    /* 0x5c10000005974c4b */
+        /*1bf0*/                   IADD R78, R76, R91.reuse;                                    /* 0x5c10000005b74c4e */
+        /*1bf8*/         {         IADD R79, R75, R91;                                          /* 0x5c10000005b74b4f */
+        /*1c08*/                   CAL 0x1f10;        }                                         /* 0x101fc482fe2007f5 */
+                                                                                                /* 0xe260000030000040 */
+        /*1c10*/                   FMUL R64, R2, R72.reuse;                                     /* 0x5c68000004870240 */
+        /*1c18*/                   FMUL R65, R6, R72.reuse;                                     /* 0x5c68000004870641 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1c28*/                   FMUL R66, R0, R72.reuse;                                     /* 0x5c68000004870042 */
+        /*1c30*/                   FMUL R67, R4, R72.reuse;                                     /* 0x5c68000004870443 */
+        /*1c38*/                   FMUL R68, R34, R72.reuse;                                    /* 0x5c68000004872244 */
+                                                                                                /* 0x001fc080fe2407f1 */
+        /*1c48*/                   FMUL R69, R38, R72.reuse;                                    /* 0x5c68000004872645 */
+        /*1c50*/                   FMUL R70, R32, R72.reuse;                                    /* 0x5c68000004872046 */
+        /*1c58*/         {         FMUL R71, R36, R72;                                          /* 0x5c68000004872447 */
+        /*1c68*/                   CAL 0x1f10;        }                                         /* 0x101fc482fe2007f5 */
+                                                                                                /* 0xe26000002a000040 */
+        /*1c70*/                   FMUL R64, R11, R72.reuse;                                    /* 0x5c68000004870b40 */
+        /*1c78*/                   FMUL R65, R15, R72.reuse;                                    /* 0x5c68000004870f41 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1c88*/                   FMUL R66, R9, R72.reuse;                                     /* 0x5c68000004870942 */
+        /*1c90*/                   FMUL R67, R13, R72.reuse;                                    /* 0x5c68000004870d43 */
+        /*1c98*/                   FMUL R68, R43, R72.reuse;                                    /* 0x5c68000004872b44 */
+                                                                                                /* 0x001fc080fe2407f1 */
+        /*1ca8*/                   FMUL R69, R47, R72.reuse;                                    /* 0x5c68000004872f45 */
+        /*1cb0*/                   FMUL R70, R41, R72.reuse;                                    /* 0x5c68000004872946 */
+        /*1cb8*/         {         FMUL R71, R45, R72;                                          /* 0x5c68000004872d47 */
+        /*1cc8*/                   CAL 0x1f10;        }                                         /* 0x101fc482fe2007f5 */
+                                                                                                /* 0xe260000024000040 */
+        /*1cd0*/                   FMUL R64, R10, R72.reuse;                                    /* 0x5c68000004870a40 */
+        /*1cd8*/                   FMUL R65, R14, R72.reuse;                                    /* 0x5c68000004870e41 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1ce8*/                   FMUL R66, R8, R72.reuse;                                     /* 0x5c68000004870842 */
+        /*1cf0*/                   FMUL R67, R12, R72.reuse;                                    /* 0x5c68000004870c43 */
+        /*1cf8*/                   FMUL R68, R42, R72.reuse;                                    /* 0x5c68000004872a44 */
+                                                                                                /* 0x001fc080fe2407f1 */
+        /*1d08*/                   FMUL R69, R46, R72.reuse;                                    /* 0x5c68000004872e45 */
+        /*1d10*/                   FMUL R70, R40, R72.reuse;                                    /* 0x5c68000004872846 */
+        /*1d18*/         {         FMUL R71, R44, R72;                                          /* 0x5c68000004872c47 */
+        /*1d28*/                   CAL 0x1f10;        }                                         /* 0x001fc400fe2007f5 */
+                                                                                                /* 0xe26000001e000040 */
+        /*1d30*/                   IADD R80, R80, 0x1c;                                         /* 0x3810000001c75050 */
+        /*1d38*/                   IADD R86, R86, 0x1c;                                         /* 0x3810000001c75656 */
+                                                                                                /* 0x105fc400fe2007f1 */
+        /*1d48*/                   IADD R87, R87, 0x1c;                                         /* 0x3810000001c75757 */
+        /*1d50*/                   IADD R88, R88, 0x1c;                                         /* 0x3810000001c75858 */
+        /*1d58*/                   IADD R76, R76, R85.reuse;                                    /* 0x5c10000005574c4c */
+                                                                                                /* 0x001fc480fe2407f1 */
+        /*1d68*/                   IADD R75, R75, R85.reuse;                                    /* 0x5c10000005574b4b */
+        /*1d70*/                   IADD R78, R78, R85.reuse;                                    /* 0x5c10000005574e4e */
+        /*1d78*/                   IADD R79, R79, R85;                                          /* 0x5c10000005574f4f */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1d88*/                   FMUL R64, R19, R72.reuse;                                    /* 0x5c68000004871340 */
+        /*1d90*/                   FMUL R65, R23, R72.reuse;                                    /* 0x5c68000004871741 */
+        /*1d98*/                   FMUL R66, R17, R72.reuse;                                    /* 0x5c68000004871142 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1da8*/                   FMUL R67, R21, R72.reuse;                                    /* 0x5c68000004871543 */
+        /*1db0*/                   FMUL R68, R51, R72.reuse;                                    /* 0x5c68000004873344 */
+        /*1db8*/                   FMUL R69, R55, R72.reuse;                                    /* 0x5c68000004873745 */
+                                                                                                /* 0x001fd400fe0407f1 */
+        /*1dc8*/                   FMUL R70, R49, R72.reuse;                                    /* 0x5c68000004873146 */
+        /*1dd0*/         {         FMUL R71, R53, R72;                                          /* 0x5c68000004873547 */
+        /*1dd8*/                   CAL 0x1f10;        }                                         /* 0xe260000013000040 */
+                                                                                                /* 0x101fc480fe2417f1 */
+        /*1de8*/                   FMUL R64, R18, R72.reuse;                                    /* 0x5c68000004871240 */
+        /*1df0*/                   FMUL R65, R22, R72.reuse;                                    /* 0x5c68000004871641 */
+        /*1df8*/                   FMUL R66, R16, R72.reuse;                                    /* 0x5c68000004871042 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1e08*/                   FMUL R67, R20, R72.reuse;                                    /* 0x5c68000004871443 */
+        /*1e10*/                   FMUL R68, R50, R72.reuse;                                    /* 0x5c68000004873244 */
+        /*1e18*/                   FMUL R69, R54, R72.reuse;                                    /* 0x5c68000004873645 */
+                                                                                                /* 0x001fd400fe0407f1 */
+        /*1e28*/                   FMUL R70, R48, R72.reuse;                                    /* 0x5c68000004873046 */
+        /*1e30*/         {         FMUL R71, R52, R72;                                          /* 0x5c68000004873447 */
+        /*1e38*/                   CAL 0x1f10;        }                                         /* 0xe26000000d000040 */
+                                                                                                /* 0x101fc480fe2417f1 */
+        /*1e48*/                   FMUL R64, R27, R72.reuse;                                    /* 0x5c68000004871b40 */
+        /*1e50*/                   FMUL R65, R31, R72.reuse;                                    /* 0x5c68000004871f41 */
+        /*1e58*/                   FMUL R66, R25, R72.reuse;                                    /* 0x5c68000004871942 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1e68*/                   FMUL R67, R29, R72.reuse;                                    /* 0x5c68000004871d43 */
+        /*1e70*/                   FMUL R68, R59, R72.reuse;                                    /* 0x5c68000004873b44 */
+        /*1e78*/                   FMUL R69, R63, R72.reuse;                                    /* 0x5c68000004873f45 */
+                                                                                                /* 0x001fd400fe0407f1 */
+        /*1e88*/                   FMUL R70, R57, R72.reuse;                                    /* 0x5c68000004873946 */
+        /*1e90*/         {         FMUL R71, R61, R72;                                          /* 0x5c68000004873d47 */
+        /*1e98*/                   CAL 0x1f10;        }                                         /* 0xe260000007000040 */
+                                                                                                /* 0x101fc480fe2417f1 */
+        /*1ea8*/                   FMUL R64, R26, R72.reuse;                                    /* 0x5c68000004871a40 */
+        /*1eb0*/                   FMUL R65, R30, R72.reuse;                                    /* 0x5c68000004871e41 */
+        /*1eb8*/                   FMUL R66, R24, R72.reuse;                                    /* 0x5c68000004871842 */
+                                                                                                /* 0x101fc480fe2407f1 */
+        /*1ec8*/                   FMUL R67, R28, R72.reuse;                                    /* 0x5c68000004871c43 */
+        /*1ed0*/                   FMUL R68, R58, R72.reuse;                                    /* 0x5c68000004873a44 */
+        /*1ed8*/                   FMUL R69, R62, R72.reuse;                                    /* 0x5c68000004873e45 */
+                                                                                                /* 0x001fd400fe0407f1 */
+        /*1ee8*/                   FMUL R70, R56, R72.reuse;                                    /* 0x5c68000004873846 */
+        /*1ef0*/         {         FMUL R71, R60, R72;                                          /* 0x5c68000004873c47 */
+        /*1ef8*/                   CAL 0x1f10;        }                                         /* 0xe260000001000040 */
+                                                                                                /* 0x001fc400fe0007f5 */
+        /*1f08*/                   EXIT;                                                        /* 0xe30000000007000f */
+        /*1f10*/         {         IADD R80, R80, 0x1;                                          /* 0x3810000000175050 */
+        /*1f18*/                   STS.128 [R93], R64;        }                                 /* 0xef5e000000075d40 */
+                                                                                                /* 0x001fc000fe2007f0 */
+        /*1f28*/         {         IADD R86, R86, 0x1;                                          /* 0x3810000000175656 */
+        /*1f30*/                   STS.128 [R93+0x80], R68;        }                            /* 0xef5e000008075d44 */
+        /*1f38*/         {         IADD R87, R87, 0x1;                                          /* 0x3810000000175757 */
+        /*1f48*/                   LDS R64, [R92];        }                                     /* 0x001fc400fe0007f1 */
+                                                                                                /* 0xef4c000000075c40 */
+        /*1f50*/         {         IADD R88, R88, 0x1;                                          /* 0x3810000000175858 */
+        /*1f58*/                   LDS R65, [R92+0x80];        }                                /* 0xef4c000008075c41 */
+                                                                                                /* 0x101fc000fe2407f0 */
+        /*1f68*/         {         IADD R76, R76, R77.reuse;                                    /* 0x5c10000004d74c4c */
+        /*1f70*/                   LDS R66, [R92+0x100];        }                               /* 0xef4c000010075c42 */
+        /*1f78*/         {         IADD R75, R75, R77.reuse;                                    /* 0x5c10000004d74b4b */
+        /*1f88*/                   LDS R67, [R92+0x180];        }                               /* 0x001fc480fe0007f1 */
+                                                                                                /* 0xef4c000018075c43 */
+        /*1f90*/         {         IADD R78, R78, R77.reuse;                                    /* 0x5c10000004d74e4e */
+        /*1f98*/                   LDS R68, [R92+0x200];        }                               /* 0xef4c000020075c44 */
+                                                                                                /* 0x081fc000fe2007f0 */
+        /*1fa8*/         {         IADD R79, R79, R77;                                          /* 0x5c10000004d74f4f */
+        /*1fb0*/                   LDS R69, [R92+0x280];        }                               /* 0xef4c000028075c45 */
+        /*1fb8*/         {         ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5;           /* 0x4b63028005275007 */
+        /*1fc8*/                   LDS R70, [R92+0x300];        }                               /* 0x001c4400fe0007f1 */
+                                                                                                /* 0xef4c000030075c46 */
+        /*1fd0*/         {         ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6;                 /* 0x4b6303000527500f */
+        /*1fd8*/                   LDS R71, [R92+0x380];        }                               /* 0xef4c000038075c47 */
+                                                                                                /* 0x003fc400fd2207f2 */
+        /*1fe8*/                   ISETP.LT.AND P2, PT, R86.reuse, c[0x0][0x148], P5;           /* 0x4b63028005275617 */
+        /*1ff0*/                   ISETP.LT.AND P3, PT, R86, c[0x0][0x148], P6;                 /* 0x4b6303000527561f */
+        /*1ff8*/               @P0 STG.CG [R76], R64;                                           /* 0xeedc400000004c40 */
+                                                                                                /* 0x001fc000fe2207f0 */
+        /*2008*/         {         ISETP.LT.AND P0, PT, R87.reuse, c[0x0][0x148], P5;           /* 0x4b63028005275707 */
+        /*2010*/               @P1 STG.CG [R76+0x80], R65;        }                             /* 0xeedc400008014c41 */
+        /*2018*/         {         ISETP.LT.AND P1, PT, R87, c[0x0][0x148], P6;                 /* 0x4b6303000527570f */
+        /*2028*/               @P2 STG.CG [R75], R66;        }                                  /* 0x001fc440fe2007f1 */
+                                                                                                /* 0xeedc400000024b42 */
+        /*2030*/                   ISETP.LT.AND P2, PT, R88.reuse, c[0x0][0x148], P5;           /* 0x4b63028005275817 */
+        /*2038*/               @P3 STG.CG [R75+0x80], R67;                                      /* 0xeedc400008034b43 */
+                                                                                                /* 0x001fc400fe2007e9 */
+        /*2048*/                   ISETP.LT.AND P3, PT, R88, c[0x0][0x148], P6;                 /* 0x4b6303000527581f */
+        /*2050*/               @P0 STG.CG [R78], R68;                                           /* 0xeedc400000004e44 */
+        /*2058*/               @P1 STG.CG [R78+0x80], R69;                                      /* 0xeedc400008014e45 */
+                                                                                                /* 0x001fd4003e2007f2 */
+        /*2068*/               @P2 STG.CG [R79], R70;                                           /* 0xeedc400000024f46 */
+        /*2070*/               @P3 STG.CG [R79+0x80], R71;                                      /* 0xeedc400008034f47 */
+        /*2078*/                   RET;                                                         /* 0xe32000000007000f */
+                                                                                                /* 0x001f8000fc0007ff */
+        /*2088*/                   BRA 0x2088;                                                  /* 0xe2400fffff87000f */
+        /*2090*/                   NOP;                                                         /* 0x50b0000000070f00 */
+        /*2098*/                   NOP;                                                         /* 0x50b0000000070f00 */
+                                                                                                /* 0x001f8000fc0007e0 */
+        /*20a8*/                   NOP;                                                         /* 0x50b0000000070f00 */
+        /*20b0*/                   NOP;                                                         /* 0x50b0000000070f00 */
+        /*20b8*/                   NOP;                                                         /* 0x50b0000000070f00 */
+		................................
+
+
diff --git a/Assembler/PascalAs/t/MaxAs-MaxAs.t b/Assembler/PascalAs/t/MaxAs-MaxAs.t
new file mode 100644
index 0000000..ad9e988
--- /dev/null
+++ b/Assembler/PascalAs/t/MaxAs-MaxAs.t
@@ -0,0 +1,5 @@
+# Smoke test: checks only that the MaxAs::MaxAs module can be loaded.
+# NOTE(review): this file lives under PascalAs/ but loads MaxAs::MaxAs --
+# confirm that this is the intended module name.
+use strict;
+use warnings;
+
+# The plan is declared up front: exactly one test (the use_ok below).
+use Test::More tests => 1;
+# use_ok is wrapped in BEGIN so the module is loaded at compile time.
+BEGIN { use_ok('MaxAs::MaxAs') };
diff --git a/Kernel/Convolution/Kepler/Makefile b/Kernel/Convolution/Kepler/Makefile
new file mode 100644
index 0000000..8f5ee71
--- /dev/null
+++ b/Kernel/Convolution/Kepler/Makefile
@@ -0,0 +1,28 @@
+# Kernels built in two steps: nvcc compiles the .cu stub into a template
+# cubin, then KeplerAs.pl inserts the matching .sass into it.
+BINS := sconv_fprop_K64_N64 sconv_bprop_C64_N64 sconv_update_C128_K128 \
+  sconv_bprop_C1_N64 sconv_fprop_K128_N128 sconv_bprop_C128_N128
+TARGETS := $(addsuffix .cubin, $(BINS))
+TEMPLATES := $(addsuffix _template.cubin, $(BINS))
+
+all: $(BINS) sconv_fprop sconv_bprop sconv_update
+
+# The target name is the kernel name, not a file that gets created, so these
+# rules run on every invocation (they are listed in .PHONY below).
+$(BINS):
+	nvcc -arch sm_35 -m 64 $@.cu -cubin -O3 -o $@_template.cubin
+	KeplerAs.pl -i $@.sass $@_template.cubin $@.cubin
+
+sconv_fprop: sconv_fprop.cu
+	nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart
+
+sconv_bprop: sconv_bprop.cu
+	nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart
+
+sconv_update: sconv_update.cu
+	nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart
+
+# -f: do not fail when some of the artifacts have not been built yet.
+clean:
+	rm -f $(TARGETS) $(TEMPLATES) sconv_fprop sconv_bprop sconv_update
+
+# Prerequisites must be on the same line as .PHONY; a tab-indented line below
+# it would be parsed as a recipe and declare nothing phony.
+.PHONY: all clean $(BINS)
+
+#utils
+# "make print-VAR" shows the flavor and value of the make variable VAR.
+print-% : ; $(info $* is $(flavor $*) variable set to [$($*)]) @true
diff --git a/Kernel/Convolution/Kepler/sconv.h b/Kernel/Convolution/Kepler/sconv.h
new file mode 100644
index 0000000..f98ffad
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv.h
@@ -0,0 +1,96 @@
+#include <vector>
+#include <string>
+#include <map>
+#include <cuda.h>
+#include <iostream>
+#include <sstream>
+#include <stdlib.h>
+#include <math.h>
+
+std::map<std::string, CUfunction> nervana_kernels;
+std::vector<CUmodule> nervana_modules;
+
+// Counts how many times n can be halved (integer division by 2) before it
+// reaches zero. For n >= 0 this is the bit length of n; len_d2b(0) is 0.
+int len_d2b(int n) {
+  int bits = 0;
+  for (int rest = n; rest != 0; rest /= 2) {
+    ++bits;
+  }
+  return bits;
+}
+
+// Finds a multiplier m and shift p so that x / d can be computed as
+// (x * m) >> p for every dividend 0 <= x <= nmax (the kernels use the pair as
+// a multiply followed by a right shift).
+//   nmax : largest dividend that has to be supported
+//   d    : divisor; must be non-zero
+//   m, p : outputs. If no pair is found within 2 * bitlength(nmax) shifts, or
+//          d is zero, m is set to 0 and a message is written to std::cerr.
+// The search is done in exact 64-bit integer arithmetic, so nmax values up to
+// 0xffffffff are handled without wrap-around or floating-point rounding.
+// "nbits", "m" and "p" are echoed to std::cout.
+void magic32(unsigned int nmax, unsigned int d, unsigned int& m, unsigned int& p) {
+  if (d == 0) {
+    std::cerr << "magic32: divisor must be non-zero" << std::endl;
+    m = 0;
+    p = 0;
+    return;
+  }
+  const unsigned long long dd = d;
+  // Widen before adding 1 so that nmax == 0xffffffff does not wrap to 0.
+  const unsigned long long multiples = ((unsigned long long)nmax + 1ULL) / dd;
+  // nc: largest value <= nmax that is congruent to d - 1 modulo d. When no
+  // multiple of d fits below nmax + 1, fall back to the 32-bit maximum, which
+  // is what the previous unsigned wrap-around produced (a conservative bound).
+  const unsigned long long nc = multiples ? multiples * dd - 1ULL : 0xffffffffULL;
+  // Bit length of nmax, counted on the unsigned value itself.
+  unsigned int nbits = 0;
+  for (unsigned int v = nmax; v != 0; v >>= 1) {
+    ++nbits;
+  }
+  std::cout << "nbits " << nbits << std::endl;
+  for (p = 0; p < 2 * nbits + 1; p++) {
+    // pm1 = 2^p - 1; representable for p == 64 even though 2^64 is not.
+    const unsigned long long pm1 = (p >= 64) ? ~0ULL : ((1ULL << p) - 1ULL);
+    const unsigned long long r = pm1 % dd;
+    // 2^p > nc * (d - 1 - r)  <=>  2^p - 1 >= nc * (d - 1 - r).
+    // nc < 2^32 and (d - 1 - r) < 2^32, so the product fits in 64 bits.
+    if (pm1 >= nc * (dd - 1ULL - r)) {
+      // (2^p + d - 1 - r) / d, rewritten as (2^p - 1 - r) / d + 1 so that the
+      // numerator never exceeds 64 bits.
+      m = (unsigned int)((pm1 - r) / dd + 1ULL);
+      std::cout << "m " << m << std::endl;
+      std::cout << "p " << p << std::endl;
+      return;
+    }
+  }
+  // Never leave the outputs uninitialised when the search fails.
+  std::cerr << "magic32: no magic number found for nmax " << nmax
+            << " d " << d << std::endl;
+  m = 0;
+}
+
+// Computes a magic/shift pair through magic32 and then lowers the shift by 32
+// whenever the magic number is not 1.
+// NOTE(review): the -32 adjustment presumably targets consumers that keep
+// only the upper 32 bits of the product -- confirm against the kernels.
+void magic64(unsigned int d, unsigned int& magic, unsigned int& shift) {
+  // d == 3 is the one divisor whose magic number only lands in the high bits
+  // when nmax is 0xffffffff. That bound cannot be used for every divisor
+  // because some of them would then yield a 33-bit magic number.
+  const unsigned long nmax = (d == 3) ? 0xffffffff : 0x7fffffff;
+  magic32(nmax, d, magic, shift);
+  if (magic == 1) {
+    return;
+  }
+  shift -= 32;
+}
+
+// Loads the six convolution cubins "<base_path><kernel>.cubin", looks up the
+// function named after each kernel, and registers module and function in
+// nervana_modules / nervana_kernels. Kernels that are already registered are
+// skipped. Returns false (after logging to std::cerr) on the first failure,
+// true once every kernel is available.
+bool load_kernels(const char* const base_path_cstr) {
+    // A plain array is used on purpose: a vector<string> would be nicer, but
+    // an nvcc bug prevents it (bug report filed).
+    const int NUM_KERNELS = 6;
+    std::string names[NUM_KERNELS] = {
+        "sconv_fprop_K64_N64",
+        "sconv_fprop_K128_N128",
+        "sconv_bprop_C128_N128",
+        "sconv_bprop_C64_N64",
+        "sconv_bprop_C1_N64",
+        "sconv_update_C128_K128"
+    };
+
+    const std::string base_path(base_path_cstr);
+
+    for (int idx = 0; idx < NUM_KERNELS; ++idx) {
+        const std::string& kernel = names[idx];
+
+        // Registered by an earlier call: nothing left to do for this one.
+        if (nervana_kernels.find(kernel) != nervana_kernels.end()) {
+            continue;
+        }
+
+        const std::string path = base_path + kernel + std::string(".cubin");
+
+        CUmodule module;
+        CUresult res = cuModuleLoad(&module, path.c_str());
+        if (res != CUDA_SUCCESS) {
+            std::cerr << "Failed to load: " << kernel << " " << res << std::endl;
+            return false;
+        }
+        nervana_modules.push_back(module);
+
+        CUfunction function;
+        res = cuModuleGetFunction(&function, module, kernel.c_str());
+        if (res != CUDA_SUCCESS) {
+            std::cerr << "Failed to extract: " << kernel << " " << res << std::endl;
+            return false;
+        }
+
+        // The key is known to be absent, so this is a plain insertion.
+        nervana_kernels[kernel] = function;
+    }
+
+    return true;
+}
diff --git a/Kernel/Convolution/Kepler/sconv_bprop.cu b/Kernel/Convolution/Kepler/sconv_bprop.cu
new file mode 100644
index 0000000..de2c980
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_bprop.cu
@@ -0,0 +1,362 @@
+#include "sconv.h"
+
+// Host-side launcher for the sconv_bprop_C128_N128 kernel.
+// Derives the stride and magic-number constants from the tensor dimensions,
+// launches the kernel on a (D*W*H, ceil(C/128), ceil(N/128)) grid with 256
+// threads per block, then prints the first 128 floats of the debug buffer
+// (once before and once after copying it back from the device).
+// Returns false when the debug buffers cannot be set up or the launch fails;
+// terminates the process when the debug copy-back fails.
+bool bprop_C128_N128(float *I, const float *F, const float *O,
+  unsigned int N, unsigned int C, unsigned int K,
+  unsigned int D, unsigned int H, unsigned int W,
+  unsigned int R, unsigned int S, unsigned int T,
+  unsigned int M, unsigned int P, unsigned int Q,
+  unsigned int str_d, unsigned int str_h, unsigned int str_w,
+  unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) {
+  float alpha = 1.0f;
+  unsigned int DHW, WN, HW, HWN, DHWN, CRST, RST, RS;
+  unsigned int MPQ, PQ, QN, PQN, MPQN;
+  unsigned int magic_HW, magic_W;
+  unsigned int shift_HW, shift_W;
+  unsigned int magic_RST, magic_RS, magic_S;
+  unsigned int shift_RST, shift_RS, shift_S;
+  unsigned int magic_Q, shift_Q, magic_PQ, shift_PQ;
+  unsigned int magic_str_w, magic_str_h, magic_str_d;
+  unsigned int shift_str_w, shift_str_h, shift_str_d;
+  // input
+  WN = W * N;
+  HW = H * W;
+  HWN = H * WN;
+  DHW = D * HW;
+  DHWN = D * HWN;
+  // filter
+  RS = R * S;
+  RST = T * RS;
+  // NOTE(review): uses RS rather than RST; identical while T == 1 -- confirm.
+  CRST = C * RS;
+  // output
+  QN = Q * N;
+  PQN = P * QN;
+  MPQN = M * PQN;
+  PQ = P * Q;
+  MPQ = M * P * Q;
+  // magic numbers
+  magic32(MPQ, PQ, magic_PQ, shift_PQ);
+  magic32(PQ, Q, magic_Q, shift_Q);
+  magic32(CRST, RST, magic_RST, shift_RST);
+  magic32(RST + 32, RS, magic_RS, shift_RS);
+  magic32(RS + 32, S, magic_S, shift_S);
+  magic32(W + S - pad_w - 2, str_w, magic_str_w, shift_str_w);
+  magic32(H + R - pad_h - 2, str_h, magic_str_h, shift_str_h);
+  magic32(D + T - pad_d - 2, str_d, magic_str_d, shift_str_d);
+  magic32(DHW, HW, magic_HW, shift_HW);
+  magic32(HW, W, magic_W, shift_W);
+  // test param set up: a zeroed 1024-float device buffer handed to the kernel
+  // as its first argument.
+  float *test_param = NULL;
+  cudaError_t cuda_error;
+  cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024);
+  if (cuda_error == cudaSuccess) {
+    cuda_error = cudaMemset(test_param, 0, sizeof(float) * 1024);
+  }
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " test buffer setup error: " << cuda_error << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  // Kernel arguments, matched positionally against the kernel's 45 parameters.
+  // Host and kernel names are swapped for bprop (e.g. the host's I fills the
+  // kernel's param_O slot and the host's C fills param_K).
+  void *args[45] = {
+    &test_param, &I, &O, &F, &alpha,
+    &N, &C, &M, &P, &Q, &QN, &PQN, &MPQN,
+    &K, &CRST, &RST,
+    &RS, &magic_RS, &shift_RS,
+    &S, &magic_S, &shift_S,
+    &pad_d, &pad_h, &pad_w,
+    &str_d, &str_h, &str_w,
+    &W, &HW, &WN, &HWN, &DHWN,
+    &magic_W, &shift_W,
+    &magic_HW, &shift_HW,
+    &R, &T,
+    &magic_str_w, &shift_str_w,
+    &magic_str_h, &shift_str_h,
+    &magic_str_d, &shift_str_d};
+  int gridDWH = D * W * H;
+  int gridX = gridDWH;
+  int gridY = C / 128 + (C % 128 != 0);
+  int gridZ = N / 128 + (N % 128 != 0);
+  std::string kernel_name = "sconv_bprop_C128_N128";
+  // The eighth argument is the dynamic shared memory size in bytes.
+  CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], gridX, gridY, gridZ, 256, 1, 1,
+    128 * 8 * 4 + RST * 4 * 2 + 8, 0, args, NULL);
+  if (res != CUDA_SUCCESS) {
+    std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  cuCtxSynchronize();
+  // calloc instead of malloc: the "before" dump below must not read
+  // uninitialised memory.
+  float* h_test = (float *)calloc(128, sizeof(float));
+  if (h_test == NULL) {
+    std::cerr << "Line " << __LINE__ << " host allocation failed" << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  for (int i = 0; i < 128; ++i) {
+    std::cout << h_test[i] << " ";
+  }
+  std::cout << std::endl;
+  cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * 128, cudaMemcpyDeviceToHost);
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl;
+    exit(1);
+  }
+  for (int i = 0; i < 128; ++i) {
+    std::cout << h_test[i] << " ";
+  }
+  std::cout << std::endl;
+  // free the host copy and the device debug buffer
+  free(h_test);
+  cudaFree(test_param);
+  return true;
+}
+
+// Host-side launcher for the sconv_bprop_C64_N64 kernel.
+// Derives the stride and magic-number constants from the tensor dimensions,
+// launches the kernel on a (D*W*H, ceil(C/64), ceil(N/64)) grid with 64
+// threads per block and no dynamic shared memory, then prints the first 64
+// floats of the debug buffer (before and after copying it back).
+// Returns false when the debug buffers cannot be set up or the launch fails;
+// terminates the process when the debug copy-back fails.
+bool bprop_C64_N64(float *I, const float *F, const float *O,
+  unsigned int N, unsigned int C, unsigned int K,
+  unsigned int D, unsigned int H, unsigned int W,
+  unsigned int R, unsigned int S, unsigned int T,
+  unsigned int M, unsigned int P, unsigned int Q,
+  unsigned int str_d, unsigned int str_h, unsigned int str_w,
+  unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) {
+  float alpha = 1.0f;
+  unsigned int DHW, WN, HW, HWN, DHWN, CRST, RST, RS;
+  unsigned int MPQ, PQ, QN, PQN, MPQN;
+  unsigned int magic_HW, magic_W;
+  unsigned int shift_HW, shift_W;
+  unsigned int magic_RST, magic_RS, magic_S;
+  unsigned int shift_RST, shift_RS, shift_S;
+  unsigned int magic_Q, shift_Q, magic_PQ, shift_PQ;
+  unsigned int magic_str_w, magic_str_h, magic_str_d;
+  unsigned int shift_str_w, shift_str_h, shift_str_d;
+  // input
+  WN = W * N;
+  HW = H * W;
+  HWN = H * WN;
+  DHW = D * HW;
+  DHWN = D * HWN;
+  // filter
+  RS = R * S;
+  RST = T * RS;
+  // NOTE(review): uses RS rather than RST; identical while T == 1 -- confirm.
+  CRST = C * RS;
+  // output
+  QN = Q * N;
+  PQN = P * QN;
+  MPQN = M * PQN;
+  PQ = P * Q;
+  MPQ = M * P * Q;
+  // magic numbers
+  magic32(MPQ, PQ, magic_PQ, shift_PQ);
+  magic32(PQ, Q, magic_Q, shift_Q);
+  magic32(CRST, RST, magic_RST, shift_RST);
+  magic32(RST + 32, RS, magic_RS, shift_RS);
+  magic32(RS + 32, S, magic_S, shift_S);
+  magic32(W + S - pad_w - 2, str_w, magic_str_w, shift_str_w);
+  magic32(H + R - pad_h - 2, str_h, magic_str_h, shift_str_h);
+  magic32(D + T - pad_d - 2, str_d, magic_str_d, shift_str_d);
+  magic32(DHW, HW, magic_HW, shift_HW);
+  magic32(HW, W, magic_W, shift_W);
+  // test param set up: a zeroed 1024-float device buffer handed to the kernel
+  // as its first argument.
+  float *test_param = NULL;
+  cudaError_t cuda_error;
+  cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024);
+  if (cuda_error == cudaSuccess) {
+    cuda_error = cudaMemset(test_param, 0, sizeof(float) * 1024);
+  }
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " test buffer setup error: " << cuda_error << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  // Kernel arguments, passed positionally.
+  void *args[45] = {
+    &test_param, &I, &O, &F, &alpha,
+    &N, &C, &M, &P, &Q, &QN, &PQN, &MPQN,
+    &K, &CRST, &RST,
+    &RS, &magic_RS, &shift_RS,
+    &S, &magic_S, &shift_S,
+    &pad_d, &pad_h, &pad_w,
+    &str_d, &str_h, &str_w,
+    &W, &HW, &WN, &HWN, &DHWN,
+    &magic_W, &shift_W,
+    &magic_HW, &shift_HW,
+    &R, &T,
+    &magic_str_w, &shift_str_w,
+    &magic_str_h, &shift_str_h,
+    &magic_str_d, &shift_str_d};
+  int gridDWH = D * W * H;
+  int gridX = gridDWH;
+  int gridY = C / 64 + (C % 64 != 0);
+  int gridZ = N / 64 + (N % 64 != 0);
+  std::string kernel_name = "sconv_bprop_C64_N64";
+  CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], gridX, gridY, gridZ, 64, 1, 1,
+    0, 0, args, NULL);
+  if (res != CUDA_SUCCESS) {
+    std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  cuCtxSynchronize();
+  // calloc instead of malloc: the "before" dump below must not read
+  // uninitialised memory.
+  float* h_test = (float *)calloc(64, sizeof(float));
+  if (h_test == NULL) {
+    std::cerr << "Line " << __LINE__ << " host allocation failed" << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  for (int i = 0; i < 64; ++i) {
+    std::cout << h_test[i] << " ";
+  }
+  std::cout << std::endl;
+  cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * 64, cudaMemcpyDeviceToHost);
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl;
+    exit(1);
+  }
+  for (int i = 0; i < 64; ++i) {
+    std::cout << h_test[i] << " ";
+  }
+  std::cout << std::endl;
+  // free the host copy and the device debug buffer
+  free(h_test);
+  cudaFree(test_param);
+  return true;
+}
+
+// Host-side launcher for the sconv_bprop_C1_N64 kernel.
+// Derives the stride and magic-number constants from the tensor dimensions,
+// launches the kernel on a (MPQ, ceil(CRST/32), ceil(N/64)) grid with 32
+// threads per block and no dynamic shared memory, then prints the first 32
+// floats of the debug buffer.
+// Returns false when the debug buffers cannot be set up or the launch fails;
+// terminates the process when the debug copy-back fails.
+bool bprop_C1_N64(float *I, const float *F, const float *O,
+  unsigned int N, unsigned int C, unsigned int K,
+  unsigned int D, unsigned int H, unsigned int W,
+  unsigned int R, unsigned int S, unsigned int T,
+  unsigned int M, unsigned int P, unsigned int Q,
+  unsigned int str_d, unsigned int str_h, unsigned int str_w,
+  unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) {
+  float alpha = 1.0f;
+  unsigned int WN, HWN, DHWN, CRST, RST, RS;
+  unsigned int MPQ, PQ, QN, PQN, MPQN;
+  unsigned int magic_RST, magic_RS, magic_S;
+  unsigned int shift_RST, shift_RS, shift_S;
+  unsigned int magic_Q, shift_Q, magic_PQ, shift_PQ;
+  unsigned int magic_str_w, magic_str_h, magic_str_d;
+  unsigned int shift_str_w, shift_str_h, shift_str_d;
+  unsigned int CRST32, MPQN32;
+  // input
+  WN = W * N;
+  HWN = H * WN;
+  DHWN = D * HWN;
+  // filter
+  RS = R * S;
+  RST = T * RS;
+  // NOTE(review): uses RS rather than RST; identical while T == 1 -- confirm.
+  CRST = C * RS;
+  // output
+  QN = Q * N;
+  PQN = P * QN;
+  MPQN = M * PQN;
+  PQ = P * Q;
+  MPQ = M * PQ;
+  // special
+  CRST32 = 32 * CRST;
+  MPQN32 = 32 * MPQN;
+  // magic numbers
+  magic32(MPQ, PQ, magic_PQ, shift_PQ);
+  magic32(PQ, Q, magic_Q, shift_Q);
+  magic32(CRST, RST, magic_RST, shift_RST);
+  magic32(RST + 32, RS, magic_RS, shift_RS);
+  magic32(RS + 32, S, magic_S, shift_S);
+  magic32(W + S - pad_w - 2, str_w, magic_str_w, shift_str_w);
+  magic32(H + R - pad_h - 2, str_h, magic_str_h, shift_str_h);
+  magic32(D + T - pad_d - 2, str_d, magic_str_d, shift_str_d);
+  // test param set up: a zeroed 1024-float device buffer handed to the kernel
+  // as its first argument.
+  float *test_param = NULL;
+  cudaError_t cuda_error;
+  cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024);
+  if (cuda_error == cudaSuccess) {
+    cuda_error = cudaMemset(test_param, 0, sizeof(float) * 1024);
+  }
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " test buffer setup error: " << cuda_error << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  // Kernel arguments, passed positionally.
+  void *args[41] = {
+    &test_param, &I, &O, &F, &alpha,
+    &N, &K, &D, &H, &W, &WN, &HWN, &DHWN,
+    &C, &CRST,
+    &RST, &magic_RST, &shift_RST,
+    &RS, &magic_RS, &shift_RS,
+    &S, &magic_S, &shift_S,
+    &pad_d, &pad_h, &pad_w,
+    &str_d, &str_h, &str_w,
+    &Q, &PQ, &QN, &PQN, &MPQN,
+    &magic_Q, &shift_Q,
+    &magic_PQ, &shift_PQ,
+    &CRST32,
+    &MPQN32};
+  int gridMPQ = MPQ;
+  int gridX = gridMPQ;
+  int gridY = CRST / 32 + (CRST % 32 != 0);
+  int gridZ = N / 64 + (N % 64 != 0);
+  const std::string kernel_name = "sconv_bprop_C1_N64";
+  CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], gridX, gridY, gridZ, 32, 1, 1,
+    0, 0, args, NULL);
+  if (res != CUDA_SUCCESS) {
+    std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  cuCtxSynchronize();
+  float* h_test = (float *)malloc(sizeof(float) * 32);
+  if (h_test == NULL) {
+    std::cerr << "Line " << __LINE__ << " host allocation failed" << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * 32, cudaMemcpyDeviceToHost);
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl;
+    exit(1);
+  }
+  for (int i = 0; i < 32; ++i) {
+    std::cout << h_test[i] << " ";
+  }
+  std::cout << std::endl;
+  // free the host copy and the device debug buffer
+  free(h_test);
+  cudaFree(test_param);
+  return true;
+}
+
+// Terminates the program with a diagnostic when a CUDA runtime call failed.
+static void bprop_check_cuda(cudaError_t err, const char* what, int line) {
+  if (err != cudaSuccess) {
+    std::cerr << "Line " << line << " " << what << " error: " << err << std::endl;
+    exit(1);
+  }
+}
+
+// Terminates the program when a host allocation returned NULL.
+static void bprop_check_host(const void* ptr, int line) {
+  if (ptr == NULL) {
+    std::cerr << "Line " << line << " host allocation failed" << std::endl;
+    exit(1);
+  }
+}
+
+// Test driver for the backward-propagation convolution kernels.
+// Usage: sconv_bprop [C]   (C defaults to 192)
+// Fills O and F with ones, loads the cubins from the current directory,
+// picks a kernel from C (not a multiple of 64 -> C1_N64, C <= 64 -> C64_N64,
+// otherwise C128_N128), runs it and prints the first values of the result.
+int main(int argc, char** argv) {
+  // init
+  cudaFree(0);
+  // params
+  float *d_I, *d_F, *d_O;
+  unsigned int N = 128, C = 192, K = 192, D = 1, H = 13, W = 13, T = 1, R = 12, S = 12;
+  unsigned int str_d = 1, str_h = 1, str_w = 1;
+  unsigned int pad_d = 0, pad_h = 0, pad_w = 0;
+  unsigned int M, P, Q;
+  // 32, 64, or 128
+  if (argc > 1) {
+    // Reject zero, negative and non-numeric values instead of launching an
+    // empty grid.
+    const int requested_C = atoi(argv[1]);
+    if (requested_C <= 0) {
+      std::cerr << "Invalid channel count: " << argv[1] << std::endl;
+      exit(1);
+    }
+    C = requested_C;
+  }
+  M = (D - T + 2 * pad_d) / str_d + 1;
+  P = (H - R + 2 * pad_h) / str_h + 1;
+  Q = (W - S + 2 * pad_w) / str_w + 1;
+  // Element counts are widened to size_t before multiplying so that large
+  // shapes do not wrap around in 32-bit arithmetic.
+  const size_t count_O = (size_t)K * M * P * Q * N;
+  const size_t count_F = (size_t)K * R * S * T * C;
+  const size_t count_I = (size_t)C * D * H * W * N;
+  float *h_O = (float *)malloc(count_O * sizeof(float));
+  bprop_check_host(h_O, __LINE__);
+  for (size_t i = 0; i < count_O; ++i) {
+    h_O[i] = 1;
+  }
+  float *h_F = (float *)malloc(count_F * sizeof(float));
+  bprop_check_host(h_F, __LINE__);
+  for (size_t i = 0; i < count_F; ++i) {
+    h_F[i] = 1;
+  }
+  float* h_I = (float *)malloc(count_I * sizeof(float));
+  bprop_check_host(h_I, __LINE__);
+  // device memory
+  bprop_check_cuda(cudaMalloc((void**)&d_I, sizeof(float) * count_I), "malloc", __LINE__);
+  // NOTE(review): d_F is allocated at twice the filter size, as before;
+  // presumably slack for the kernel -- confirm.
+  bprop_check_cuda(cudaMalloc((void**)&d_F, sizeof(float) * count_F * 2), "malloc", __LINE__);
+  bprop_check_cuda(cudaMalloc((void**)&d_O, sizeof(float) * count_O), "malloc", __LINE__);
+  // memcpy h_O, h_F
+  bprop_check_cuda(cudaMemcpy(d_O, h_O, sizeof(float) * count_O,
+    cudaMemcpyHostToDevice), "memcpy", __LINE__);
+  bprop_check_cuda(cudaMemcpy(d_F, h_F, sizeof(float) * count_F,
+    cudaMemcpyHostToDevice), "memcpy", __LINE__);
+  // load kernels
+  if (!load_kernels("./")) {
+    std::cerr << "Couldn't load all kernels" << std::endl;
+    exit(1);
+  }
+  if (C % 64 != 0) {
+    // launch kernel C1
+    if (!bprop_C1_N64(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) {
+      std::cerr << "Launch error C1" << std::endl;
+      exit(1);
+    }
+  } else {
+    // launch kernel C64
+    if (C <= 64) {
+      if (!bprop_C64_N64(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) {
+        std::cerr << "Launch error C64" << std::endl;
+        exit(1);
+      }
+    } else {
+      if (!bprop_C128_N128(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) {
+        std::cerr << "Launch error C128" << std::endl;
+        exit(1);
+      }
+    }
+  }
+  // output
+  std::cout << "result" << std::endl;
+  bprop_check_cuda(cudaMemcpy(h_I, d_I, sizeof(float) * count_I, cudaMemcpyDeviceToHost),
+    "memcpy", __LINE__);
+  // Print at most the first 128 values, never past the end of the buffer.
+  const size_t shown = count_I < 128 ? count_I : 128;
+  for (size_t i = 0; i < shown; ++i) {
+    std::cout << h_I[i] << " ";
+  }
+  std::cout << std::endl;
+  // free memory
+  free(h_O);
+  free(h_I);
+  free(h_F);
+  cudaFree(d_I);
+  cudaFree(d_F);
+  cudaFree(d_O);
+  // run successfully
+  std::cout << "finish" << std::endl;
+  return 0;
+}
diff --git a/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.cu b/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.cu
new file mode 100644
index 0000000..dddde07
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.cu
@@ -0,0 +1,56 @@
+// Placeholder definition of the sconv_bprop_C128_N128 kernel.
+// The Makefile in this directory compiles this file with nvcc into
+// sconv_bprop_C128_N128_template.cubin and then runs KeplerAs.pl -i with
+// sconv_bprop_C128_N128.sass to produce sconv_bprop_C128_N128.cubin.
+// NOTE(review): presumably only the signature and the shared-memory
+// declaration matter here and the body below is never what runs -- confirm.
+// The 45 parameters are filled positionally by the args array built in
+// bprop_C128_N128() (sconv_bprop.cu), so their order must match it.
+extern "C"
+__global__ void sconv_bprop_C128_N128 (
+  float* param_test,
+  float* param_O,
+  const float* param_I,
+  const float* param_F,
+  float param_alpha,
+  int param_N,
+  int param_K,
+  int param_D,
+  int param_H,
+  int param_W,
+  int param_WN,
+  int param_HWN,
+  int param_DHWN,
+  int param_C,
+  int param_CRST,
+  int param_RST,
+  int param_RS,
+  int param_magic_RS,
+  int param_shift_RS,
+  int param_S,
+  int param_magic_S,
+  int param_shift_S,
+  int param_pad_d,
+  int param_pad_h,
+  int param_pad_w,
+  int param_str_d,
+  int param_str_h,
+  int param_str_w,
+  int param_Q,
+  int param_PQ,
+  int param_QN,
+  int param_PQN,
+  int param_MPQN,
+  int param_magic_Q,
+  int param_shift_Q,
+  int param_magic_PQ,
+  int param_shift_PQ,
+  int param_R,
+  int param_T,
+  int param_magic_str_w,
+  int param_shift_str_w,
+  int param_magic_str_h,
+  int param_shift_str_h,
+  int param_magic_str_d,
+  int param_shift_str_d) {
+  // Statically sized shared buffer of 128 * 8 * 4 + 8 floats.
+  __shared__ float share[128 * 8 * 4 + 8];
+
+  int tid = threadIdx.x;
+
+  // Dummy traffic through shared memory and both output pointers.
+  share[tid] = 1;
+
+  // NOTE(review): 127 - tid is negative for tid > 127, and the host launches
+  // 256 threads per block; presumably harmless because this body is replaced
+  // by the .sass code -- confirm.
+  *param_O = share[127-tid];
+  *param_test = share[127-tid];
+}
diff --git a/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.sass b/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.sass
new file mode 100644
index 0000000..c7cb6e5
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.sass
@@ -0,0 +1,785 @@
+# Kernel: sconv_bprop_C128_N128
+// debug:
+// mode1
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//-:-:-:-:00 SHL tmp_shl, tid, 0x2;
+//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0;
+//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1;
+//-:-:-:-:00 I2F.F32.U32 rst, rst;
+//-:-:-:-:00 ST.E [tmp_param00], rst;
+//-:-:-:-:00 EXIT;
+
+// mode2
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//
+//-:-:-:-:00 MOV32I k, 0x40000000;
+//-:-:-:-:00 ST.E [tmp_param0], k;
+//-:-:-:-:00 EXIT;
+
+// modify steps:
+// XMAD->IMAD
+// shared memory addresses->RZ
+// LDG->LD
+// LEA->MOV, IADD, SHL
+// XMAD.LO2C->IMAD.U32.U32
+// XMAD.PSL->IMAD.U32.U32
+// VMAD->IMAD, IADD
+// MOV->MOV32I
+// IADD3->IADD, IADD
+// POPC
+// ST.CG->ST
+// control code
+// comments
+// LDS.U->LDS
+// register<0-7>->register<0-3>, register<4-7>
+// avoid register conflicts
+// tid->other register
+
+// optimization steps:
+// alexnet2
+// initial->1400
+// bank conflict->1400
+// alignment+dual issue+reuse->2100
+// all ldg.128->1900
+// control codes->2000
+// reduce unnecessary instructions->2100
+// scheduling->1937
+
+<CONSTANT_MAPPING>
+    szShareF  : (128*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<128*8*2 + 128*8*2 + 0>
+    addr_m     : 4x<128*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<128*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<128*8*2 + 128*8*2 + 6>
+    addr_szLut : 4x<128*8*2 + 128*8*2 + 7>
+    addr_lut   : 4x<128*8*2 + 128*8*2 + 8>
+
+    param_test[0]     : c[0x0][0x140]
+    param_test[1]     : c[0x0][0x144]
+    param_O[0]        : c[0x0][0x148]
+    param_O[1]        : c[0x0][0x14c]
+    param_I[0]        : c[0x0][0x150]
+    param_I[1]        : c[0x0][0x154]
+    param_F[0]        : c[0x0][0x158]
+    param_F[1]        : c[0x0][0x15c]
+    param_alpha       : c[0x0][0x160]
+    param_N           : c[0x0][0x164]
+    param_K           : c[0x0][0x168]
+    param_D           : c[0x0][0x16c]
+    param_H           : c[0x0][0x170]
+    param_W           : c[0x0][0x174]
+    param_WN          : c[0x0][0x178]
+    param_HWN         : c[0x0][0x17c]
+    param_DHWN        : c[0x0][0x180]
+    param_C           : c[0x0][0x184]
+    param_KRST        : c[0x0][0x188]
+    param_RST         : c[0x0][0x18c]
+    param_RS          : c[0x0][0x190]
+    param_magic_RS    : c[0x0][0x194]
+    param_shift_RS    : c[0x0][0x198]
+    param_S           : c[0x0][0x19c]
+    param_magic_S     : c[0x0][0x1a0]
+    param_shift_S     : c[0x0][0x1a4]
+    param_pad_d       : c[0x0][0x1a8]
+    param_pad_h       : c[0x0][0x1ac]
+    param_pad_w       : c[0x0][0x1b0]
+    param_str_d       : c[0x0][0x1b4]
+    param_str_h       : c[0x0][0x1b8]
+    param_str_w       : c[0x0][0x1bc]
+    param_Q           : c[0x0][0x1c0]
+    param_PQ          : c[0x0][0x1c4]
+    param_QN          : c[0x0][0x1c8]
+    param_PQN         : c[0x0][0x1cc]
+    param_MPQN        : c[0x0][0x1d0]
+    param_magic_Q     : c[0x0][0x1d4]
+    param_shift_Q     : c[0x0][0x1d8]
+    param_magic_PQ    : c[0x0][0x1dc]
+    param_shift_PQ    : c[0x0][0x1e0]
+    param_R           : c[0x0][0x1e4]
+    param_T           : c[0x0][0x1e8]
+    param_magic_str_w : c[0x0][0x1ec]
+    param_shift_str_w : c[0x0][0x1f0]
+    param_magic_str_h : c[0x0][0x1f4]
+    param_shift_str_h : c[0x0][0x1f8]
+    param_magic_str_d : c[0x0][0x1fc]
+    param_shift_str_d : c[0x0][0x200]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-67 : mpq<0-3>
+    64-67 : m, p, q, tidY
+    68-72 : blkF, blkI, blkMPQ, tid1, tidX
+    73-107 ~ str_d, str_h, str_w, pq, mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, one, rst_prime, x_prime, y_prime, z_prime, ballot, warp_slices, partial, endCRST
+
+    0-63 : czero<00-63>
+
+   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
+   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
+   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
+   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
+   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
+  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
+  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
+  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7
+
+      64-67 : j0Fy<0-3>
+      68-71 : j0Ix<0-3>
+      72-75 : j0Fy<4-7>
+      76-79 : j0Ix<4-7>
+      80-83 : j1Fy<0-3>
+      84-87 : j1Ix<0-3>
+      88-91 : j1Fy<4-7>
+      92-95 : j1Ix<4-7>
+
+      96-97 : trackI<0-1>
+      98-99 : trackF<0-1>
+
+    100-103 : loadI<0-3>
+    104-107 : loadF<0-3>
+    109 : readFs
+    108 : readIs
+    
+    110-114 ~ offsetIn, offsetFk, posCRST, lutSize, lutSizeRcp
+    115-120 ~ writeS, posCRSTf, channel, lutOffset, offsetI, offsetF
+    116-120 ~ tid128, tid, p_and
+    121 : tmp_shl
+
+    122-123 : sliceI, sliceF
+    122-123 : sliceIF<0-1>
+    124-125 ~ offsetIc, offsetFc
+    124-125 : tmp_param<0-1>
+    124-127 ~ addressF0, addressF1, addressI0, addressI1
+
+    72-79  : cs<0-7>
+    80-81  : Out<0-1>
+    82-125 ~ writeCs, readCs, alpha, tidOX, tidOX2, tidOY, to, k, n, MPQN1, MPQN60, MPQN, MPQN4
+
+</REGISTER_MAPPING>
+
+-:-:-:-:00 S2R tid,    SR_TID.X;
+-:-:-:-:00 S2R blkF,   SR_CTAID.Y;
+-:-:-:-:00 S2R blkI,   SR_CTAID.Z;
+-:-:-:-:00 S2R blkMPQ, SR_CTAID.X; # m,p,q stored in x index
+
+-:-:-:-:00 ISETP.GE.AND P0, PT, tid, 32, PT;
+
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+
+<CODE>
+    return join '', map sprintf("-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15;
+</CODE>
+
+// tid <= 255
+// tidX = (tid & 31) << 2
+// tidX = 0 : 4 : 128
+// tidY = tid >> 5
+// tidY = 0 : 1 : 7
+-:-:-:-:00 LOP.AND tidX, tid,  31;
+-:-:-:-:00 SHL     tidX, tidX, 2;
+-:-:-:-:00 SHR.U32 tidY, tid,  5;
+
+// trackF += blkF*128 + tidX
+-:-:-:-:00 ISCADD offsetFk, blkF, tidX, 7;
+
+// trackI += blkI*128 + tidX
+-:-:-:-:00 ISCADD offsetIn, blkI, tidX, 7;
+
+// writeS = (128*tidY + tidX) * 4
+-:-:-:-:00 SHR tidX, tidX, 1; 
+-:-:-:-:00 ISCADD  writeS, tidY, tidX, 7;
+-:-:-:-:00 SHL     writeS, writeS, 2;
+
+// readFs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+// [6][5][4][0] * 8;
+-:-:-:-:00 LOP.AND tid1,   tid,    1;
+-:-:-:-:00 LOP.AND readFs, tid,    0x70;
+-:-:-:-:00 SHR.U32 readFs, readFs, 3;
+-:-:-:-:00 LOP.OR  readFs, readFs, tid1;
+-:-:-:-:00 SHL     readFs, readFs, 3;
+
+// readIs = ((tid & 128) >> 3) | ((tid >> 1) & 7)
+// [3][2][1] * 16;
+-:-:-:-:00 LOP.AND tid128, tid, 128;
+-:-:-:-:00 SHR.U32 tid128, tid128, 3;
+-:-:-:-:00 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1
+-:-:-:-:00 LOP.OR  readIs, readIs, tid128;
+-:-:-:-:00 ISCADD  readIs, readIs, 4x<szShareF>, 3;
+
+-:-:-:-:00 @P0 BRA.U END_SETUP;
+
+-:-:-:-:00 MOV str_d, param_str_d;
+-:-:-:-:00 MOV str_h, param_str_h;
+-:-:-:-:00 MOV str_w, param_str_w;
+-:-:-:-:00 MOV rst, tid;
+-:-:-:-:00 MOV lutStore2, RZ;
+-:-:-:-:00 MOV lutSize, RZ;
+-:-:-:-:00 MOV32I warp_count, 32;
+
+// m  = blkMPQ / PQ
+// pq = blkMPQ % PQ
+-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ;
+-:-:-:-:00 SHR.U32   m, m, param_shift_PQ;
+-:-:-:-:00 IMAD      pq, m, param_PQ, RZ;
+-:-:-:-:00 IADD      pq, -pq, blkMPQ;
+// p = pq / Q
+// q = pq % Q
+-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ;
+-:-:-:-:00 SHR.U32   p, p, param_shift_Q;
+-:-:-:-:00 IMAD      q, p, param_Q, RZ;
+-:-:-:-:00 IADD      q, -q, pq;
+
+-:-:-:-:00 MOV32I dep_thd_mask, -1;
+
+-:-:-:-:00 LOP.AND p_and, p, 1;
+-:-:-:-:00 ISETP.NE.AND P1, PT, p_and, RZ, PT;
+-:-:-:-:00 @P1 IADD q, -q, param_Q;
+-:-:-:-:00 @P1 IADD q, q, dep_thd_mask;
+
+-:-:-:-:00 STS.128 [RZ + addr_m], m;
+
+// qs = q - S + pad_w + 1
+-:-:-:-:00 MOV32I one, 1;
+-:-:-:-:00 IADD qs, q, -param_S;
+-:-:-:-:00 IADD qs, qs, param_pad_w;
+-:-:-:-:00 IADD qs, qs, one;
+
+// pr = p - R + pad_h + 1
+-:-:-:-:00 IADD pr, p, -param_R;
+-:-:-:-:00 IADD pr, pr, param_pad_h;
+-:-:-:-:00 IADD pr, pr, one;
+
+// mt = m - T + pad_d + 1
+-:-:-:-:00 IADD mt, m, -param_T;
+-:-:-:-:00 IADD mt, mt, param_pad_d;
+-:-:-:-:00 IADD mt, mt, one;
+
+-:-:-:-:00 IADD mask_shr, -tid, 32;
+-:-:-:-:00 SHR.U32 dep_thd_mask, dep_thd_mask, mask_shr;
+
+LUT_LOOP:
+
+// warp synchronous loop while warp_count < RST
+-:-:-:-:00 ISETP.LT.AND P0, PT, warp_count, param_RST, PT;
+-:-:-:-:00 IADD warp_count, warp_count, 32;
+// t =  rst / RS
+// rs = rst % RS
+-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, RZ;
+-:-:-:-:00 SHR.U32 t, t, param_shift_RS;
+-:-:-:-:00 IMAD rs, t, param_RS, RZ;
+-:-:-:-:00 IADD rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ;
+-:-:-:-:00 SHR.U32 r, r, param_shift_S;
+-:-:-:-:00 IMAD s, r, param_S, RZ;
+-:-:-:-:00 IADD s, -s, rs;
+// x = qs + s
+// y = pr + r
+// z = mt + t
+-:-:-:-:00 IADD x, qs, s;
+-:-:-:-:00 IADD y, pr, r;
+-:-:-:-:00 IADD z, mt, t;
+-:-:-:-:00 ISETP.GE.AND  P4, PT, x, RZ, PT;
+-:-:-:-:00 ISETP.GE.AND  P5, PT, y, RZ, PT;
+-:-:-:-:00 ISETP.GE.AND  P6, PT, z, RZ, PT;
+// rst_prime = t*RS + r*S + s
+// s = S - s - 1
+-:-:-:-:00 IADD s, -s, param_S;
+-:-:-:-:00 IADD s, s, -one;
+// r = R - r - 1
+-:-:-:-:00 IADD r, -r, param_R;
+-:-:-:-:00 IADD r, r, -one;
+// t = T - t - 1
+-:-:-:-:00 IADD t, -t, param_T;
+-:-:-:-:00 IADD t, t, -one;
+
+-:-:-:-:00 IMAD  rst_prime, r, param_S,  s;
+-:-:-:-:00 IMAD  rst_prime, t, param_RS, rst_prime;
+
+// x_prime = x / str_w
+// x       = x % str_w
+-:-:-:-:00 IMAD    x_prime, x, param_magic_str_w, RZ;
+-:-:-:-:00 SHR.U32 x_prime, x_prime, param_shift_str_w;
+-:-:-:-:00 IMAD tmp_param0, str_w, x_prime, RZ;
+-:-:-:-:00 IADD x, -tmp_param0, x;
+// y_prime = y / str_h
+// y       = y % str_h
+-:-:-:-:00 IMAD    y_prime, y, param_magic_str_h, RZ;
+-:-:-:-:00 SHR.U32 y_prime, y_prime, param_shift_str_h;
+-:-:-:-:00 IMAD tmp_param0, str_h, y_prime, RZ;
+-:-:-:-:00 IADD y, -tmp_param0, y;
+// z_prime = z / str_d
+// z       = z % str_d
+-:-:-:-:00 IMAD    z_prime, z, param_magic_str_d, RZ;
+-:-:-:-:00 SHR.U32 z_prime, z_prime, param_shift_str_d;
+-:-:-:-:00 IMAD tmp_param0, str_d, z_prime, RZ;
+-:-:-:-:00 IADD z, -tmp_param0, z;
+
+// calculate x_prime only when x % str_w == 0
+// it may be greater than Q due to its location
+-:-:-:-:00 ISETP.EQ.AND  P4, PT, x, RZ, P4;
+-:-:-:-:00 ISETP.EQ.AND  P5, PT, y, RZ, P5;
+-:-:-:-:00 ISETP.EQ.AND  P6, PT, z, RZ, P6;
+-:-:-:-:00 ISETP.LT.AND  P4, PT, x_prime, param_W, P4;
+-:-:-:-:00 ISETP.LT.AND  P5, PT, y_prime, param_H, P5;
+-:-:-:-:00 ISETP.LT.AND  P6, PT, z_prime, param_D, P6;
+-:-:-:-:00 PSETP.AND.AND P1, PT, P4, P5, P6;
+
+// sliceI = z_prime*HWN + y_prime*WN + x_prime*N
+-:-:-:-:00 IMAD      sliceI, x_prime, param_N,   RZ;
+-:-:-:-:00 IMAD.U32.U32 sliceI, y_prime, param_WN,  sliceI;
+-:-:-:-:00 IMAD.U32.U32 sliceI, z_prime, param_HWN, sliceI;
+// sliceF = rst_prime * K
+-:-:-:-:00 IMAD sliceF, rst_prime, param_K, RZ;
+
+// Get a mask of all valid slices in the warp
+-:-:-:-:00 VOTE.ANY ballot, PT, P1;
+// Count the total valid slices
+-:-:-:-:00 POPC warp_slices, ballot, ballot;
+// Prepare lutStore for this and next loop
+-:-:-:-:00 @P1 MOV    lutStore, lutStore2;
+-:-:-:-:00 ISCADD lutStore2, warp_slices, lutStore2, 3;
+// Count all the valid slices below this threadid
+-:-:-:-:00 @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
+-:-:-:-:00 @P1 POPC dep_thd_cnt, dep_thd_bits, dep_thd_bits;
+// use the rst increment to space the barrier sync
+-:-:-:-:00 IADD rst, rst, 32;
+// Update the lutStore address from this count
+-:-:-:-:00 @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
+// Store both slice offsets in the lut
+-:-:-:-:00 @P1 STS.64 [lutStore + addr_lut], sliceIF;
+// Keep track of the total size of the lut
+-:-:-:-:00 IADD lutSize, lutSize, warp_slices;
+
+-:-:-:-:00 @P0 BRA.U LUT_LOOP;
+
+// Share the lut size with the other warp
+-:-:-:-:00 STS [RZ + addr_szLut], lutSize;
+
+END_SETUP:
+
+-:-:-:-:00 BAR.SYNC 0;
+
+// Grab the calculated lut size and get its reciprocal
+// Get the total reduction depth
+-:-:-:-:00 LDS lutSize, [RZ + addr_szLut];
+-:-:-:-:00 IMAD endCRST, lutSize, param_C, RZ;
+-:-:-:-:00 I2F.F32.S32 lutSizeRcp, lutSize;
+-:-:-:-:00 MUFU.RCP lutSizeRcp, lutSizeRcp;
+
+// posCRST = endCRST - tidY - 1
+-:-:-:-:00 IADD posCRST, endCRST, -1;
+-:-:-:-:00 IADD posCRST, posCRST, -tidY;
+// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 8 then make a full 8 line fetch.
+-:-:-:-:00 LOP.AND partial, endCRST, 7;
+-:-:-:-:00 ISETP.EQ.AND P1, PT, RZ, partial, PT;
+-:-:-:-:00 @P1 MOV32I partial, 8;
+// channel = posCRST / lutSize
+// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it
+-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST;
+-:-:-:-:00 FMUL channel, posCRSTf, lutSizeRcp;
+-:-:-:-:00 FFMA channel, channel, 5.9604644775390625e-08, channel;
+-:-:-:-:00 F2I.S32.F32.TRUNC channel, channel;
+// lutOffset = (posCRST % lutSize) * 8
+-:-:-:-:00 IMAD tmp_param0, channel, lutSize, RZ;
+-:-:-:-:00 IADD lutOffset, -tmp_param0, posCRST;
+
+-:-:-:-:00 SHL lutOffset, lutOffset, 3;
+// P1 = tidY < partial
+-:-:-:-:00 ISETP.LT.AND P1, PT, tidY, partial, PT;
+// offsetIC = channel * DHWN
+// offsetFC = channel * KRST
+-:-:-:-:00 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ;
+-:-:-:-:00 IMAD offsetFc, channel, param_KRST, RZ;
+// posCRST -= partial
+-:-:-:-:00 IADD posCRST, posCRST, -partial;
+-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut];
+
+// trackF = offsetFK + offsetFC + sliceF + param_F
+-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc;
+-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF;
+// trackI = offsetIN + offsetIC + sliceI + param_I
+-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc;
+-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI;
+
+//-:-:-:-:00 @P1 LEA      trackF0.CC, offsetF, param_F[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;
+-:-:-:-:00 @P1 MOV tmp_param0, param_F[0];
+-:-:-:-:00 @P1 MOV tmp_param1, param_F[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2;
+-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 @P1 IADD.X trackF1, RZ, tmp_param1;
+//-:-:-:-:00 @P1 LEA      trackI0.CC, offsetI, param_I[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;
+-:-:-:-:00 @P1 MOV tmp_param0, param_I[0];
+-:-:-:-:00 @P1 MOV tmp_param1, param_I[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2;
+-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 @P1 IADD.X trackI1, RZ, tmp_param1;
+
+-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF];
+-:-:-:-:00 @!P1 LDS.128 loadF0, [RZ + addr_zero];
+
+-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI];
+-:-:-:-:00 @!P1 LDS.128 loadI0, [RZ + addr_zero];
+
+-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT;
+
+-:-:-:-:00 STS.64 [writeS], loadF0;
+-:-:-:-:00 STS.64 [writeS + 4x<64>], loadF2;
+-:-:-:-:00 STS.64 [writeS + 4x<szShareF>], loadI0;
+-:-:-:-:00 STS.64 [writeS + 4x<szShareF+64>], loadI2;
+
+-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST;
+
+-:-:-:-:00 BAR.SYNC 0;
+-:-:-:-:00 LOP.XOR writeS, writeS, 4x<szShareF + szShareI>;
+
+-:-:-:-:00 LDS.64 j0Ix0, [readIs + 4x<0*128 + 00>];
+-:-:-:-:00 LDS.64 j0Ix2, [readIs + 4x<0*128 + 64>];
+-:-:-:-:00 LDS.64 j0Fy0, [readFs + 4x<0*128 + 00>];
+-:-:-:-:00 LDS.64 j0Fy2, [readFs + 4x<0*128 + 64>];
+
+-:-:-:-:00 LDS.64 j0Ix4, [readIs + 4x<0*128 + 16>];
+-:-:-:-:00 LDS.64 j0Ix6, [readIs + 4x<0*128 + 80>];
+-:-:-:-:00 LDS.64 j0Fy4, [readFs + 4x<0*128 + 32>];
+-:-:-:-:00 LDS.64 j0Fy6, [readFs + 4x<0*128 + 96>];
+
+// channel = posCRST / lutSize
+-:-:-:-:00 @P1 FMUL channel, posCRSTf, lutSizeRcp;
+-:-:-:-:00 @P1 FFMA channel, channel, 5.9604644775390625e-08, channel;
+-:-:-:-:00 @P1 F2I.S32.F32.TRUNC channel, channel;
+// lutOffset = (posCRST % lutSize) * 8
+-:-:-:-:00 @P1 IMAD tmp_param0, channel, lutSize, RZ;
+-:-:-:-:00 @P1 IADD lutOffset, -tmp_param0, posCRST;
+-:-:-:-:00 @P1 SHL lutOffset, lutOffset, 3;
+// offsetIC = channel * DHWN
+// offsetFC = channel * KRST
+-:-:-:-:00 @P1 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ;
+-:-:-:-:00 @P1 IMAD      offsetFc, channel, param_KRST, RZ;
+
+-:-:-:-:00 IADD posCRST, posCRST, -8;
+-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut];
+
+// trackF = offsetFK + offsetFC + sliceF + param_F
+-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc;
+-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF;
+// trackI = offsetIN + offsetIC + sliceI + param_I
+-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc;
+-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI;
+//-:-:-:-:00 @P1 LEA      trackF0.CC, offsetF, param_F[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;
+-:-:-:-:00 MOV addressF0, param_F[0];
+-:-:-:-:00 MOV addressF1, param_F[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2;
+-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, addressF0;
+-:-:-:-:00 @P1 IADD.X trackF1, RZ, addressF1;
+//-:-:-:-:00 @P1 LEA      trackI0.CC, offsetI, param_I[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;
+-:-:-:-:00 MOV addressI0, param_I[0];
+-:-:-:-:00 MOV addressI1, param_I[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2;
+-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, addressI0;
+-:-:-:-:00 @P1 IADD.X trackI1, RZ, addressI1;
+-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF];
+-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI];
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+
+LOOP:
+
+<CODE>
+    # Emits the body of LOOP: 8 unrolled steps (j) of 64 FFMAs (c) each. %insert maps a
+    # slot "j<step>c<ffma>" to the extra instruction emitted right after that FFMA, so
+    # address math, loads, stores and the loop branch are interleaved with the FFMAs.
+    my %insert = (
+        j0c47 => "-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c53 => "-:-:-:-:00 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+        j0c61 => "-:-:-:-:00 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+        j0c63 => "-:-:-:-:00 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        # j1: nudge the channel estimate by an epsilon, then truncate it to an integer
+        j1c47 => "-:-:-:-:00 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j1c63 => "-:-:-:-:00 \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+        # j2: lutOffset = (posCRST - channel*lutSize) * 8, per-channel offsets, posCRST -= 8
+        j2c47 => "-:-:-:-:00 \@P1 IMAD lutOffset, -channel, lutSize, posCRST;\n",
+        j2c53 => "-:-:-:-:00 \@P1 IMAD offsetF, channel, param_KRST, offsetFk;\n",
+        j2c61 => "-:-:-:-:00 \@P1 IMAD offsetI, channel, param_DHWN, offsetIn;\n",
+        j2c62 => "-:-:-:-:00 \@P1 SHL lutOffset, lutOffset, 3;\n",
+        j2c63 => "-:-:-:-:00 IADD posCRST, posCRST, -8;\n",
+        # j3: read the lut entry; under P0 store loadI/loadF to shared memory at writeS
+        j3c47 => "-:-:-:-:00 \@P1 LDS.64 sliceIF, [lutOffset + addr_lut];\n",
+        j3c53 => "-:-:-:-:00 TEXDEPBAR 0x0;\n",
+        j3c61 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<szShareF>], loadI0;\n",
+        j3c62 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<szShareF+64>], loadI2;\n",
+        j3c63 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<0>], loadF0;\n",
+        # j4: add the slice offsets; the trackF0 add at c63 sets the carry flag (.CC)
+        j4c47 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<64>], loadF2;\n",
+        j4c53 => "-:-:-:-:00 \@P1 IADD offsetF, offsetF, sliceF;\n",
+        j4c61 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetF, 0x2;\n",
+        j4c62 => "-:-:-:-:00 \@P1 IADD offsetI, offsetI, sliceI;\n",
+        j4c63 => "-:-:-:-:00 \@P1 IADD trackF0.CC, tmp_shl, addressF0;\n",
+        # j5: consume trackF's carry (IADD.X) before trackI's IADD.CC overwrites the carry flag
+        j5c47 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetI, 0x2;\n",
+        j5c53 => "-:-:-:-:00 \@P1 IADD.X trackF1, RZ, addressF1;\n",
+        j5c61 => "-:-:-:-:00 \@P1 IADD trackI0.CC, tmp_shl, addressI0;\n",
+        j5c63 => "-:-:-:-:00 \@P1 IADD.X trackI1, RZ, addressI1;\n",
+        # j6: issue the global loads; under P0 XOR-toggle the read addresses and synchronize
+        j6c47 => "-:G:D:-:00 \@P1 LDG.E.128 loadF0, [trackF];\n",
+        j6c53 => "-:G:D:-:00 \@P1 LDG.E.128 loadI0, [trackI];\n",
+        j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readIs, readIs, 4x<szShareF+szShareI>;\n",
+        j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readFs, readFs, 4x<szShareF+szShareI>;\n",
+        j6c63 => "-:-:-:-:00 \@P0 BAR.SYNC 0;\n",
+        # j7: under P0 XOR-toggle the write address and branch back to LOOP
+        j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<szShareF+szShareI>;\n",
+        j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n",
+    );
+
+    # [slot, format] of the LDS.64 emitted in every step; args: predicate, target bank, row.
+    my @ldsSlots = (
+        [ 5, "-:G:D:-:01 %s LDS.64 j%dIx0, [readIs + 4x<%d*128 + 00>];\n"],
+        [11, "-:G:D:-:01 %s LDS.64 j%dIx2, [readIs + 4x<%d*128 + 64>];\n"],
+        [17, "-:G:D:-:01 %s LDS.64 j%dIx4, [readIs + 4x<%d*128 + 16>];\n"],
+        [59, "-:G:D:-:01 %s LDS.64 j%dIx6, [readIs + 4x<%d*128 + 80>];\n"],
+        [23, "-:G:D:-:01 %s LDS.64 j%dFy0, [readFs + 4x<%d*128 + 00>];\n"],
+        [29, "-:G:D:-:01 %s LDS.64 j%dFy2, [readFs + 4x<%d*128 + 64>];\n"],
+        [35, "-:G:D:-:01 %s LDS.64 j%dFy4, [readFs + 4x<%d*128 + 32>];\n"],
+        [41, "-:G:D:-:01 %s LDS.64 j%dFy6, [readFs + 4x<%d*128 + 96>];\n"],
+    );
+
+    # (x, y) of the accumulator cx<x>y<y> updated by the c-th FFMA of a step.
+    my @ffmaOrder = (
+        [0,0],
+        [0,1],
+        [1,1],
+        [2,0],
+        [1,0],
+        [2,1],
+        [2,3],
+        [2,2],
+        [1,2],
+        [0,3],
+        [1,3],
+        [0,2],
+        [0,4],
+        [0,5],
+        [1,5],
+        [2,4],
+        [1,4],
+        [2,5],
+        [2,7],
+        [2,6],
+        [1,6],
+        [0,7],
+        [1,7],
+        [0,6],
+        [3,6],
+        [3,7],
+        [4,7],
+        [5,6],
+        [4,6],
+        [5,7],
+        [5,5],
+        [5,4],
+        [4,4],
+        [3,5],
+        [4,5],
+        [3,4],
+        [3,2],
+        [3,3],
+        [4,3],
+        [5,2],
+        [4,2],
+        [5,3],
+        [5,1],
+        [5,0],
+        [4,0],
+        [3,1],
+        [4,1],
+        [3,0],
+        [6,0],
+        [7,0],
+        [7,1],
+        [6,2],
+        [6,1],
+        [7,2],
+        [7,5],
+        [6,5],
+        [6,4],
+        [7,3],
+        [7,4],
+        [6,3],
+        [6,6],
+        [6,7],
+        [7,7],
+        [7,6],
+    );
+
+    my $asm;
+    for my $j (0 .. 7)
+    {
+        my $bank     = $j & 1;             # j<bank> registers feed this step's FFMAs
+        my $nextBank = $bank ? 0 : 1;      # the LDS below fill the other bank
+        my $nextStep = ($j + 1) % 8;       # shared-memory row read by those LDS
+        my $ldsPred  = $j == 7 ? '@P0' : '   ';
+
+        for my $slot (@ldsSlots)
+        {
+            my ($at, $fmt) = @$slot;
+            $insert{"j${j}c$at"} = sprintf $fmt, $ldsPred, $nextBank, $nextStep;
+        }
+
+        for my $c (0 .. 63)
+        {
+            my ($x, $y) = @{ $ffmaOrder[$c] };
+            my $extra   = $insert{"j${j}c$c"} || '';
+            my $phase   = $c % 6;
+
+            # Slots with nothing scheduled at c%6 == 5, or at the tail (c > 60), get a NOP.
+            if (!$extra)
+            {
+                if    ($phase == 5 || $c == 63) { $extra = "-:G:D:-:00 NOP;\n"; }
+                elsif ($c > 60)                 { $extra = "-:-:D:-:07 NOP;\n"; }
+            }
+
+            # 04 and 05 are dual issued
+            my $ctrl = ($extra || $phase == 1 || $phase == 3) ? "-:-:D:-:04" : "-:-:D:-:05";
+
+            $asm .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $bank,$x,  $bank,$y,  $x,$y,  $extra;
+        }
+    }
+    return $asm;
+
+</CODE>
+
+-:-:-:-:00 LDS.128 mpq, [RZ + addr_m];
+-:-:-:-:00 S2R tid,  SR_TID.X;
+-:-:-:-:00 S2R blkI, SR_CTAID.Z;
+-:-:-:-:00 S2R blkF, SR_CTAID.Y;
+
+// tidOX = (tid & 7) << 2 + (tid & 128) >> 1
+// tidOY = (tid & 127) >> 3
+-:-:-:-:00 LOP.AND tidOX,  tid,    7;
+-:-:-:-:00 SHL     tidOX,  tidOX,  2;
+-:-:-:-:00 LOP.AND tidOX2, tid,    128;
+-:-:-:-:00 SHR.U32 tidOX2, tidOX2, 1;
+-:-:-:-:00 LOP.OR  tidOX,  tidOX,  tidOX2;
+-:-:-:-:00 LOP.AND tidOY,  tid,    127;
+-:-:-:-:00 SHR.U32 tidOY,  tidOY,  3;
+
+-:-:-:-:00 SHL readFs, readFs, 1;
+-:-:-:-:00 SHL readIs, readIs, 1;
+-:-:-:-:00 LOP.AND readIs, readIs, 0x1ff;
+-:-:-:-:00 LOP.AND readFs, readFs, 0x0ff;
+
+// Div by 4 here collapses k stride
+// writeCs = (readKs / 4) * 128 + readNs;
+-:-:-:-:00 ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs = 4 * (tidOX + (tidOY * 128))
+-:-:-:-:00 ISCADD readCs, tidOY, tidOX, 7;
+-:-:-:-:00 SHL    readCs, readCs, 2;
+
+// n = blkI*128 + tidOX;
+-:-:-:-:00 ISCADD n, blkI, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*128 + tidOY * 4
+-:-:-:-:00 SHL    tidOY,   tidOY, 2;
+-:-:-:-:00 ISCADD k, blkF, tidOY, 7;
+
+// o = k*MPQN + m*PQN + p*QN + q*N + n
+-:-:-:-:00 IMAD      to, q, param_N,    n;
+-:-:-:-:00 IMAD.U32.U32 to, p, param_QN,   to;
+-:-:-:-:00 IMAD.U32.U32 to, m, param_PQN,  to;
+-:-:-:-:00 IMAD.U32.U32 to, k, param_MPQN, to;
+//-:-:-:-:00 LEA      Out0.CC, to, param_O[0],     2;
+//-:-:-:-:00 LEA.HI.X Out1,    to, param_O[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_O[0];
+-:-:-:-:00 MOV tmp_param1, param_O[1];
+-:-:-:-:00 SHL tmp_shl, to, 0x2;
+-:-:-:-:00 IADD Out0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X Out1, RZ, tmp_param1;
+
+-:-:-:-:00 MOV  MPQN,  param_MPQN;
+-:-:-:-:00 SHL  MPQN1, MPQN, 2;
+-:-:-:-:00 SHL  MPQN4, MPQN, 4;
+-:-:-:-:00 ISCADD MPQN60, MPQN, -MPQN4, 8;
+
+-:-:-:-:00 ISETP.LT.AND P0, PT, n, param_N, PT; // n +  0 < N
+-:-:-:-:00 IADD n, n, 32;
+-:-:-:-:00 ISETP.LT.AND P1, PT, n, param_N, PT; // n + 32 < N
+
+-:-:-:-:00 MOV alpha, param_alpha;
+
+-:-:-:-:00 BAR.SYNC 0;
+
+<CODE>
+
+    # One output row: scale the eight accumulators cx<0-7>y<y> by alpha into cs<0-7>.
+    # Every %d is filled with the row index y.
+    my $scaleRow =
+        "-:-:-:-:00 FMUL cs0, cx0y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL cs1, cx1y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL cs2, cx2y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL cs3, cx3y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL cs4, cx4y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL cs5, cx5y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL cs6, cx6y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL cs7, cx7y%d, alpha;\n";
+
+    # Row 4 is wrapped in an extra pointer/index bump: Out += MPQN60 (the carry into
+    # Out1 is emitted after the FMULs) and k += 60.
+    my $bumpBefore =
+        "-:-:-:-:00 IADD Out0.CC, Out0, MPQN60;\n" .
+        "-:-:-:-:00 IADD k, k, 60;\n";
+    my $bumpAfter =
+        "-:-:-:-:00 IADD.X Out1, Out1, RZ;\n";
+
+    # Each row ends with a blank line followed by a call to the STORE_C subroutine.
+    my $storeRow =
+        "\n" .
+        "-:-:-:-:00 CAL STORE_C;\n\n";
+
+    my $asm;
+    for my $y (0 .. 7)
+    {
+        my $row = sprintf $scaleRow, ($y) x 8;
+        if ($y == 4)
+        {
+            $row = $bumpBefore . $row . $bumpAfter;
+        }
+        $asm .= $row . $storeRow;
+    }
+    return $asm;
+
+</CODE>
+
+-:-:-:-:00 EXIT;
+
+STORE_C:
+// Subroutine (reached via CAL STORE_C): writes cs0/cs4 to Out under P2/P3, then steps k by 1 and Out by MPQN1.
+-:-:-:-:00 ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n +  0 < N
+-:-:-:-:00 ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n + 32 < N
+-:-:-:-:00 IADD k, k, 1; // k for the next call
+
+// Warp shuffle to drop the awkward readAs/readBs mapping
+// (done through shared memory: store at writeCs, reload at readCs)
+-:-:-:-:00 STS.128 [writeCs + 4x<00>], cs0;
+-:-:-:-:00 STS.128 [writeCs + 4x<32>], cs4;
+-:-:-:-:00 LDS.128 cs0, [readCs + 4x<00>];
+-:-:-:-:00 LDS.128 cs4, [readCs + 4x<32>];
+
+// Store results back to global
+-:-:-:-:00 @P2 ST.E.128 [Out + 4x<00>], cs0;
+-:-:-:-:00 @P3 ST.E.128 [Out + 4x<32>], cs4;
+// Advance the 64-bit Out pointer by MPQN1; IADD.X adds the carry of the low-word add
+-:-:-:-:00 IADD   Out0.CC, Out0, MPQN1;
+-:-:-:-:00 IADD.X Out1,    Out1, RZ;
+
+-:-:-:-:00 RET;
+
diff --git a/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.cu b/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.cu
new file mode 100644
index 0000000..fc3ff39
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.cu
@@ -0,0 +1,52 @@
+extern "C" // C linkage: the kernel is exported under its unmangled name
+__global__ void sconv_bprop_C1_N64 ( // NOTE(review): body looks like a placeholder; only param_test and param_I are used
+    float* param_test, // written below with shared[31 - tid]
+    float* param_I, // written below with shared[31 - tid]
+    const float*  param_F, // NOTE(review): sconv_bprop_C1_N64.sass maps param_E (0x150) before param_F (0x158) - confirm order
+    const float*  param_E,
+    float param_alpha,
+    int param_N,
+    int param_K,
+    int param_D,
+    int param_H,
+    int param_W,
+    int param_WN,
+    int param_HWN,
+    int param_DHWN,
+    int param_C,
+    int param_CRST,
+    int param_RST,
+    int param_magic_RST,
+    int param_shift_RST,
+    int param_RS,
+    int param_magic_RS,
+    int param_shift_RS,
+    int param_S,
+    int param_magic_S,
+    int param_shift_S,
+    int param_pad_d,
+    int param_pad_h,
+    int param_pad_w,
+    int param_str_d,
+    int param_str_h,
+    int param_str_w,
+    int param_Q,
+    int param_PQ,
+    int param_QN,
+    int param_PQN,
+    int param_MPQN,
+    int param_magic_Q,
+    int param_shift_Q,
+    int param_magic_PQ,
+    int param_shift_PQ,
+    int param_CRST8,
+    int param_MPQN8) {
+      __shared__ float shared[64 * 8 * 4 * 2];
+      // Each thread writes 1 into its own slot, then reads slot 31 - tid.
+      int tid = threadIdx.x;
+
+      shared[tid] = 1;
+      // NOTE(review): no barrier between the store above and the loads below, and 31 - tid is negative for tid > 31 - confirm launch size
+      *param_I = shared[31 - tid];
+      *param_test = shared[31 - tid];
+    }
diff --git a/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.sass b/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.sass
new file mode 100644
index 0000000..ab26e12
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.sass
@@ -0,0 +1,805 @@
+# Kernel: sconv_bprop_C1_N64
+
+// debug:
+// mode1
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//-:-:-:-:00 SHL tmp_shl, tid, 0x2;
+//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0;
+//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1;
+//-:-:-:-:00 I2F.F32.U32 rst, rst;
+//-:-:-:-:00 ST.E [tmp_param00], rst;
+//-:-:-:-:00 EXIT;
+
+// mode2
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//
+//-:-:-:-:00 MOV32I k, 0x40000000;
+//-:-:-:-:00 ST.E [tmp_param0], k;
+//-:-:-:-:00 EXIT;
+
+// modify steps:
+// IMAD->IMAD
+// shared memory addresses->RZ
+// LDG->LD
+// LEA->MOV, IADD, SHL
+// IMAD.LO2C->IMAD.U32.U32
+// IMAD.PSL->IMAD.U32.U32
+// VMAD->IMAD, IADD
+// MOV->MOV32I
+// IADD3->IADD, IADD
+// POPC
+// LOP3
+// ST.CG->ST
+// control code
+// comments
+// LDS.U->LDS
+// register<0-7>->register<0-3>, register<4-7>
+// avoid register conflicts
+
+// optimization steps:
+// alexnet2
+// initial->227
+// bank conflict->226
+// alignment+dual issue+reuse->245
+// half ldg.128->1700
+// all ldg.128->1777
+// control codes->1900
+// scheduling->1937
+// reduce unnecessary instructions->2100
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<64*8*4 + 0>
+    addr_lut : 4x<64*8*4 + 8>
+
+    param_test[0]   : c[0x0][0x140]
+    param_test[1]   : c[0x0][0x144]
+    param_I[0]      : c[0x0][0x148]
+    param_I[1]      : c[0x0][0x14c]
+    param_E[0]      : c[0x0][0x150]
+    param_E[1]      : c[0x0][0x154]
+    param_F[0]      : c[0x0][0x158]
+    param_F[1]      : c[0x0][0x15c]
+    param_alpha     : c[0x0][0x160]
+    param_N         : c[0x0][0x164]
+    param_K         : c[0x0][0x168]
+    param_D         : c[0x0][0x16c]
+    param_H         : c[0x0][0x170]
+    param_W         : c[0x0][0x174]
+    param_WN        : c[0x0][0x178]
+    param_HWN       : c[0x0][0x17c]
+    param_DHWN      : c[0x0][0x180]
+    param_C         : c[0x0][0x184]
+    param_CRST      : c[0x0][0x188]
+    param_RST       : c[0x0][0x18c]
+    param_magic_RST : c[0x0][0x190]
+    param_shift_RST : c[0x0][0x194]
+    param_RS        : c[0x0][0x198]
+    param_magic_RS  : c[0x0][0x19c]
+    param_shift_RS  : c[0x0][0x1a0]
+    param_S         : c[0x0][0x1a4]
+    param_magic_S   : c[0x0][0x1a8]
+    param_shift_S   : c[0x0][0x1ac]
+    param_pad_d     : c[0x0][0x1b0]
+    param_pad_h     : c[0x0][0x1b4]
+    param_pad_w     : c[0x0][0x1b8]
+    param_str_d     : c[0x0][0x1bc]
+    param_str_h     : c[0x0][0x1c0]
+    param_str_w     : c[0x0][0x1c4]
+    param_Q         : c[0x0][0x1c8]
+    param_PQ        : c[0x0][0x1cc]
+    param_QN        : c[0x0][0x1d0]
+    param_PQN       : c[0x0][0x1d4]
+    param_MPQN      : c[0x0][0x1d8]
+    param_magic_Q   : c[0x0][0x1dc]
+    param_shift_Q   : c[0x0][0x1e0]
+    param_magic_PQ  : c[0x0][0x1e4]
+    param_shift_PQ  : c[0x0][0x1e8]
+    param_CRST8     : c[0x0][0x1ec]
+    param_MPQN8     : c[0x0][0x1f0]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    0-63 : czero<00-63>
+
+   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
+   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
+   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
+   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
+   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
+  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
+  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
+  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7
+
+      64-67 ~ blkE, blkF, blkMPQ
+
+     68-95 ~ k<0|4>, tidX, tid1, pq, m, p, q, crst, crst1, crst2, crst3, n, n32, tf<0|4>, te, te<0|4>
+
+      64-67 : j0Fy<0-3>
+      68-71 : j0Ex<0-3>
+      72-75 : j0Fy<4-7>
+      76-79 : j0Ex<4-7>
+      80-83 : j1Fy<0-3>
+      84-87 : j1Ex<0-3>
+      88-91 : j1Fy<4-7>
+      92-95 : j1Ex<4-7>
+
+      96-99 : load0F<0-3>
+    100-103 : load4F<0-3>
+    104-107 : load0E<0-3>
+    108-111 : load0E<4-7>
+    112-115 : load4E<0-3>
+    116-119 : load4E<4-7>
+
+    120-123 : track0F<0-1>, track4F<0-1>
+    124-127 : track0E<0-1>, track4E<0-1>
+
+    128-131 ~ writeEs, writeFs, swapBuf, K
+    132-136 ~ readEs, readFs, mt, pr, qs
+    137-142 : tmp_data, tmp_shl, tmp_param0, tmp_param1, p_and, tid 
+    144-145 : tmp_param0<0-1>
+
+     68-71  ~ lutStore, sliceI
+     72-132 ~ warp_cnt, rst, rs, t, r, s, x, y, z, x0, xW, y0, yH, z0, zD
+
+     72-89  : c<0-7>, trackI<0-1>, track00I<0-1>, track04I<0-1>, track08I<0-1>, track12I<0-1>
+     90-132 ~ crst<00|04|08|12>, c<00|04|08|12>, lut<00|04|08|12>, chan<00|04|08|12>, img<00|04|08|12>, writeCs, readCs, RST, DHWN1, alpha, nn, tid31
+
+</REGISTER_MAPPING>
+
+-:-:-:-:00 S2R tid,    SR_TID.X; // 0 : 1 : 31
+-:-:-:-:00 S2R blkMPQ, SR_CTAID.X; // m, p, q
+-:-:-:-:00 S2R blkF,   SR_CTAID.Y; // crst
+-:-:-:-:00 S2R blkE,   SR_CTAID.Z; // N
+
+// [4][3][2][1][0]
+// tidX = (tid & 7) << 2
+// tidX = 0 : 4 : 28
+// k0 = tid >> 3
+// k0 = 0 : 1 : 3
+// k4 = 4 : 1 : 7
+-:-:-:-:00 LOP.AND tidX, tid,  7;
+-:-:-:-:00 SHL     tidX, tidX, 2;
+-:-:-:-:00 SHR.U32 k0,   tid,  3;
+-:-:-:-:00 IADD    k4,   k0,   4;
+
+-:-:-:-:00 MOV K, param_K;
+
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+<CODE>
+    return join('', map { sprintf "-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", 4 * $_ } 0 .. 15);
+</CODE>
+
+// m  = blkMPQ / PQ
+// pq = blkMPQ % PQ
+-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ;
+-:-:-:-:00 SHR.U32   m, m,      param_shift_PQ;
+-:-:-:-:00 IMAD pq,  m, param_PQ, RZ;
+-:-:-:-:00 IADD pq, -pq, blkMPQ;
+// p = pq / Q
+// q = pq % Q
+-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ;
+-:-:-:-:00 SHR.U32   p, p,  param_shift_Q;
+-:-:-:-:00 IMAD  q,  p, param_Q, RZ;
+-:-:-:-:00 IADD  q, -q, pq;
+
+// mt = m * str_d - pad_d
+// pr = p * str_h - pad_h
+// qs = q * str_w - pad_w
+-:-:-:-:00 IMAD mt, m,   param_str_d, RZ;
+-:-:-:-:00 IMAD pr, p,   param_str_h, RZ;
+-:-:-:-:00 IMAD qs, q,   param_str_w, RZ;
+-:-:-:-:00 IADD mt, mt, -param_pad_d;
+-:-:-:-:00 IADD pr, pr, -param_pad_h;
+-:-:-:-:00 IADD qs, qs, -param_pad_w;
+
+// crst = blkF * 32 + tidX
+// n    = blkE * 64 + tidX
+-:-:-:-:00 ISCADD crst, blkF, tidX, 5;
+-:-:-:-:00 IADD crst1, crst, 1;
+-:-:-:-:00 IADD crst2, crst, 2;
+-:-:-:-:00 IADD crst3, crst, 3;
+-:-:-:-:00 ISCADD n, blkE, tidX, 6;
+-:-:-:-:00 IADD   n32, n, 32;
+
+// trackF = k * CRST + crst
+// k0 = 0 : 1 : 3
+// k4 = 4 : 1 : 7
+// tf0 = k0 * CRST + crst
+// tf4 = k4 * CRST + crst
+-:-:-:-:00 IMAD tf0, k0, param_CRST, crst;
+-:-:-:-:00 IMAD tf4, k4, param_CRST, crst;
+
+//-:-:-:-:00 LEA      track0F0.CC, tf0, param_F[0],     2;
+//-:-:-:-:00 LEA.HI.X track0F1,    tf0, param_F[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_F[0];
+-:-:-:-:00 MOV tmp_param1, param_F[1];
+-:-:-:-:00 SHL tmp_shl, tf0, 0x2;
+-:-:-:-:00 IADD track0F0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X track0F1, RZ, tmp_param1;
+
+//-:-:-:-:00 LEA      track4F0.CC, tf4, param_F[0],     2;
+//-:-:-:-:00 LEA.HI.X track4F1,    tf4, param_F[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_F[0];
+-:-:-:-:00 MOV tmp_param1, param_F[1];
+-:-:-:-:00 SHL tmp_shl, tf4, 0x2;
+-:-:-:-:00 IADD track4F0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X track4F1, RZ, tmp_param1;
+
+// trackE = k * MPQN + m * PQN + p * QN + q * N + n
+-:-:-:-:00 IMAD te, q, param_N, n;
+-:-:-:-:00 IMAD.U32.U32 te, p, param_QN, te;
+-:-:-:-:00 IMAD.U32.U32 te, m, param_PQN, te;
+-:-:-:-:00 IMAD.U32.U32 te0, k0, param_MPQN, te;
+-:-:-:-:00 IMAD.U32.U32 te4, k4, param_MPQN, te;
+//-:-:-:-:00 LEA       track0E0.CC, te0, param_E[0],     2;
+//-:-:-:-:00 LEA.HI.X  track0E1,    te0, param_E[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_E[0];
+-:-:-:-:00 MOV tmp_param1, param_E[1];
+-:-:-:-:00 SHL tmp_shl, te0, 0x2;
+-:-:-:-:00 IADD track0E0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X track0E1, RZ, tmp_param1;
+//-:-:-:-:00 LEA       track4E0.CC, te4, param_E[0],     2;
+//-:-:-:-:00 LEA.HI.X  track4E1,    te4, param_E[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_E[0];
+-:-:-:-:00 MOV tmp_param1, param_E[1];
+-:-:-:-:00 SHL tmp_shl, te4, 0x2;
+-:-:-:-:00 IADD track4E0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X track4E1, RZ, tmp_param1;
+
+// P1 = crst + 3 < CRST (set further down, together with P4-P6)
+// P2 = n < N
+// P3 = n + 32 < N
+-:-:-:-:00 ISETP.LT.AND P2, PT, n,    param_N,    PT;
+-:-:-:-:00 ISETP.LT.AND P3, PT, n32,  param_N,    PT;
+
+// writeFs = (32 * k + tidX) * 4
+// tidX = 0 : 4 : 28
+// k    = 0 : 1 : 3
+// -------------
+// -------------
+// -------------
+// ------------- k * 32
+// ------ tidX
+-:-:-:-:00 ISCADD  writeFs, k0, tidX, 5;
+-:-:-:-:00 SHL     writeFs, writeFs,  2;
+// writeEs = (64 * k + tidX) * 4 + 32 * 8 * 4
+// tidX = 0 : 4 : 28
+// k    = 0 : 1 : 3
+// -------------
+// -------------
+// -------------
+// ------------- k * 64
+// ------ tidX
+-:-:-:-:00 ISCADD  writeEs, k0, tidX, 6;
+-:-:-:-:00 ISCADD  writeEs, writeEs, 4x<32*8>, 2;
+
+// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4;
+// readFs = [4][0] * 4
+-:-:-:-:00 LOP.AND tid1,   tid,    1;
+-:-:-:-:00 LOP.AND readFs, tid,   -16;
+-:-:-:-:00 SHR.U32 readFs, readFs, 3;
+-:-:-:-:00 LOP.OR  readFs, readFs, tid1;
+-:-:-:-:00 SHL     readFs, readFs, 4;
+
+// readEs = ((tid >> 1) & 7) << 4
+// readEs = [3][2][1] * 4
+-:-:-:-:00 BFE.U32 readEs, tid,    0x301; // 3 bits at position 1
+-:-:-:-:00 ISCADD  readEs, readEs, 4x<32*8>, 4;
+
+-:-:-:-:00 MOV32I swapBuf, 4x<32*8 + 64*8>;
+
+-:-:-:-:00 IADD K, K, -8;
+
+// CRST
+// load0F0-load0F3
+-:-:-:-:00 ISETP.LT.AND P4, PT, crst, param_CRST, PT;
+-:-:-:-:00 ISETP.LT.AND P5, PT, crst1, param_CRST, PT;
+-:-:-:-:00 ISETP.LT.AND P6, PT, crst2, param_CRST, PT;
+-:-:-:-:00 ISETP.LT.AND P1, PT, crst3, param_CRST, PT;
+
+-:-:-:-:00 @P4 LD.E.CI load0F0, [track0F + 4x<0>];
+-:-:-:-:00 @P5 LD.E.CI load0F1, [track0F + 4x<1>];
+-:-:-:-:00 @P6 LD.E.CI load0F2, [track0F + 4x<2>];
+-:-:-:-:00 @P1 LD.E.CI load0F3, [track0F + 4x<3>];
+-:-:-:-:00 @!P4 LDS.32 load0F0, [RZ + addr_zero];
+-:-:-:-:00 @!P5 LDS.32 load0F1, [RZ + addr_zero];
+-:-:-:-:00 @!P6 LDS.32 load0F2, [RZ + addr_zero];
+-:-:-:-:00 @!P1 LDS.32 load0F3, [RZ + addr_zero];
+
+-:-:-:-:00 @P4 LD.E.CI load4F0, [track4F + 4x<0>];
+-:-:-:-:00 @P5 LD.E.CI load4F1, [track4F + 4x<1>];
+-:-:-:-:00 @P6 LD.E.CI load4F2, [track4F + 4x<2>];
+-:-:-:-:00 @P1 LD.E.CI load4F3, [track4F + 4x<3>];
+-:-:-:-:00 @!P4 LDS.32 load4F0, [RZ + addr_zero];
+-:-:-:-:00 @!P5 LDS.32 load4F1, [RZ + addr_zero];
+-:-:-:-:00 @!P6 LDS.32 load4F2, [RZ + addr_zero];
+-:-:-:-:00 @!P1 LDS.32 load4F3, [RZ + addr_zero];
+
+// N
+// load0E0-load0E3
+-:-:-:-:00 @P2 LD.E.128 load0E0, [track0E + 4x< 0>];
+-:-:-:-:00 @P3 LD.E.128 load0E4, [track0E + 4x<32>];
+-:-:-:-:00 @P2 LD.E.128 load4E0, [track4E + 4x< 0>];
+-:-:-:-:00 @P3 LD.E.128 load4E4, [track4E + 4x<32>];
+
+-:-:-:-:00 ISETP.GT.AND P2, PT, K, RZ, P2;
+-:-:-:-:00 ISETP.GT.AND P3, PT, K, RZ, P3;
+
+-:-:-:-:00 STS.128 [writeFs + 4x<0*32>], load0F;
+-:-:-:-:00 IADD   track0F0.CC, track0F0, param_CRST8;
+-:-:-:-:00 IADD.X track0F1,    track0F1, RZ;
+
+-:-:-:-:00 STS.128 [writeFs + 4x<4*32>], load4F;
+-:-:-:-:00 IADD   track4F0.CC, track4F0, param_CRST8;
+-:-:-:-:00 IADD.X track4F1,    track4F1, RZ;
+// mode1
+// -:-:-:-:00 MOV tmp_param0, param_test[0];
+// -:-:-:-:00 MOV tmp_param1, param_test[1];
+// -:-:-:-:00 SHL tmp_shl, tid, 0x2;
+// -:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0;
+// -:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1;
+// -:-:-:-:00 MOV tmp_data, param_CRST8;
+// -:-:-:-:00 I2F.F32.U32 tmp_data, tmp_data;
+// -:-:-:-:00 ST.E [tmp_param00], tmp_data;
+// -:-:-:-:00 EXIT;
+
+-:-:-:-:00 STS.128 [writeEs + 4x<0*64 +  0>], load0E0;
+-:-:-:-:00 STS.128 [writeEs + 4x<0*64 + 32>], load0E4;
+-:-:-:-:00 IADD   track0E0.CC, track0E0, param_MPQN8;
+-:-:-:-:00 IADD.X track0E1,    track0E1, RZ;
+
+-:-:-:-:00 STS.128 [writeEs + 4x<4*64 +  0>], load4E0;
+-:-:-:-:00 STS.128 [writeEs + 4x<4*64 + 32>], load4E4;
+-:-:-:-:00 IADD   track4E0.CC, track4E0, param_MPQN8;
+-:-:-:-:00 IADD.X track4E1,    track4E1, RZ;
+
+-:-:-:-:00 IADD writeEs, writeEs, swapBuf;
+-:-:-:-:00 IADD writeFs, writeFs, swapBuf;
+-:-:-:-:00 IADD swapBuf, RZ, -swapBuf;
+
+-:-:-:-:00 IADD K, K, -8;
+
+-:-:-:-:00 LDS.128 j0Ex0, [readEs + 4x<0*64 + 00>];
+-:-:-:-:00 LDS.128 j0Fy0, [readFs + 4x<0*32 + 00>];
+-:-:-:-:00 LDS.128 j0Ex4, [readEs + 4x<0*64 + 32>];
+-:-:-:-:00 LDS.128 j0Fy4, [readFs + 4x<0*32 + 16>];
+
+-:-:-:-:00 @P4 LD.E.CI load0F0, [track0F + 4x<0>];
+-:-:-:-:00 @P5 LD.E.CI load0F1, [track0F + 4x<1>];
+-:-:-:-:00 @P6 LD.E.CI load0F2, [track0F + 4x<2>];
+-:-:-:-:00 @P1 LD.E.CI load0F3, [track0F + 4x<3>];
+-:-:-:-:00 @!P4 LDS.32 load0F0, [RZ + addr_zero];
+-:-:-:-:00 @!P5 LDS.32 load0F1, [RZ + addr_zero];
+-:-:-:-:00 @!P6 LDS.32 load0F2, [RZ + addr_zero];
+-:-:-:-:00 @!P1 LDS.32 load0F3, [RZ + addr_zero];
+
+-:-:-:-:00 @P4 LD.E.CI load4F0, [track4F + 4x<0>];
+-:-:-:-:00 @P5 LD.E.CI load4F1, [track4F + 4x<1>];
+-:-:-:-:00 @P6 LD.E.CI load4F2, [track4F + 4x<2>];
+-:-:-:-:00 @P1 LD.E.CI load4F3, [track4F + 4x<3>];
+-:-:-:-:00 @!P4 LDS.32 load4F0, [RZ + addr_zero];
+-:-:-:-:00 @!P5 LDS.32 load4F1, [RZ + addr_zero];
+-:-:-:-:00 @!P6 LDS.32 load4F2, [RZ + addr_zero];
+-:-:-:-:00 @!P1 LDS.32 load4F3, [RZ + addr_zero];
+
+-:-:-:-:00 @P2 LD.E.128 load0E0, [track0E + 4x< 0>];
+-:-:-:-:00 @P3 LD.E.128 load0E4, [track0E + 4x<32>];
+-:-:-:-:00 @P2 LD.E.128 load4E0, [track4E + 4x< 0>];
+-:-:-:-:00 @P3 LD.E.128 load4E4, [track4E + 4x<32>];
+
+-:-:-:-:00 ISETP.GT.AND P2, PT, K, RZ, P2;
+-:-:-:-:00 ISETP.GT.AND P3, PT, K, RZ, P3;
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+
+NEXT_8K:
+-:-:-:-:00 ISETP.GT.AND P0, PT, K, -8, PT;
+<CODE>
+    my %insert = # Extra instructions keyed "j<J>c<C>": each is emitted right after FFMA number C of unroll step J.
+    (
+        j0c47  => "-:-:-:-:00 IADD K, K, -8;\n",
+        j0c53 => "-:-:-:-:00 \@P0 STS.128 [writeFs + 4x<0*32>], load0F;\n",
+        j0c61 => "-:-:-:-:00 \@P0 IADD   track0F0.CC, track0F0, param_CRST8;\n",
+        j0c62 => "-:-:-:-:00 \@P0 IADD.X track0F1,    track0F1, RZ;\n",
+        j0c63 => "-:-:-:-:00 \@P4 LD.E.CI load0F0, [track0F + 4x<0>];\n",
+
+        j1c47 => "-:-:-:-:00 \@P5 LD.E.CI load0F1, [track0F + 4x<1>];\n",
+        j1c53 => "-:-:-:-:00 \@P6 LD.E.CI load0F2, [track0F + 4x<2>];\n",
+        j1c61 => "-:-:-:-:00 \@P1 LD.E.CI load0F3, [track0F + 4x<3>];\n",
+        j1c62 => "-:-:-:-:00 \@!P4 LDS.32 load0F0, [RZ + addr_zero];\n",
+        j1c63 => "-:-:-:-:00 \@!P5 LDS.32 load0F1, [RZ + addr_zero];\n",
+        
+        j2c47 => "-:-:-:-:00 \@!P6 LDS.32 load0F2, [RZ + addr_zero];\n",
+        j2c53 => "-:-:-:-:00 \@!P1 LDS.32 load0F3, [RZ + addr_zero];\n",
+        j2c61 => "-:-:-:-:00 \@P0 STS.128 [writeFs + 4x<4*32>], load4F;\n",
+        j2c62 => "-:-:-:-:00 \@P0 IADD   track4F0.CC, track4F0, param_CRST8;\n",
+        j2c63 => "-:-:-:-:00 \@P0 IADD.X track4F1,    track4F1, RZ;\n",
+
+        j3c47 => "-:-:-:-:00 \@P4 LD.E.CI load4F0, [track4F + 4x<0>];\n",
+        j3c53 => "-:-:-:-:00 \@P5 LD.E.CI load4F1, [track4F + 4x<1>];\n",
+        j3c61 => "-:-:-:-:00 \@P6 LD.E.CI load4F2, [track4F + 4x<2>];\n",
+        j3c62 => "-:-:-:-:00 \@P1 LD.E.CI load4F3, [track4F + 4x<3>];\n",
+        j3c63 => "-:-:-:-:00 \@!P4 LDS.32 load4F0, [RZ + addr_zero];\n",
+        
+        j4c47 => "-:-:-:-:00 \@!P5 LDS.32 load4F1, [RZ + addr_zero];\n",
+        j4c53 => "-:-:-:-:00 \@!P6 LDS.32 load4F2, [RZ + addr_zero];\n",
+        j4c61 => "-:-:-:-:00 \@!P1 LDS.32 load4F3, [RZ + addr_zero];\n",
+        j4c62 => "-:-:-:-:00 \@P0 STS.128 [writeEs + 4x<0*64 +  0>], load0E0;\n",
+        j4c63 => "-:-:-:-:00 \@P0 STS.128 [writeEs + 4x<0*64 + 32>], load0E4;\n",
+
+        j5c47 => "-:-:-:-:00 \@P0 IADD   track0E0.CC, track0E0, param_MPQN8;\n",
+        j5c53 => "-:-:-:-:00 \@P0 IADD.X track0E1,    track0E1, RZ;\n",
+        j5c61 => "-:-:-:-:00 \@P2 LD.E.128 load0E0, [track0E + 4x< 0>];\n",
+        j5c62 => "-:-:-:-:00 \@P3 LD.E.128 load0E4, [track0E + 4x<32>];\n",
+        j5c63 => "-:-:-:-:00 \@P0 STS.128 [writeEs + 4x<4*64 +  0>], load4E0;\n",
+
+        j6c47 => "-:-:-:-:00 \@P0 STS.128 [writeEs + 4x<4*64 + 32>], load4E4;\n",
+        j6c53 => "-:-:-:-:00 \@P0 IADD   track4E0.CC, track4E0, param_MPQN8;\n",
+        j6c61 => "-:-:-:-:00 \@P0 IADD.X track4E1,    track4E1, RZ;\n",
+        j6c62 => "-:-:-:-:00 \@P2 LD.E.128 load4E0, [track4E + 4x< 0>];\n",
+        j6c63 => "-:-:-:-:00 \@P3 LD.E.128 load4E4, [track4E + 4x<32>];\n". # then the P0-guarded swap: read pointers -= swapBuf, write pointers += swapBuf, swapBuf negated
+                 "-:-:-:-:00 \@P0 IADD readEs,  readEs, -swapBuf;\n" .
+                 "-:-:-:-:00 \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                 "-:-:-:-:00 \@P0 IADD writeEs, writeEs, swapBuf;\n" .
+                 "-:-:-:-:00 \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                 "-:-:-:-:00 \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Step 7: AND the load predicates P4-P6/P1 with K > 0, set P2/P3 to K > 0, and branch back to NEXT_8K while P0 holds.
+        j7c47 => "-:-:-:-:00 ISETP.GT.AND P4, PT, K, RZ, P4;\n",
+        j7c53 => "-:-:-:-:00 ISETP.GT.AND P5, PT, K, RZ, P5;\n",
+        j7c61 => "-:-:-:-:00 ISETP.GT.AND P6, PT, K, RZ, P6;\n",
+        j7c62 => "-:-:-:-:00 ISETP.GT.AND P1, PT, K, RZ, P1;\n",
+        j7c63 => "-:-:-:-:00 ISETP.GT.AND P2, PT, K, RZ, PT;\n".
+                 "-:-:-:-:00 ISETP.GT.AND P3, PT, K, RZ, PT;\n".
+                 "-:-:-:-:00 \@P0 BRA.U NEXT_8K;\n",
+    );
+    # FFMA issue order: 64 [x, y] pairs that visit every cx<x>y<y> accumulator (x, y in 0..7) exactly once.
+    my @cOrder;
+
+    push  @cOrder, [0,0];
+    push  @cOrder, [0,1];
+    push  @cOrder, [1,1];
+    push  @cOrder, [2,0];
+    push  @cOrder, [1,0];
+    push  @cOrder, [2,1];
+    push  @cOrder, [2,3];
+    push  @cOrder, [2,2];
+    push  @cOrder, [1,2];
+    push  @cOrder, [0,3];
+    push  @cOrder, [1,3];
+    push  @cOrder, [0,2];
+    push  @cOrder, [0,4];
+    push  @cOrder, [0,5];
+    push  @cOrder, [1,5];
+    push  @cOrder, [2,4];
+    push  @cOrder, [1,4];
+    push  @cOrder, [2,5];
+    push  @cOrder, [2,7];
+    push  @cOrder, [2,6];
+    push  @cOrder, [1,6];
+    push  @cOrder, [0,7];
+    push  @cOrder, [1,7];
+    push  @cOrder, [0,6];
+    push  @cOrder, [3,6];
+    push  @cOrder, [3,7];
+    push  @cOrder, [4,7];
+    push  @cOrder, [5,6];
+    push  @cOrder, [4,6];
+    push  @cOrder, [5,7];
+    push  @cOrder, [5,5];
+    push  @cOrder, [5,4];
+    push  @cOrder, [4,4];
+    push  @cOrder, [3,5];
+    push  @cOrder, [4,5];
+    push  @cOrder, [3,4];
+    push  @cOrder, [3,2];
+    push  @cOrder, [3,3];
+    push  @cOrder, [4,3];
+    push  @cOrder, [5,2];
+    push  @cOrder, [4,2];
+    push  @cOrder, [5,3];
+    push  @cOrder, [5,1];
+    push  @cOrder, [5,0];
+    push  @cOrder, [4,0];
+    push  @cOrder, [3,1];
+    push  @cOrder, [4,1];
+    push  @cOrder, [3,0];
+    push  @cOrder, [6,0];
+    push  @cOrder, [7,0];
+    push  @cOrder, [7,1];
+    push  @cOrder, [6,2];
+    push  @cOrder, [6,1];
+    push  @cOrder, [7,2];
+    push  @cOrder, [7,5];
+    push  @cOrder, [6,5];
+    push  @cOrder, [6,4];
+    push  @cOrder, [7,3];
+    push  @cOrder, [7,4];
+    push  @cOrder, [6,3];
+    push  @cOrder, [6,6];
+    push  @cOrder, [6,7];
+    push  @cOrder, [7,7];
+    push  @cOrder, [7,6]; 
+    # Emit 8 unroll steps of 64 FFMAs each, splicing every extra instruction in behind its FFMA.
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1; # register set (j0* or j1*) read by this step's FFMAs
+        my $nOdd     = !$odd + 0; # the other set, written by this step's LDS.64 loads
+        my $rsOffset = ($j + 1) % 8; # index used in the LDS addresses; wraps to 0 after step 7
+        my $rsPred   = $j == 7 ? '@P0' : '   '; # only the final step's loads are predicated on P0
+        # LDS.64 loads into the other register set, placed after FFMAs 5, 11, 17, 23, 29, 35, 41 and 59.
+        $insert{"j${j}c5"}  = sprintf "-:G:D:-:01 %s LDS.64 j%dEx0, [readEs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c11"} = sprintf "-:G:D:-:01 %s LDS.64 j%dEx2, [readEs + 4x<%d*64 + 2>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c17"} = sprintf "-:G:D:-:01 %s LDS.64 j%dEx4, [readEs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c59"} = sprintf "-:G:D:-:01 %s LDS.64 j%dEx6, [readEs + 4x<%d*64 + 34>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c23"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy0, [readFs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c29"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy2, [readFs + 4x<%d*32 + 2>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c35"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy4, [readFs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c41"} = sprintf "-:G:D:-:01 %s LDS.64 j%dFy6, [readFs + 4x<%d*32 + 18>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $ctrl   = "-:-:-:-:00"; # placeholder; every branch of the if/else below overwrites it
+            # Fallback: slots 5, 11, ..., 59 and 63 get a NOP when empty. With the table above these slots are always filled.
+            if ((($c - 5) % 6 == 0 || $c == 63) && !$ins) {
+              $ins = "-:G:D:-:00 NOP;\n";   
+            }
+            # Fallback: any still-empty slot after 60 gets a -:-:D:-:07 NOP. Slots 61..63 are always filled above.
+            if ($c > 60 && !$ins){
+              $ins = "-:-:D:-:07 NOP;\n";
+            }
+
+            # 04 and 05 are dual issued
+            if($ins) { # an FFMA that is followed by an extra instruction always gets -:-:D:-:04
+              $ctrl = "-:-:D:-:04";
+            } else {
+              if(($c - 1) % 6 == 0 || ($c - 3) % 6 == 0){ # FFMA slots where c % 6 is 1 or 3
+                $ctrl = "-:-:D:-:04";
+              }
+              else{
+                $ctrl = "-:-:D:-:05";
+              }
+            }
+
+            $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+-:-:-:-:00 MOV32I warp_cnt, 32; // loop bound counter, +32 per LUT_LOOP pass
+-:-:-:-:00 S2R tid,  SR_TID.X;
+-:-:-:-:00 S2R blkF, SR_CTAID.Y;
+-:-:-:-:00 S2R blkE, SR_CTAID.Z;
+-:-:-:-:00 MOV rst,  tid; // first lookup-table index handled by this thread
+// Fill the shared lookup table: entry rst holds the byte offset of position (z, y, x), or -1 when it is out of bounds.
+LUT_LOOP:
+
+// warp synchronous loop while warp_cnt < RST (c=0)
+-:-:-:-:00 ISETP.LT.AND P0, PT, warp_cnt, param_RST, PT;
+-:-:-:-:00 IADD warp_cnt, warp_cnt, 32;
+// t =  rst / RS
+// rs = rst % RS
+-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, RZ;
+-:-:-:-:00 SHR.U32   t, t, param_shift_RS;
+-:-:-:-:00 IMAD  rs, t, param_RS, RZ;
+-:-:-:-:00 IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ;
+-:-:-:-:00 SHR.U32   r, r, param_shift_S;
+-:-:-:-:00 IMAD s, r, param_S, RZ;
+-:-:-:-:00 IADD s, -s, rs;
+// x = qs + s
+// y = pr + r
+// z = mt + t
+-:-:-:-:00 IADD z, mt, t;
+-:-:-:-:00 IADD y, pr, r;
+-:-:-:-:00 IADD x, qs, s;
+// i = (z*HWN + y*WN + x*N) * 4
+-:-:-:-:00 IMAD.U32.U32 sliceI, z, param_HWN, RZ;
+-:-:-:-:00 IMAD.U32.U32 sliceI, y, param_WN,  sliceI;
+-:-:-:-:00 IMAD sliceI, x, param_N,   sliceI;
+-:-:-:-:00 SHL  sliceI, sliceI, 2;
+// Bounds check x and y, and make i negative if outside
+-:-:-:-:00 ISET.LT.AND x0, x, RZ, PT;
+-:-:-:-:00 ISET.GE.AND xW, x,  param_W, PT;
+-:-:-:-:00 ISET.LT.AND y0, y, RZ, PT;
+-:-:-:-:00 ISET.GE.AND yH, y,  param_H, PT;
+-:-:-:-:00 ISET.LT.AND z0, z, RZ, PT;
+-:-:-:-:00 ISET.GE.AND zD, z,  param_D, PT;
+// if x0 || xW || y0 || yH || z0 || zD then sliceI = -1
+//-:-:-:-:00 LOP3.LUT sliceI, sliceI, x0, xW, 0xfe;
+//-:-:-:-:00 LOP3.LUT sliceI, sliceI, y0, yH, 0xfe;
+//-:-:-:-:00 LOP3.LUT sliceI, sliceI, z0, zD, 0xfe;
+-:-:-:-:00 LOP.OR tmp_data, x0, xW; // LOP.OR chain standing in for the commented-out LOP3.LUT lines above
+-:-:-:-:00 LOP.OR tmp_data, tmp_data, y0;
+-:-:-:-:00 LOP.OR tmp_data, tmp_data, yH;
+-:-:-:-:00 LOP.OR tmp_data, tmp_data, z0;
+-:-:-:-:00 LOP.OR tmp_data, tmp_data, zD;
+-:-:-:-:00 LOP.OR sliceI, tmp_data, sliceI;
+// lutStore = rst * 4 is this entry's byte offset in the table; rst then moves on by 32 for the next pass
+-:-:-:-:00 SHL lutStore, rst, 2;
+-:-:-:-:00 IADD rst, rst, 32;
+// Store i imgOffset into the shared lookup table
+-:-:-:-:00 STS [lutStore + addr_lut], sliceI;
+
+-:-:-:-:00 @P0 BRA.U LUT_LOOP;
+// Register copies: RST = param_RST, DHWN1 = param_DHWN * 4
+-:-:-:-:00 MOV RST,   param_RST;
+-:-:-:-:00 MOV DHWN1, param_DHWN;
+-:-:-:-:00 SHL DHWN1, DHWN1, 2;
+
+-:-:-:-:00 LOP.AND readEs, readEs, 0x7f;
+-:-:-:-:00 LOP.AND readFs, readFs, 0x3f;
+
+// writeCs = (readFs / 4) * 64 + readEs, i.e. (readFs << 4) + readEs
+-:-:-:-:00 ISCADD  writeCs, readFs, readEs, 4;
+
+// readCs = (tid & 31) << 2;
+-:-:-:-:00 LOP.AND tid31,  tid,   31;
+-:-:-:-:00 SHL     readCs, tid31, 2;
+
+// nn = blkE * 64 + tid31;
+-:-:-:-:00 ISCADD nn, blkE, tid31, 6;
+
+// crst = blkF * 32
+-:-:-:-:00 SHL  crst00, blkF,   5;
+-:-:-:-:00 IADD crst04, crst00, 4;
+-:-:-:-:00 IADD crst08, crst00, 8;
+-:-:-:-:00 IADD crst12, crst00, 12;
+// trackI = param_I + nn * 4 as a 64-bit add with carry (stands in for the LEA pair commented out below)
+// -:-:-:-:00 LEA      trackI0.CC, nn, param_I[0],     2;
+// -:-:-:-:00 LEA.HI.X trackI1,    nn, param_I[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_I[0];
+-:-:-:-:00 MOV tmp_param1, param_I[1];
+-:-:-:-:00 SHL tmp_shl, nn, 0x2;
+-:-:-:-:00 IADD trackI0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackI1, RZ, tmp_param1;
+
+// P5 = nn < N, P6 = nn + 32 < N
+-:-:-:-:00 ISETP.LT.AND P5, PT, nn, param_N, PT;
+-:-:-:-:00 IADD nn, nn, 32;
+-:-:-:-:00 ISETP.LT.AND P6, PT, nn, param_N, PT;
+
+-:-:-:-:00 MOV alpha, param_alpha;
+
+<CODE>
+
+    # Generate the write-out sequence for the 8x8 accumulator tile: for each
+    # row y, scale cx0y<y>..cx7y<y> by alpha into c0..c7 and call STORE_C.
+    # Returns the generated assembly text as one string.
+    my @asm;
+    for my $row (0 .. 7)
+    {
+        # Before row 4, every crst row index is moved on by a further 12.
+        if ($row == 4)
+        {
+            push @asm, "-:-:-:-:00 IADD crst$_, crst$_, 12;\n"
+                for qw(00 04 08 12);
+        }
+
+        # c<x> = cx<x>y<row> * alpha for x = 0..7
+        push @asm, "-:-:-:-:00 FMUL c$_, cx${_}y$row, alpha;\n"
+            for 0 .. 7;
+
+        # The extra newline leaves a blank line between rows in the output.
+        push @asm, "-:-:-:-:00 CAL STORE_C;\n\n";
+    }
+
+    # Concatenate in emission order.
+    return join '', @asm;
+
+</CODE>
+
+-:-:-:-:00 EXIT;
+
+STORE_C:
+// Subroutine: exchange c0-c7 between threads, then add them to up to eight addresses with RED.E.ADD.F32.
+// Exchange results through shared memory (STS then LDS) to drop the awkward readEs/readFs mapping
+-:-:-:-:00 STS.128 [writeCs+4x<00>], c0;
+-:-:-:-:00 STS.128 [writeCs+4x<32>], c4;
+
+-:-:-:-:00 LDS c0, [readCs + 4x<0*64 + 00>];
+-:-:-:-:00 LDS c1, [readCs + 4x<0*64 + 32>];
+-:-:-:-:00 LDS c2, [readCs + 4x<1*64 + 00>];
+-:-:-:-:00 LDS c3, [readCs + 4x<1*64 + 32>];
+-:-:-:-:00 LDS c4, [readCs + 4x<2*64 + 00>];
+-:-:-:-:00 LDS c5, [readCs + 4x<2*64 + 32>];
+-:-:-:-:00 LDS c6, [readCs + 4x<3*64 + 00>];
+-:-:-:-:00 LDS c7, [readCs + 4x<3*64 + 32>];
+// P0-P3 = (crstNN < CRST) && P5, where P5 was set to nn < N before the first call
+-:-:-:-:00 ISETP.LT.AND P0, PT, crst00, param_CRST, P5;
+-:-:-:-:00 ISETP.LT.AND P1, PT, crst04, param_CRST, P5;
+-:-:-:-:00 ISETP.LT.AND P2, PT, crst08, param_CRST, P5;
+-:-:-:-:00 ISETP.LT.AND P3, PT, crst12, param_CRST, P5;
+
+// c00 = crst00 / RST
+// lut00 = crst00 % RST
+-:-:-:-:00 IMAD.U32.U32 c00, crst00, param_magic_RST, RZ;
+-:-:-:-:00 IMAD.U32.U32 c04, crst04, param_magic_RST, RZ;
+-:-:-:-:00 IMAD.U32.U32 c08, crst08, param_magic_RST, RZ;
+-:-:-:-:00 IMAD.U32.U32 c12, crst12, param_magic_RST, RZ;
+
+-:-:-:-:00 SHR.U32 c00, c00, param_shift_RST;
+-:-:-:-:00 SHR.U32 c04, c04, param_shift_RST;
+-:-:-:-:00 SHR.U32 c08, c08, param_shift_RST;
+-:-:-:-:00 SHR.U32 c12, c12, param_shift_RST;
+// lutNN = crstNN - cNN * RST; each IMAD + IADD pair stands in for the commented-out VMAD
+//-:-:-:-:00 VMAD.U16.U16 lut00, -c00, RST, crst00;
+-:-:-:-:00 IMAD lut00, -c00, RST, RZ;
+-:-:-:-:00 IADD lut00, lut00, crst00;
+//-:-:-:-:00 VMAD.U16.U16 lut04, -c04, RST, crst04;
+-:-:-:-:00 IMAD lut04, -c04, RST, RZ;
+-:-:-:-:00 IADD lut04, lut04, crst04;
+//-:-:-:-:00 VMAD.U16.U16 lut08, -c08, RST, crst08;
+-:-:-:-:00 IMAD lut08, -c08, RST, RZ;
+-:-:-:-:00 IADD lut08, lut08, crst08;
+//-:-:-:-:00 VMAD.U16.U16 lut12, -c12, RST, crst12;
+-:-:-:-:00 IMAD lut12, -c12, RST, RZ;
+-:-:-:-:00 IADD lut12, lut12, crst12;
+// Scale each lookup-table index to a byte offset (4 bytes per entry)
+-:-:-:-:00 SHL lut00, lut00, 2;
+-:-:-:-:00 SHL lut04, lut04, 2;
+-:-:-:-:00 SHL lut08, lut08, 2;
+-:-:-:-:00 SHL lut12, lut12, 2;
+// chanNN = cNN * DHWN1, where DHWN1 was set to param_DHWN * 4 before the first call
+-:-:-:-:00 IMAD.U32.U32 chan00, DHWN1, c00, RZ;
+-:-:-:-:00 IMAD.U32.U32 chan04, DHWN1, c04, RZ;
+-:-:-:-:00 IMAD.U32.U32 chan08, DHWN1, c08, RZ;
+-:-:-:-:00 IMAD.U32.U32 chan12, DHWN1, c12, RZ;
+// Advance each row index by one for the next STORE_C call
+-:-:-:-:00 IADD crst00, crst00, 1;
+-:-:-:-:00 IADD crst04, crst04, 1;
+-:-:-:-:00 IADD crst08, crst08, 1;
+-:-:-:-:00 IADD crst12, crst12, 1;
+// Fetch the stored offset for each row whose predicate is still set
+-:-:-:-:00 @P0 LDS img00, [lut00 + addr_lut];
+-:-:-:-:00 @P1 LDS img04, [lut04 + addr_lut];
+-:-:-:-:00 @P2 LDS img08, [lut08 + addr_lut];
+-:-:-:-:00 @P3 LDS img12, [lut12 + addr_lut];
+// A negative table entry clears the row predicate; trackNNI = trackI + imgNN + chanNN (64-bit add with carry)
+-:-:-:-:00 ISETP.GE.AND P0, PT, img00, RZ, P0;
+-:-:-:-:00 IADD tmp_data, img00, chan00;
+-:-:-:-:00 IADD track00I0.CC, trackI0, tmp_data;
+-:-:-:-:00 IADD.X track00I1, trackI1, RZ;
+
+-:-:-:-:00 ISETP.GE.AND P1, PT, img04, RZ, P1;
+-:-:-:-:00 IADD tmp_data, img04, chan04;
+-:-:-:-:00 IADD track04I0.CC, trackI0, tmp_data;
+-:-:-:-:00 IADD.X track04I1, trackI1, RZ;
+
+-:-:-:-:00 ISETP.GE.AND P2, PT, img08, RZ, P2;
+-:-:-:-:00 IADD tmp_data, img08, chan08;
+-:-:-:-:00 IADD track08I0.CC, trackI0, tmp_data;
+-:-:-:-:00 IADD.X track08I1, trackI1, RZ;
+
+-:-:-:-:00 ISETP.GE.AND P3, PT, img12, RZ, P3;
+-:-:-:-:00 IADD tmp_data, img12, chan12;
+-:-:-:-:00 IADD track12I0.CC, trackI0, tmp_data;
+-:-:-:-:00 IADD.X track12I1,    trackI1, RZ;
+// c0/c2/c4/c6 go to the base addresses; each predicate then also takes in P6 (nn + 32 < N) for the +4x<32> adds
+-:-:-:-:00 @P0 RED.E.ADD.F32.FTZ.RN [track00I], c0;
+-:-:-:-:00     PSETP.AND.AND P0, PT, P0, P6, PT;
+-:-:-:-:00 @P1 RED.E.ADD.F32.FTZ.RN [track04I], c2;
+-:-:-:-:00     PSETP.AND.AND P1, PT, P1, P6, PT;
+-:-:-:-:00 @P2 RED.E.ADD.F32.FTZ.RN [track08I], c4;
+-:-:-:-:00     PSETP.AND.AND P2, PT, P2, P6, PT;
+-:-:-:-:00 @P3 RED.E.ADD.F32.FTZ.RN [track12I], c6;
+-:-:-:-:00     PSETP.AND.AND P3, PT, P3, P6, PT;
+
+-:-:-:-:00 @P0 RED.E.ADD.F32.FTZ.RN [track00I + 4x<32>], c1;
+-:-:-:-:00 @P1 RED.E.ADD.F32.FTZ.RN [track04I + 4x<32>], c3;
+-:-:-:-:00 @P2 RED.E.ADD.F32.FTZ.RN [track08I + 4x<32>], c5;
+-:-:-:-:00 @P3 RED.E.ADD.F32.FTZ.RN [track12I + 4x<32>], c7;
+
+-:-:-:-:00 RET;
+
diff --git a/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.cu b/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.cu
new file mode 100644
index 0000000..92ce953
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.cu
@@ -0,0 +1,56 @@
+extern "C" // Stub kernel: the body only touches share[], param_O and param_test; every other parameter is unused here.
+__global__ void sconv_bprop_C64_N64 ( // NOTE(review): presumably a placeholder whose body is supplied by sconv_bprop_C64_N64.sass — confirm
+    float* param_test,
+    float* param_O,
+    const float* param_I,
+    const float* param_F,
+    float param_alpha,
+    int param_N,
+    int param_K,
+    int param_D,
+    int param_H,
+    int param_W,
+    int param_WN,
+    int param_HWN,
+    int param_DHWN,
+    int param_C,
+    int param_CRST, // NOTE(review): the .sass CONSTANT_MAPPING names the parameter in this position param_KRST — confirm intended name
+    int param_RST,
+    int param_RS,
+    int param_magic_RS,
+    int param_shift_RS,
+    int param_S,
+    int param_magic_S,
+    int param_shift_S,
+    int param_pad_d,
+    int param_pad_h,
+    int param_pad_w,
+    int param_str_d,
+    int param_str_h,
+    int param_str_w,
+    int param_Q,
+    int param_PQ,
+    int param_QN,
+    int param_PQN,
+    int param_MPQN,
+    int param_magic_Q,
+    int param_shift_Q,
+    int param_magic_PQ,
+    int param_shift_PQ,
+    int param_R,
+    int param_T,
+    int param_magic_str_w,
+    int param_shift_str_w,
+    int param_magic_str_h,
+    int param_shift_str_h,
+    int param_magic_str_d,
+    int param_shift_str_d) {
+      __shared__ float share[64 * 8 * 4 + 8]; // same 64*8*4 + 8 word count the .sass addr_* constants build on (addr_lut = 4x<64*8*4 + 8>)
+
+      int tid = threadIdx.x;
+
+      share[tid] = 1;
+      // NOTE(review): no barrier before these reads, and 63-tid is negative for tid > 63 — presumably harmless in a placeholder body; confirm
+      *param_O = share[63-tid];
+      *param_test = share[63-tid];
+    }
diff --git a/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.sass b/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.sass
new file mode 100644
index 0000000..e6ddb9e
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.sass
@@ -0,0 +1,783 @@
+# Kernel: sconv_bprop_C64_N64
+// debug:
+// mode1
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//-:-:-:-:00 SHL tmp_shl, tid, 0x2;
+//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0;
+//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1;
+//-:-:-:-:00 I2F.F32.U32 rst, rst;
+//-:-:-:-:00 ST.E [tmp_param00], rst;
+//-:-:-:-:00 EXIT;
+
+// mode2
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//
+//-:-:-:-:00 MOV32I k, 0x40000000;
+//-:-:-:-:00 ST.E [tmp_param0], k;
+//-:-:-:-:00 EXIT;
+
+// modify steps:
+// XMAD->IMAD
+// shared memory addresses->RZ
+// LDG->LD
+// LEA->MOV, IADD, SHL
+// XMAD.LO2C->IMAD.U32.U32
+// XMAD.PSL->IMAD.U32.U32
+// VMAD->IMAD, IADD
+// MOV->MOV32I
+// IADD3->IADD, IADD
+// POPC
+// ST.CG->ST
+// control code
+// comments
+// LDS.U->LDS
+// register<0-7>->register<0-3>, register<4-7>
+// avoid register conflicts
+// tid->other register
+
+// optimization steps:
+// alexnet2
+// initial->1200
+// bank conflict->1300
+// alignment+dual issue+reuse->1700
+// all ldg.128->1900
+// control codes->2000
+// reduce unnecessary instructions->2100
+// scheduling->1937
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<64*8*4 + 0>
+    addr_m     : 4x<64*8*4 + 4>
+    addr_p     : 4x<64*8*4 + 5>
+    addr_q     : 4x<64*8*4 + 6>
+    addr_szLut : 4x<64*8*4 + 7>
+    addr_lut   : 4x<64*8*4 + 8>
+
+    param_test[0]     : c[0x0][0x140]
+    param_test[1]     : c[0x0][0x144]
+    param_O[0]        : c[0x0][0x148]
+    param_O[1]        : c[0x0][0x14c]
+    param_I[0]        : c[0x0][0x150]
+    param_I[1]        : c[0x0][0x154]
+    param_F[0]        : c[0x0][0x158]
+    param_F[1]        : c[0x0][0x15c]
+    param_alpha       : c[0x0][0x160]
+    param_N           : c[0x0][0x164]
+    param_K           : c[0x0][0x168]
+    param_D           : c[0x0][0x16c]
+    param_H           : c[0x0][0x170]
+    param_W           : c[0x0][0x174]
+    param_WN          : c[0x0][0x178]
+    param_HWN         : c[0x0][0x17c]
+    param_DHWN        : c[0x0][0x180]
+    param_C           : c[0x0][0x184]
+    param_KRST        : c[0x0][0x188]
+    param_RST         : c[0x0][0x18c]
+    param_RS          : c[0x0][0x190]
+    param_magic_RS    : c[0x0][0x194]
+    param_shift_RS    : c[0x0][0x198]
+    param_S           : c[0x0][0x19c]
+    param_magic_S     : c[0x0][0x1a0]
+    param_shift_S     : c[0x0][0x1a4]
+    param_pad_d       : c[0x0][0x1a8]
+    param_pad_h       : c[0x0][0x1ac]
+    param_pad_w       : c[0x0][0x1b0]
+    param_str_d       : c[0x0][0x1b4]
+    param_str_h       : c[0x0][0x1b8]
+    param_str_w       : c[0x0][0x1bc]
+    param_Q           : c[0x0][0x1c0]
+    param_PQ          : c[0x0][0x1c4]
+    param_QN          : c[0x0][0x1c8]
+    param_PQN         : c[0x0][0x1cc]
+    param_MPQN        : c[0x0][0x1d0]
+    param_magic_Q     : c[0x0][0x1d4]
+    param_shift_Q     : c[0x0][0x1d8]
+    param_magic_PQ    : c[0x0][0x1dc]
+    param_shift_PQ    : c[0x0][0x1e0]
+    param_R           : c[0x0][0x1e4]
+    param_T           : c[0x0][0x1e8]
+    param_magic_str_w : c[0x0][0x1ec]
+    param_shift_str_w : c[0x0][0x1f0]
+    param_magic_str_h : c[0x0][0x1f4]
+    param_shift_str_h : c[0x0][0x1f8]
+    param_magic_str_d : c[0x0][0x1fc]
+    param_shift_str_d : c[0x0][0x200]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-67 : mpq<0-3>
+    64-67 : m, p, q, tidY
+    68-70 : blkF, blkI, blkMPQ
+    72-95 ~ tid1, tidX
+    72-107 ~ str_d, str_h, str_w, pq, mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, one, rst_prime, x_prime, y_prime, z_prime, ballot, warp_slices, partial, endCRST
+
+    0-63 : czero<00-63>
+
+   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
+   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
+   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
+   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
+   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
+  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
+  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
+  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7
+
+      64-67 : j0Fy<0-3>
+      68-71 : j0Ix<0-3>
+      72-75 : j0Fy<4-7>
+      76-79 : j0Ix<4-7>
+      80-83 : j1Fy<0-3>
+      84-87 : j1Ix<0-3>
+      88-91 : j1Fy<4-7>
+      92-95 : j1Ix<4-7>
+
+    136-139 : offsetF, offsetIc, offsetFc
+    140-141 : sliceI, sliceF
+    140-141 : sliceIF<0-1>
+    142-145 : addressF<0-1>, addressI<0-1>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-103 : loadI<0-3>
+    104-107 : loadF<0-3>
+    108-111 : loadI<4-7>
+    112-115 : loadF<4-7>
+
+    116-125 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI
+    126-127 : readFs, readIs
+    128-131 : tmp_data, tmp_shl, p_and, tid 
+    132-133 : tmp_param<0-1>
+
+    72-79  : cs<0-7>
+    80-81  : Out<0-1>
+    82-125 ~ writeCs, readCs, alpha, tidOX, tidOY, to, k, n, MPQN1, MPQN28, MPQN, MPQN4
+
+</REGISTER_MAPPING>
+
+-:-:-:-:00 S2R tid,    SR_TID.X;
+-:-:-:-:00 S2R blkF,   SR_CTAID.Y;
+-:-:-:-:00 S2R blkI,   SR_CTAID.Z;
+-:-:-:-:00 S2R blkMPQ, SR_CTAID.X; # m,p,q stored in x index
+
+-:-:-:-:00 ISETP.GE.AND P0, PT, tid, 32, PT;
+
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+
+<CODE>
+    return join '', map { sprintf "-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", 4 * $_ } 0 .. 15; # 16 LDS.128 loads from addr_zero into czero00, czero04, ... czero60
+</CODE>
+
+// tidX = (tid & 7) << 2
+// tidY = tid >> 3
+-:-:-:-:00 LOP.AND tidX, tid,  7;
+-:-:-:-:00 SHL     tidX, tidX, 2;
+-:-:-:-:00 SHR.U32 tidY, tid,  3;
+
+// trackF += blkF*64 + tidX
+-:-:-:-:00 ISCADD offsetFk, blkF, tidX, 6;
+
+// trackI += blkI*64 + tidX
+-:-:-:-:00 ISCADD offsetIn, blkI, tidX, 6;
+
+// writeS = (64*tidY + tidX) * 4
+-:-:-:-:00 ISCADD  writeS, tidY, tidX, 6;
+-:-:-:-:00 SHL     writeS, writeS, 2;
+
+// readFs  = (((tid & 0x30) >> 3) | (tid & 1)) << 4;
+-:-:-:-:00 LOP.AND tid1,   tid,    1;
+-:-:-:-:00 LOP.AND readFs, tid,    0x30;
+-:-:-:-:00 SHR.U32 readFs, readFs, 3;
+-:-:-:-:00 LOP.OR  readFs, readFs, tid1;
+-:-:-:-:00 SHL     readFs, readFs, 4;
+
+// readIs = ((tid >> 1) & 7) << 4 + 4x<8*64>;
+-:-:-:-:00 BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+-:-:-:-:00 ISCADD  readIs, readIs, 4x<8*64>, 4;
+
+-:-:-:-:00 @P0 BRA.U END_SETUP;
+
+-:-:-:-:00 MOV str_d, param_str_d;
+-:-:-:-:00 MOV str_h, param_str_h;
+-:-:-:-:00 MOV str_w, param_str_w;
+-:-:-:-:00 MOV rst, tid;
+-:-:-:-:00 MOV lutStore2, RZ;
+-:-:-:-:00 MOV lutSize, RZ;
+-:-:-:-:00 MOV32I warp_count, 32;
+
+// m  = blkMPQ / PQ
+// pq = blkMPQ % PQ
+-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ;
+-:-:-:-:00 SHR.U32   m, m, param_shift_PQ;
+-:-:-:-:00 IMAD      pq, m, param_PQ, RZ;
+-:-:-:-:00 IADD      pq, -pq, blkMPQ;
+// p = pq / Q
+// q = pq % Q
+-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ;
+-:-:-:-:00 SHR.U32   p, p, param_shift_Q;
+-:-:-:-:00 IMAD      q, p, param_Q, RZ;
+-:-:-:-:00 IADD      q, -q, pq;
+
+-:-:-:-:00 MOV32I dep_thd_mask, -1;
+
+-:-:-:-:00 LOP.AND p_and, p, 1;
+-:-:-:-:00 ISETP.NE.AND P1, PT, p_and, RZ, PT;
+-:-:-:-:00 @P1 IADD q, -q, param_Q;
+-:-:-:-:00 @P1 IADD q, q, dep_thd_mask;
+
+-:-:-:-:00 STS.128 [RZ + addr_m], m;
+
+// qs = q - S + pad_w + 1
+-:-:-:-:00 MOV32I one, 1;
+-:-:-:-:00 IADD qs, q, -param_S;
+-:-:-:-:00 IADD qs, qs, param_pad_w;
+-:-:-:-:00 IADD qs, qs, one;
+
+// pr = p - R + pad_h + 1
+-:-:-:-:00 IADD pr, p, -param_R;
+-:-:-:-:00 IADD pr, pr, param_pad_h;
+-:-:-:-:00 IADD pr, pr, one;
+
+// mt = m - T + pad_d + 1
+-:-:-:-:00 IADD mt, m, -param_T;
+-:-:-:-:00 IADD mt, mt, param_pad_d;
+-:-:-:-:00 IADD mt, mt, one;
+
+-:-:-:-:00 IADD mask_shr, -tid, 32;
+-:-:-:-:00 SHR.U32 dep_thd_mask, dep_thd_mask, mask_shr;
+
+LUT_LOOP:
+
+// warp synchronous loop while warp_count < RST
+-:-:-:-:00 ISETP.LT.AND P0, PT, warp_count, param_RST, PT;
+-:-:-:-:00 IADD warp_count, warp_count, 32;
+// t =  rst / RS
+// rs = rst % RS
+-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, RZ;
+-:-:-:-:00 SHR.U32 t, t, param_shift_RS;
+-:-:-:-:00 IMAD rs, t, param_RS, RZ;
+-:-:-:-:00 IADD rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ;
+-:-:-:-:00 SHR.U32 r, r, param_shift_S;
+-:-:-:-:00 IMAD s, r, param_S, RZ;
+-:-:-:-:00 IADD s, -s, rs;
+// x = qs + s
+// y = pr + r
+// z = mt + t
+-:-:-:-:00 IADD x, qs, s;
+-:-:-:-:00 IADD y, pr, r;
+-:-:-:-:00 IADD z, mt, t;
+-:-:-:-:00 ISETP.GE.AND  P4, PT, x, RZ, PT;
+-:-:-:-:00 ISETP.GE.AND  P5, PT, y, RZ, PT;
+-:-:-:-:00 ISETP.GE.AND  P6, PT, z, RZ, PT;
+// rst_prime = t*RS + r*S + s
+// s = S - s - 1
+-:-:-:-:00 IADD s, -s, param_S;
+-:-:-:-:00 IADD s, s, -one;
+// r = R - r - 1
+-:-:-:-:00 IADD r, -r, param_R;
+-:-:-:-:00 IADD r, r, -one;
+// t = T - t - 1
+-:-:-:-:00 IADD t, -t, param_T;
+-:-:-:-:00 IADD t, t, -one;
+
+-:-:-:-:00 IMAD  rst_prime, r, param_S,  s;
+-:-:-:-:00 IMAD  rst_prime, t, param_RS, rst_prime;
+
+// x_prime = x / str_w
+// x       = x % str_w
+-:-:-:-:00 IMAD    x_prime, x, param_magic_str_w, RZ;
+-:-:-:-:00 SHR.U32 x_prime, x_prime, param_shift_str_w;
+-:-:-:-:00 IMAD tmp_param0, str_w, x_prime, RZ;
+-:-:-:-:00 IADD x, -tmp_param0, x;
+// y_prime = y / str_h
+// y       = y % str_h
+-:-:-:-:00 IMAD    y_prime, y, param_magic_str_h, RZ;
+-:-:-:-:00 SHR.U32 y_prime, y_prime, param_shift_str_h;
+-:-:-:-:00 IMAD tmp_param0, str_h, y_prime, RZ;
+-:-:-:-:00 IADD y, -tmp_param0, y;
+// z_prime = z / str_d
+// z       = z % str_d
+-:-:-:-:00 IMAD    z_prime, z, param_magic_str_d, RZ;
+-:-:-:-:00 SHR.U32 z_prime, z_prime, param_shift_str_d;
+-:-:-:-:00 IMAD tmp_param0, str_d, z_prime, RZ;
+-:-:-:-:00 IADD z, -tmp_param0, z;
+
+// calculate x_prime only when x % str_w == 0
+// it may be greater than Q due to its location
+-:-:-:-:00 ISETP.EQ.AND  P4, PT, x, RZ, P4;
+-:-:-:-:00 ISETP.EQ.AND  P5, PT, y, RZ, P5;
+-:-:-:-:00 ISETP.EQ.AND  P6, PT, z, RZ, P6;
+-:-:-:-:00 ISETP.LT.AND  P4, PT, x_prime, param_W, P4;
+-:-:-:-:00 ISETP.LT.AND  P5, PT, y_prime, param_H, P5;
+-:-:-:-:00 ISETP.LT.AND  P6, PT, z_prime, param_D, P6;
+-:-:-:-:00 PSETP.AND.AND P1, PT, P4, P5, P6;
+
+// sliceI = z_prime*HWN + y_prime*WN + x_prime*N
+-:-:-:-:00 IMAD      sliceI, x_prime, param_N,   RZ;
+-:-:-:-:00 IMAD.U32.U32 sliceI, y_prime, param_WN,  sliceI;
+-:-:-:-:00 IMAD.U32.U32 sliceI, z_prime, param_HWN, sliceI;
+// sliceF = rst_prime * K
+-:-:-:-:00 IMAD sliceF, rst_prime, param_K, RZ;
+
+// Get a mask of all valid slices in the warp
+-:-:-:-:00 VOTE.ANY ballot, PT, P1;
+// Count the total valid slices
+-:-:-:-:00 POPC warp_slices, ballot, ballot;
+// Prepare lutStore for this and next loop
+-:-:-:-:00 @P1 MOV    lutStore, lutStore2;
+-:-:-:-:00 ISCADD lutStore2, warp_slices, lutStore2, 3;
+// Count all the valid slices below this threadid
+-:-:-:-:00 @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
+-:-:-:-:00 @P1 POPC dep_thd_cnt, dep_thd_bits, dep_thd_bits;
+// use the rst increment to space the barrier sync
+-:-:-:-:00 IADD rst, rst, 32;
+// Update the lutStore address from this count
+-:-:-:-:00 @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
+// Store both slice offsets in the lut
+-:-:-:-:00 @P1 STS.64 [lutStore + addr_lut], sliceIF;
+// Keep track of the total size of the lut
+-:-:-:-:00 IADD lutSize, lutSize, warp_slices;
+
+-:-:-:-:00 @P0 BRA.U LUT_LOOP;
+
+// Share the lut size with the other warp
+-:-:-:-:00 STS [RZ + addr_szLut], lutSize;
+
+END_SETUP:
+
+-:-:-:-:00 BAR.SYNC 0;
+
+// Grab the calculated lut size and get its reciprocal
+// Get the total reduction depth
+-:-:-:-:00 LDS lutSize, [RZ + addr_szLut];
+-:-:-:-:00 IMAD endCRST, lutSize, param_C, RZ;
+-:-:-:-:00 I2F.F32.S32 lutSizeRcp, lutSize;
+-:-:-:-:00 MUFU.RCP lutSizeRcp, lutSizeRcp;
+
+// posCRST = endCRST - tidY - 1
+-:-:-:-:00 IADD posCRST, endCRST, -1;
+-:-:-:-:00 IADD posCRST, posCRST, -tidY;
+// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 8 then make a full 8 line fetch.
+-:-:-:-:00 LOP.AND partial, endCRST, 7;
+-:-:-:-:00 ISETP.EQ.AND P1, PT, RZ, partial, PT;
+-:-:-:-:00 @P1 MOV32I partial, 8;
+// channel = posCRST / lutSize
+// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it
+-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST;
+-:-:-:-:00 FMUL channel, posCRSTf, lutSizeRcp;
+-:-:-:-:00 FFMA channel, channel, 5.9604644775390625e-08, channel;
+-:-:-:-:00 F2I.S32.F32.TRUNC channel, channel;
+// lutOffset = (posCRST % lutSize) * 8
+-:-:-:-:00 IMAD tmp_param0, channel, lutSize, RZ;
+-:-:-:-:00 IADD lutOffset, -tmp_param0, posCRST;
+
+-:-:-:-:00 SHL lutOffset, lutOffset, 3;
+// P1 = tidY < partial
+-:-:-:-:00 ISETP.LT.AND P1, PT, tidY, partial, PT;
+// offsetIC = channel * DHWN
+// offsetFC = channel * KRST
+-:-:-:-:00 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ;
+-:-:-:-:00 IMAD offsetFc, channel, param_KRST, RZ;
+// posCRST -= partial
+-:-:-:-:00 IADD posCRST, posCRST, -partial;
+-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut];
+
+// offsetF = offsetFK + offsetFC + sliceF (trackF = param_F + 4*offsetF, computed below)
+-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc;
+-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF;
+// offsetI = offsetIN + offsetIC + sliceI (trackI = param_I + 4*offsetI, computed below)
+-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc;
+-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI;
+
+//-:-:-:-:00 @P1 LEA      trackF0.CC, offsetF, param_F[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;
+-:-:-:-:00 @P1 MOV tmp_param0, param_F[0];
+-:-:-:-:00 @P1 MOV tmp_param1, param_F[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2;
+-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 @P1 IADD.X trackF1, RZ, tmp_param1;
+//-:-:-:-:00 @P1 LEA      trackI0.CC, offsetI, param_I[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;
+-:-:-:-:00 @P1 MOV tmp_param0, param_I[0];
+-:-:-:-:00 @P1 MOV tmp_param1, param_I[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2;
+-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 @P1 IADD.X trackI1, RZ, tmp_param1;
+
+-:-:-:-:00 @P1 LD.E.CI.128 loadF0, [trackF + 4x< 0>];
+-:-:-:-:00 @P1 LD.E.CI.128 loadF4, [trackF + 4x<32>];
+-:-:-:-:00 @!P1 LDS.128 loadF0, [RZ + addr_zero];
+-:-:-:-:00 @!P1 LDS.128 loadF4, [RZ + addr_zero];
+
+-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI + 4x< 0>];
+-:-:-:-:00 @P1 LD.E.128 loadI4, [trackI + 4x<32>];
+-:-:-:-:00 @!P1 LDS.128 loadI0, [RZ + addr_zero];
+-:-:-:-:00 @!P1 LDS.128 loadI4, [RZ + addr_zero];
+
+-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT;
+
+-:-:-:-:00 STS.128 [writeS + 4x<0*64 +  0>], loadF0;
+-:-:-:-:00 STS.128 [writeS + 4x<0*64 + 32>], loadF4;
+
+-:-:-:-:00 STS.128 [writeS + 4x<8*64 +  0>], loadI0;
+-:-:-:-:00 STS.128 [writeS + 4x<8*64 + 32>], loadI4;
+
+-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST;
+
+-:-:-:-:00 BAR.SYNC 0;
+-:-:-:-:00 LOP.XOR writeS, writeS, 4x<64*8*2>;
+
+-:-:-:-:00 LDS.128 j0Ix0, [readIs + 4x<0*64 + 00>];
+-:-:-:-:00 LDS.128 j0Fy0, [readFs + 4x<0*64 + 00>];
+-:-:-:-:00 LDS.128 j0Ix4, [readIs + 4x<0*64 + 32>];
+-:-:-:-:00 LDS.128 j0Fy4, [readFs + 4x<0*64 + 32>];
+
+// channel = posCRST / lutSize
+-:-:-:-:00 @P1 FMUL channel, posCRSTf, lutSizeRcp;
+-:-:-:-:00 @P1 FFMA channel, channel, 5.9604644775390625e-08, channel;
+-:-:-:-:00 @P1 F2I.S32.F32.TRUNC channel, channel;
+// lutOffset = (posCRST % lutSize) * 8
+-:-:-:-:00 @P1 IMAD tmp_param0, channel, lutSize, RZ;
+-:-:-:-:00 @P1 IADD lutOffset, -tmp_param0, posCRST;
+-:-:-:-:00 @P1 SHL lutOffset, lutOffset, 3;
+// offsetIC = channel * DHWN
+// offsetFC = channel * KRST
+-:-:-:-:00 @P1 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ;
+-:-:-:-:00 @P1 IMAD      offsetFc, channel, param_KRST, RZ;
+
+-:-:-:-:00 IADD posCRST, posCRST, -8;
+-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut];
+
+// offsetF = offsetFK + offsetFC + sliceF  (trackF = param_F + 4 * offsetF)
+-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc;
+-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF;
+// offsetI = offsetIN + offsetIC + sliceI  (trackI = param_I + 4 * offsetI)
+-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc;
+-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI;
+//-:-:-:-:00 @P1 LEA      trackF0.CC, offsetF, param_F[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;
+-:-:-:-:00 @P1 MOV addressF0, param_F[0];
+-:-:-:-:00 @P1 MOV addressF1, param_F[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2;
+-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, addressF0;
+-:-:-:-:00 @P1 IADD.X trackF1, RZ, addressF1;
+//-:-:-:-:00 @P1 LEA      trackI0.CC, offsetI, param_I[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;
+-:-:-:-:00 @P1 MOV addressI0, param_I[0];
+-:-:-:-:00 @P1 MOV addressI1, param_I[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2;
+-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, addressI0;
+-:-:-:-:00 @P1 IADD.X trackI1, RZ, addressI1;
+-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF + 4x< 0>];
+-:-:-:-:00 @P1 LD.E.128 loadF4, [trackF + 4x<32>];
+-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI + 4x< 0>];
+-:-:-:-:00 @P1 LD.E.128 loadI4, [trackI + 4x<32>];
+-:-:-:-:00 MOV32I tmp_data, 128;
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+
+LOOP:
+
+<CODE>
+    # Generates the body of LOOP: 8 unroll steps (j), each made of 64 FFMAs over the
+    # 8x8 accumulator tile cx<x>y<y>, with extra instructions spliced in between them.
+
+    # $schedule{j}{c} is the text emitted immediately after FFMA number c of step j.
+    my %schedule =
+    (
+        0 => {
+            47 => "-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+            53 => "-:-:-:-:00 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+            61 => "-:-:-:-:00 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+            62 => "-:-:-:-:00 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+            63 => "-:-:-:-:00 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        },
+        1 => {
+            47 => "-:-:-:-:00 \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+            53 => "-:-:-:-:00 \@P1 IMAD tmp_param0, channel, lutSize, RZ;\n",
+            61 => "-:-:-:-:00 \@P1 IADD lutOffset, -tmp_param0, posCRST;\n",
+            62 => "-:-:-:-:00 \@P1 IMAD offsetF, channel, param_KRST, offsetFk;\n",
+            63 => "-:-:-:-:00 \@P1 IMAD offsetI, channel, param_DHWN, offsetIn;\n",
+        },
+        2 => {
+            47 => "-:-:-:-:00 \@P1 SHL lutOffset, lutOffset, 3;\n",
+            53 => "-:-:-:-:00 IADD posCRST, posCRST, -8;\n",
+            61 => "-:-:-:-:00 \@P1 LDS.64 sliceIF, [lutOffset + addr_lut];\n",
+        },
+        3 => {
+            47 => "-:-:-:-:00 \@P1 IADD offsetF, offsetF, sliceF;\n",
+            53 => "-:-:-:-:00 TEXDEPBAR 0x0;\n",
+            61 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetF, 0x2;\n",
+            62 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<0*64 + 0>], loadF0;\n",
+            63 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<0*64 + 32>], loadF4;\n",
+        },
+        4 => {
+            47 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<8*64 + 0>], loadI0;\n",
+            53 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<8*64 + 32>], loadI4;\n",
+            61 => "-:-:-:-:00 \@P1 IADD trackF0.CC, tmp_shl, addressF0;\n",
+            62 => "-:-:-:-:00 \@P1 IADD.X trackF1, RZ, addressF1;\n",
+            63 => "-:-:-:-:00 \@P1 IADD offsetI, offsetI, sliceI;\n",
+        },
+        5 => {
+            47 => "-:G:D:-:00 \@P1 LDG.E.128 loadF0, [trackF];\n",
+            53 => "-:-:-:-:00 \@P1 IADD tmp_param0.CC, tmp_data, trackF0;\n",
+            61 => "-:-:-:-:00 \@P1 IADD.X tmp_param1, RZ, trackF1;\n",
+            62 => "-:G:D:-:00 \@P1 LDG.E.128 loadF4, [tmp_param];\n",
+            63 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n",
+        },
+        6 => {
+            47 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetI, 0x2;\n",
+            53 => "-:-:-:-:00 \@P1 IADD trackI0.CC, tmp_shl, addressI0;\n",
+            61 => "-:-:-:-:00 \@P1 IADD.X trackI1, RZ, addressI1;\n",
+            63 => join('',
+                "-:-:-:-:00 \@P0 BAR.SYNC 0;\n",
+                "-:-:-:-:00 \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n",
+                "-:-:-:-:00 NOP;\n",
+                "-:-:-:-:00 NOP;\n",
+                "-:-:-:-:00 NOP;\n",
+                "-:-:-:-:00 \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n",
+                "-:-:-:-:00 NOP;\n",
+                "-:-:-:-:00 NOP;\n"),
+        },
+        7 => {
+            47 => "-:G:D:-:00 \@P1 LDG.E.128 loadI0, [trackI + 4x<0>];\n",
+            53 => "-:-:-:-:00 \@P1 IADD tmp_param0.CC, tmp_data, trackI0;\n",
+            61 => "-:-:-:-:00 \@P1 IADD.X tmp_param1, RZ, trackI1;\n",
+            62 => "-:G:D:-:00 \@P1 LDG.E.128 loadI4, [tmp_param];\n",
+            63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n",
+        },
+    );
+
+    # Shared-memory loads issued in every step:
+    # [FFMA slot, destination register suffix, base register, offset inside the line].
+    my @prefetch =
+    (
+        [  5, 'Ix0', 'readIs', '00' ],
+        [ 11, 'Ix2', 'readIs', '2'  ],
+        [ 17, 'Ix4', 'readIs', '32' ],
+        [ 59, 'Ix6', 'readIs', '34' ],
+        [ 23, 'Fy0', 'readFs', '00' ],
+        [ 29, 'Fy2', 'readFs', '2'  ],
+        [ 35, 'Fy4', 'readFs', '32' ],
+        [ 41, 'Fy6', 'readFs', '34' ],
+    );
+
+    # (x, y) of the accumulator updated by each of the 64 FFMA slots, in issue order.
+    my @cOrder =
+    (
+        [0,0], [0,1], [1,1], [2,0], [1,0], [2,1],
+        [2,3], [2,2], [1,2], [0,3], [1,3], [0,2],
+        [0,4], [0,5], [1,5], [2,4], [1,4], [2,5],
+        [2,7], [2,6], [1,6], [0,7], [1,7], [0,6],
+        [3,6], [3,7], [4,7], [5,6], [4,6], [5,7],
+        [5,5], [5,4], [4,4], [3,5], [4,5], [3,4],
+        [3,2], [3,3], [4,3], [5,2], [4,2], [5,3],
+        [5,1], [5,0], [4,0], [3,1], [4,1], [3,0],
+        [6,0], [7,0], [7,1], [6,2], [6,1], [7,2],
+        [7,5], [6,5], [6,4], [7,3], [7,4], [6,3],
+        [6,6], [6,7], [7,7], [7,6],
+    );
+
+    my $out;
+    for my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;                      # j<odd> registers feed this step's FFMAs
+        my $nOdd     = $odd ? 0 : 1;                # the other register set receives the loads
+        my $rsOffset = ($j + 1) % 8;                # shared-memory line read for the next step
+        my $rsPred   = $j == 7 ? '@P0' : '   ';     # the last step's loads are predicated on P0
+
+        for my $load (@prefetch)
+        {
+            my ($slot, $reg, $base, $offset) = @$load;
+            $schedule{$j}{$slot} = sprintf "-:G:D:-:01 %s LDS.64 j%d%s, [%s + 4x<%d*64 + %s>];\n",
+                $rsPred, $nOdd, $reg, $base, $rsOffset, $offset;
+        }
+
+        for my $c (0 .. $#cOrder)
+        {
+            my ($x, $y) = @{ $cOrder[$c] };
+            my $phase   = $c % 6;
+            my $ins     = $schedule{$j}{$c} || '';
+
+            # Slots with nothing scheduled: every sixth slot (phase 5) and slot 63 get the
+            # first NOP form; any slot above 60 that is still empty gets the second one.
+            $ins = "-:G:D:-:00 NOP;\n" if !$ins && ($phase == 5 || $c == 63);
+            $ins = "-:-:D:-:07 NOP;\n" if !$ins && $c > 60;
+
+            # 04 and 05 are dual issued
+            my $ctrl = ($ins || $phase == 1 || $phase == 3) ? "-:-:D:-:04" : "-:-:D:-:05";
+
+            $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s",
+                $ctrl, $x, $y, $odd, $x, $odd, $y, $x, $y, $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+-:-:-:-:00 LDS.128 mpq, [RZ + addr_m]; // reload the 4-word mpq vector kept at addr_m (presumably m, p, q - confirm)
+-:-:-:-:00 S2R tid,  SR_TID.X;
+-:-:-:-:00 S2R blkI, SR_CTAID.Z;
+-:-:-:-:00 S2R blkF, SR_CTAID.Y;
+// Output stage setup: shuffle addresses in shared memory, output coordinates n and k, base pointer Out.
+// tidOX = (tid & 7) << 2
+// tidOY = tid >> 3
+-:-:-:-:00 LOP.AND tidOX, tid,   7;
+-:-:-:-:00 SHL     tidOX, tidOX, 2;
+-:-:-:-:00 SHR.U32 tidOY, tid,   3;
+
+-:-:-:-:00 LOP.AND readIs, readIs, 0x7ff; // keep bits 0-10; drops the bit the LOOP XOR toggles (4096) and anything else above 2047
+-:-:-:-:00 LOP.AND readFs, readFs, 0x7ff; // same masking for the filter read offset
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 64 + readIs;
+-:-:-:-:00 ISCADD  writeCs, readFs, readIs, 4;
+
+// readCs = 4 * (tidOX + (tidOY * 64))
+-:-:-:-:00 ISCADD readCs, tidOY, tidOX, 6;
+-:-:-:-:00 SHL    readCs, readCs, 2;
+
+// n = blkI*64 + tidOX;
+-:-:-:-:00 ISCADD n, blkI, tidOX, 6;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*64 + tidOY * 4
+-:-:-:-:00 SHL    tidOY,   tidOY, 2;
+-:-:-:-:00 ISCADD k, blkF, tidOY, 6;
+
+// o = k*MPQN + m*PQN + p*QN + q*N + n
+-:-:-:-:00 IMAD      to, q, param_N,    n;
+-:-:-:-:00 IMAD.U32.U32 to, p, param_QN,   to;
+-:-:-:-:00 IMAD.U32.U32 to, m, param_PQN,  to;
+-:-:-:-:00 IMAD.U32.U32 to, k, param_MPQN, to;
+//-:-:-:-:00 LEA      Out0.CC, to, param_O[0],     2;
+//-:-:-:-:00 LEA.HI.X Out1,    to, param_O[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_O[0];
+-:-:-:-:00 MOV tmp_param1, param_O[1];
+-:-:-:-:00 SHL tmp_shl, to, 0x2; // element offset to bytes; unlike the LEA.HI.X form above, bits shifted out of the top are lost
+-:-:-:-:00 IADD Out0.CC, tmp_shl, tmp_param0; // low word of Out = param_O + 4 * to, carry kept in CC
+-:-:-:-:00 IADD.X Out1, RZ, tmp_param1; // high word of param_O plus that carry
+
+-:-:-:-:00 MOV  MPQN,  param_MPQN;
+-:-:-:-:00 SHL  MPQN1, MPQN, 2; // MPQN * 4: byte step between consecutive k rows of the output
+-:-:-:-:00 SHL  MPQN4, MPQN, 4; // MPQN * 16
+-:-:-:-:00 ISCADD MPQN28, MPQN, -MPQN4, 7; // MPQN * 128 - MPQN * 16 = 112 * MPQN bytes, i.e. 28 k rows
+
+-:-:-:-:00 ISETP.LT.AND P0, PT, n, param_N, PT; // n +  0 < N
+-:-:-:-:00 IADD n, n, 32;
+-:-:-:-:00 ISETP.LT.AND P1, PT, n, param_N, PT; // n + 32 < N
+
+-:-:-:-:00 MOV alpha, param_alpha;
+
+-:-:-:-:00 BAR.SYNC 0;
+
+<CODE>
+
+    # Emits the output stage: for each accumulator row y the eight cx<x>y<y> values are
+    # scaled by alpha into cs0..cs7 and STORE_C is called to write them out.
+    my $out = '';
+    for my $y (0 .. 7)
+    {
+        my $scale = join '',
+            map { "-:-:-:-:00 FMUL cs$_, cx${_}y$y, alpha;\n" } 0 .. 7;
+
+        if ($y == 4)
+        {
+            # Before row 4 the output pointer and k jump ahead by 28 rows; the carry into
+            # Out1 is emitted after the FMULs.
+            $out .= "-:-:-:-:00 IADD Out0.CC, Out0, MPQN28;\n"
+                  . "-:-:-:-:00 IADD k, k, 28;\n"
+                  . $scale
+                  . "-:-:-:-:00 IADD.X Out1, Out1, RZ;\n";
+        }
+        else
+        {
+            $out .= $scale;
+        }
+
+        # Blank line, then the call, then another blank line.
+        $out .= "\n-:-:-:-:00 CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+-:-:-:-:00 EXIT;
+
+STORE_C:
+// Subroutine: writes cs0-cs7 for the current k row to Out, then advances k and Out by one row.
+-:-:-:-:00 ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n +  0 < N
+-:-:-:-:00 ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n + 32 < N
+-:-:-:-:00 IADD k, k, 1;
+
+// Round trip through shared memory (STS at writeCs, LDS at readCs) to rearrange which thread holds which values
+-:-:-:-:00 STS.128 [writeCs + 4x<00>], cs0;
+-:-:-:-:00 STS.128 [writeCs + 4x<32>], cs4;
+-:-:-:-:00 LDS.128 cs0, [readCs + 4x<00>];
+-:-:-:-:00 LDS.128 cs4, [readCs + 4x<32>];
+
+// Store results back to global
+-:-:-:-:00 @P2 ST.E.128 [Out + 4x<00>], cs0;
+-:-:-:-:00 @P3 ST.E.128 [Out + 4x<32>], cs4;
+
+-:-:-:-:00 IADD   Out0.CC, Out0, MPQN1; // advance Out by MPQN1 bytes (low word, carry in CC)
+-:-:-:-:00 IADD.X Out1,    Out1, RZ; // propagate the carry into the high word
+
+-:-:-:-:00 RET;
+
diff --git a/Kernel/Convolution/Kepler/sconv_fprop.cu b/Kernel/Convolution/Kepler/sconv_fprop.cu
new file mode 100644
index 0000000..51f1979
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_fprop.cu
@@ -0,0 +1,221 @@
+#include "sconv.h"
+
+// Launch sconv_fprop_K64_N64 (grid: MPQ x ceil(K/64) x ceil(N/64), 64 threads per block) and
+// print the first 64 floats of its debug buffer. Returns false if setup or the launch fails.
+bool fprop_K64_N64(const float *I, const float *F, float *O,
+  unsigned int N, unsigned int C, unsigned int K,
+  unsigned int D, unsigned int H, unsigned int W,
+  unsigned int R, unsigned int S, unsigned int T,
+  unsigned int M, unsigned int P, unsigned int Q,
+  unsigned int str_d, unsigned int str_h, unsigned int str_w,
+  unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) {
+  std::string kernel_name = "sconv_fprop_K64_N64";
+  float alpha = 1.0f;
+  unsigned int magic_RS, shift_RS, magic_S, shift_S;
+  unsigned int magic_Q, shift_Q, magic_PQ, shift_PQ;
+  // input
+  unsigned int WN = W * N;
+  unsigned int HWN = H * WN;
+  unsigned int DHWN = D * HWN;
+  // filter
+  unsigned int RS = R * S;
+  unsigned int RST = T * RS;
+  unsigned int KRST = K * RST;
+  // output
+  unsigned int QN = Q * N;
+  unsigned int PQ = P * Q;
+  unsigned int PQM = PQ * M;
+  unsigned int PQN = P * QN;
+  unsigned int MPQN = M * PQN;
+  // magic numbers
+  magic32(PQ, Q, magic_Q, shift_Q);
+  magic32(PQM, PQ, magic_PQ, shift_PQ);
+  magic32(RST + 32, RS, magic_RS, shift_RS);
+  magic32(RS + 32, S, magic_S, shift_S);
+  // test param set up: zero-filled device buffer handed to the kernel as its first argument
+  float *test_param = NULL;
+  cudaError_t cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024);
+  if (cuda_error == cudaSuccess) cuda_error = cudaMemset(test_param, 0, sizeof(float) * 1024);
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " test_param setup error: " << cuda_error << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  // arguments
+  void *args[37] = {&test_param, &O, &I, &F, &alpha,
+    &N, &K, &D, &H, &W, &WN, &HWN, &DHWN,
+    &C, &KRST, &RST, &RS, &magic_RS, &shift_RS, &S, &magic_S, &shift_S,
+    &pad_d, &pad_h, &pad_w, &str_d, &str_h, &str_w,
+    &Q, &PQ, &QN, &PQN, &MPQN, &magic_Q, &shift_Q, &magic_PQ, &shift_PQ};
+  int gridX = M * P * Q;
+  int gridY = K / 64 + (K % 64 != 0);
+  int gridZ = N / 64 + (N % 64 != 0);
+  CUresult res = cuLaunchKernel(nervana_kernels[kernel_name],
+    gridX, gridY, gridZ, 64, 1, 1, 64 * 8 * 4 + RST * 4 * 2 + 8, 0, args, NULL);
+  if (res != CUDA_SUCCESS) {
+    std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  cuCtxSynchronize();
+  // output test_param: copy first, the host buffer holds nothing meaningful before that
+  float h_test[64];
+  cuda_error = cudaMemcpy(h_test, test_param, sizeof(h_test), cudaMemcpyDeviceToHost);
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl;
+    exit(1);
+  }
+  for (int i = 0; i < 64; ++i) {
+    std::cout << h_test[i] << " ";
+  }
+  std::cout << std::endl;
+  // free test_param
+  cudaFree(test_param);
+  return true;
+}
+
+// Launch sconv_fprop_K128_N128 (grid: MPQ x ceil(K/128) x ceil(N/128), 256 threads per block) and
+// print the first 128 floats of its debug buffer. Returns false if setup or the launch fails.
+bool fprop_K128_N128(const float *I, const float *F, float *O,
+  unsigned int N, unsigned int C, unsigned int K,
+  unsigned int D, unsigned int H, unsigned int W,
+  unsigned int R, unsigned int S, unsigned int T,
+  unsigned int M, unsigned int P, unsigned int Q,
+  unsigned int str_d, unsigned int str_h, unsigned int str_w,
+  unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) {
+  std::string kernel_name = "sconv_fprop_K128_N128";
+  float alpha = 1.0f;
+  unsigned int magic_RS, shift_RS, magic_S, shift_S;
+  unsigned int magic_Q, shift_Q, magic_PQ, shift_PQ;
+  // input
+  unsigned int WN = W * N;
+  unsigned int HWN = H * WN;
+  unsigned int DHWN = D * HWN;
+  // filter
+  unsigned int RS = R * S;
+  unsigned int RST = T * RS;
+  unsigned int KRST = K * RST;
+  // output
+  unsigned int QN = Q * N;
+  unsigned int PQ = P * Q;
+  unsigned int PQM = PQ * M;
+  unsigned int PQN = P * QN;
+  unsigned int MPQN = M * PQN;
+  // magic numbers
+  magic32(PQ, Q, magic_Q, shift_Q);
+  magic32(PQM, PQ, magic_PQ, shift_PQ);
+  magic32(RST + 32, RS, magic_RS, shift_RS);
+  magic32(RS + 32, S, magic_S, shift_S);
+  // test param set up: zero-filled device buffer handed to the kernel as its first argument
+  float *test_param = NULL;
+  cudaError_t cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024);
+  if (cuda_error == cudaSuccess) cuda_error = cudaMemset(test_param, 0, sizeof(float) * 1024);
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " test_param setup error: " << cuda_error << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  // arguments
+  void *args[37] = {&test_param, &O, &I, &F, &alpha,
+    &N, &K, &D, &H, &W, &WN, &HWN, &DHWN,
+    &C, &KRST, &RST, &RS, &magic_RS, &shift_RS, &S, &magic_S, &shift_S,
+    &pad_d, &pad_h, &pad_w, &str_d, &str_h, &str_w,
+    &Q, &PQ, &QN, &PQN, &MPQN, &magic_Q, &shift_Q, &magic_PQ, &shift_PQ};
+  int gridX = M * P * Q;
+  int gridY = K / 128 + (K % 128 != 0);
+  int gridZ = N / 128 + (N % 128 != 0);
+  CUresult res = cuLaunchKernel(nervana_kernels[kernel_name],
+    gridX, gridY, gridZ, 256, 1, 1, 128 * 8 * 4 + RST * 4 * 2 + 8, 0, args, NULL);
+  if (res != CUDA_SUCCESS) {
+    std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  cuCtxSynchronize();
+  // output test_param: copy first, the host buffer holds nothing meaningful before that
+  float h_test[128];
+  cuda_error = cudaMemcpy(h_test, test_param, sizeof(h_test), cudaMemcpyDeviceToHost);
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl;
+    exit(1);
+  }
+  for (int i = 0; i < 128; ++i) {
+    std::cout << h_test[i] << " ";
+  }
+  std::cout << std::endl;
+  // free test_param
+  cudaFree(test_param);
+  return true;
+}
+
+// Test driver: runs one forward convolution on synthetic data and prints up to 100 outputs.
+// Exit status is 0 on success and 1 when allocation, a copy, kernel loading or the launch fails.
+int main() {
+  // init
+  cudaFree(0);
+  // params
+  float *d_I = NULL, *d_F = NULL, *d_O = NULL;
+  unsigned int N = 128, C = 1, K = 128, D = 1, H = 5, W = 5, T = 1, R = 5, S = 5;
+  unsigned int str_d = 1, str_h = 1, str_w = 1;
+  unsigned int pad_d = 0, pad_h = 0, pad_w = 0;
+  unsigned int M = (D - T + 2 * pad_d) / str_d + 1;
+  unsigned int P = (H - R + 2 * pad_h) / str_h + 1;
+  unsigned int Q = (W - S + 2 * pad_w) / str_w + 1;
+  const size_t n_I = (size_t)C * D * H * W * N;  // element counts, shared by sizes and loop bounds
+  const size_t n_F = (size_t)C * R * S * T * K;
+  const size_t n_O = (size_t)K * M * P * Q * N;
+  // host memory
+  float *h_I = (float *)malloc(n_I * sizeof(float));
+  float *h_F = (float *)malloc(n_F * sizeof(float));
+  float *h_O = (float *)malloc(n_O * sizeof(float));
+  bool ok = h_I && h_F && h_O;
+  for (size_t i = 0; ok && i < n_I; ++i) {
+    h_I[i] = (float)(i % N);  // each run of N inputs holds 0 .. N-1
+  }
+  for (size_t i = 0; ok && i < n_F; ++i) {
+    h_F[i] = 1;
+  }
+  // device memory
+  ok = ok && cudaMalloc((void**)&d_I, sizeof(float) * n_I) == cudaSuccess;
+  ok = ok && cudaMalloc((void**)&d_F, sizeof(float) * n_F) == cudaSuccess;
+  ok = ok && cudaMalloc((void**)&d_O, sizeof(float) * n_O) == cudaSuccess;
+  // memcpy h_I, h_F
+  ok = ok && cudaMemcpy(d_I, h_I, sizeof(float) * n_I, cudaMemcpyHostToDevice) == cudaSuccess;
+  ok = ok && cudaMemcpy(d_F, h_F, sizeof(float) * n_F, cudaMemcpyHostToDevice) == cudaSuccess;
+  if (!ok) {
+    std::cerr << "Line " << __LINE__ << " allocation or host-to-device copy failed" << std::endl;
+    exit(1);
+  }
+  // load kernels
+  if (!load_kernels("./")) {
+    std::cerr << "Couldn't load all kernels" << std::endl;
+    exit(1);
+  }
+  // launch kernel: K <= 64 uses the 64x64 kernel, anything larger the 128x128 one
+  bool launched = (K <= 64)
+    ? fprop_K64_N64(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)
+    : fprop_K128_N128(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w);
+  if (!launched) {
+    std::cerr << "Launch error" << std::endl;
+  }
+  // output
+  std::cout << "Result" << std::endl;
+  cudaError_t cuda_error = cudaMemcpy(h_O, d_O, sizeof(float) * n_O, cudaMemcpyDeviceToHost);
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl;
+    exit(1);
+  }
+  for (size_t i = 0; i < n_O && i < 100; ++i) {
+    std::cout << h_O[i] << " ";
+  }
+  std::cout << std::endl;
+  // free memory
+  free(h_O);
+  free(h_I);
+  free(h_F);
+  cudaFree(d_I);
+  cudaFree(d_F);
+  cudaFree(d_O);
+  std::cout << "finish" << std::endl;
+  return launched ? 0 : 1;  // a failed launch no longer reports success
+}
diff --git a/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.cu b/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.cu
new file mode 100644
index 0000000..91f813d
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.cu
@@ -0,0 +1,48 @@
+extern "C" // C linkage, so the symbol name is not mangled
+__global__ void sconv_fprop_K128_N128 (
+	float* param_test, // parameter order mirrors the host-side args[] array (test buffer first)
+	float *param_O,
+	const float *param_I,
+	const float *param_F,
+	float param_alpha,
+	int param_N,
+	int param_K,
+	int param_D,
+	int param_H,
+	int param_W,
+	int param_WN,
+	int param_HWN,
+	int param_DHWN,
+	int param_C,
+	int param_KRST,
+	int param_RST,
+	int param_RS,
+	int param_magic_RS,
+	int param_shift_RS,
+	int param_S,
+	int param_magic_S,
+	int param_shift_S,
+	int param_pad_d,
+	int param_pad_h,
+	int param_pad_w,
+	int param_str_d,
+	int param_str_h,
+	int param_str_w,
+	int param_Q,
+	int param_PQ,
+	int param_QN,
+	int param_PQN,
+	int param_MPQN,
+	int param_magic_Q,
+	int param_shift_Q,
+	int param_magic_PQ,
+	int param_shift_PQ) {
+	__shared__ float share[128 * 8 * 4 + 8]; // 4104 floats of static shared memory
+	// NOTE(review): the body below looks like a placeholder; presumably the matching .sass supplies the real code - confirm.
+	int tid = threadIdx.x;
+
+	share[tid] = 1;
+	// NOTE(review): no __syncthreads() before reading another thread's slot, and 127-tid is negative for tid > 127.
+	*param_O = share[127-tid];
+	*param_test = share[127-tid];
+}
diff --git a/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.sass b/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.sass
new file mode 100644
index 0000000..6582360
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.sass
@@ -0,0 +1,791 @@
+# Kernel: sconv_fprop_K128_N128
+// debug:
+// mode1
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//-:-:-:-:00 SHL tmp_shl, tid, 0x2;
+//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0;
+//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1;
+//-:-:-:-:00 I2F.F32.U32 rst, rst;
+//-:-:-:-:00 ST.E [tmp_param00], rst;
+//-:-:-:-:00 EXIT;
+
+// mode2
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//
+//-:-:-:-:00 MOV32I k, 0x40000000;
+//-:-:-:-:00 ST.E [tmp_param0], k;
+//-:-:-:-:00 EXIT;
+
+// modify steps:
+// XMAD->IMAD
+// shared memory addresses->RZ
+// LDG->LD
+// LEA->MOV, IADD, SHL
+// XMAD.LO2C->IMAD.U32.U32
+// XMAD.PSL->IMAD.U32.U32
+// VMAD->IMAD, IADD
+// MOV->MOV32I
+// IADD3->IADD, IADD
+// POPC
+// ST.CG->ST
+// control code
+// comments
+// LDS.U->LDS
+// register<0-7>->register<0-3>, register<4-7>
+// avoid register conflicts
+
+// optimization steps:
+// alexnet2
+// initial->1200
+// bank conflict->1288
+// alignment+dual issue+reuse->1600
+// half ldg.128->1700
+// all ldg.128->1777
+// control codes->1900
+// scheduling->1937
+// reduce unnecessary instructions->2100
+
+<CONSTANT_MAPPING>
+    szShareF  : (128*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<128*8*2 + 128*8*2 + 0>
+    addr_m     : 4x<128*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<128*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<128*8*2 + 128*8*2 + 6>
+    addr_szLut : 4x<128*8*2 + 128*8*2 + 7>
+    addr_lut   : 4x<128*8*2 + 128*8*2 + 8>
+
+    param_test[0]  : c[0x0][0x140]
+    param_test[1]  : c[0x0][0x144]
+    param_O[0]     : c[0x0][0x148]
+    param_O[1]     : c[0x0][0x14c]
+    param_I[0]     : c[0x0][0x150]
+    param_I[1]     : c[0x0][0x154]
+    param_F[0]     : c[0x0][0x158]
+    param_F[1]     : c[0x0][0x15c]
+    param_alpha    : c[0x0][0x160]
+    param_N        : c[0x0][0x164]
+    param_K        : c[0x0][0x168]
+    param_D        : c[0x0][0x16c]
+    param_H        : c[0x0][0x170]
+    param_W        : c[0x0][0x174]
+    param_WN       : c[0x0][0x178]
+    param_HWN      : c[0x0][0x17c]
+    param_DHWN     : c[0x0][0x180]
+    param_C        : c[0x0][0x184]
+    param_KRST     : c[0x0][0x188]
+    param_RST      : c[0x0][0x18c]
+    param_RS       : c[0x0][0x190]
+    param_magic_RS : c[0x0][0x194]
+    param_shift_RS : c[0x0][0x198]
+    param_S        : c[0x0][0x19c]
+    param_magic_S  : c[0x0][0x1a0]
+    param_shift_S  : c[0x0][0x1a4]
+    param_pad_d    : c[0x0][0x1a8]
+    param_pad_h    : c[0x0][0x1ac]
+    param_pad_w    : c[0x0][0x1b0]
+    param_str_d    : c[0x0][0x1b4]
+    param_str_h    : c[0x0][0x1b8]
+    param_str_w    : c[0x0][0x1bc]
+    param_Q        : c[0x0][0x1c0]
+    param_PQ       : c[0x0][0x1c4]
+    param_QN       : c[0x0][0x1c8]
+    param_PQN      : c[0x0][0x1cc]
+    param_MPQN     : c[0x0][0x1d0]
+    param_magic_Q  : c[0x0][0x1d4]
+    param_shift_Q  : c[0x0][0x1d8]
+    param_magic_PQ : c[0x0][0x1dc]
+    param_shift_PQ : c[0x0][0x1e0]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-67 : mpq<0-3>
+    64-67 : m, p, q, tidY
+    68-72 : blkF, blkI, blkMPQ, tid1, tidX
+    73-95 ~ pq, mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST
+
+    0-63 : czero<00-63>
+
+   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
+   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
+   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
+   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
+   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
+  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
+  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
+  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7
+
+      64-67 : j0Fy<0-3>
+      68-71 : j0Ix<0-3>
+      72-75 : j0Fy<4-7>
+      76-79 : j0Ix<4-7>
+      80-83 : j1Fy<0-3>
+      84-87 : j1Ix<0-3>
+      88-91 : j1Fy<4-7>
+      92-95 : j1Ix<4-7>
+
+      96-97 : trackI<0-1>
+      98-99 : trackF<0-1>
+
+    100-103 : loadI<0-3>
+    104-107 : loadF<0-3>
+    109 : readFs
+    108 : readIs
+    
+    110-114 ~ offsetIn, offsetFk, posCRST, lutSize, lutSizeRcp
+    115-120 ~ writeS, posCRSTf, channel, lutOffset, offsetI, offsetF
+    116-120 ~ tid128, tid, p_and
+    121 : tmp_shl
+
+    122-123 : sliceI, sliceF
+    122-123 : sliceIF<0-1>
+    124-125 ~ offsetIc, offsetFc
+    124-125 : tmp_param<0-1>
+    124-127 ~ addressF0, addressF1, addressI0, addressI1
+
+    72-79 : cs<0-7>
+    80-81 : Out<0-1>
+
+    82-120  ~ writeCs, readCs, alpha, tidOX, tidOX2, tidOY, to, k, n, MPQN1, MPQN60, MPQN, MPQN4
+
+</REGISTER_MAPPING>
+
+-:-:-:-:00 S2R tid,    SR_TID.X;
+-:-:-:-:00 S2R blkF,   SR_CTAID.Y; #K128
+-:-:-:-:00 S2R blkI,   SR_CTAID.Z; #N128
+-:-:-:-:00 S2R blkMPQ, SR_CTAID.X; # m,p,q stored in x index
+
+// if tid >= 32
+//    P0 = true
+-:-:-:-:00 ISETP.GE.AND P0, PT, tid, 32, PT;
+
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15;
+</CODE>
+
+// tid <= 127
+// tidX = (tid & 31) << 2
+// tidX = 0 : 4 : 128
+// tidY = tid >> 5
+// tidY = 0 : 1 : 7
+-:-:-:-:00 LOP.AND tidX, tid,  31;
+-:-:-:-:00 SHL     tidX, tidX, 2;
+-:-:-:-:00 SHR.U32 tidY, tid,  5;
+
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//-:-:-:-:00 SHL tmp_shl, tid, 0x2;
+//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0;
+//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1;
+//-:-:-:-:00 I2F.F32.U32 tidY, tidY;
+//-:-:-:-:00 ST.E [tmp_param00], tidY;
+//-:-:-:-:00 EXIT;
+
+// offsetFk += blkF * 128 + tidX
+// K128
+// blkF ---- trackF
+//      tidX
+-:-:-:-:00 ISCADD offsetFk, blkF, tidX, 7;
+
+// offsetIn = blkI * 128 + tidX
+// N128
+// blkI ---- trackI
+//      tidX
+-:-:-:-:00 ISCADD offsetIn, blkI, tidX, 7;
+
+// writeS = (128 * tidY + tidX) * 4
+// tidY = 0 : 1 : 7
+// tidX = 0 : 4 : 128
+// ----------------
+// ---------------- tidY 0 : 1 : 7
+// ---- writeS
+// tidX
+-:-:-:-:00 SHR tidX, tidX, 1; 
+-:-:-:-:00 ISCADD  writeS, tidY, tidX, 7;
+-:-:-:-:00 SHL     writeS, writeS, 2;
+
+// readFs  = (((tid & 0x70) >> 3) | (tid & 1)) << 3;
+// [6][5][4][0] * 8;
+-:-:-:-:00 LOP.AND tid1,   tid,    1;
+-:-:-:-:00 LOP.AND readFs, tid,    0x70;
+-:-:-:-:00 SHR.U32 readFs, readFs, 3;
+-:-:-:-:00 LOP.OR  readFs, readFs, tid1;
+-:-:-:-:00 SHL     readFs, readFs, 3;
+
+// readIs = ((tid & 128) >> 3) | ((tid >> 1) & 7)
+// [3][2][1] * 16;
+-:-:-:-:00 LOP.AND tid128, tid, 128;
+-:-:-:-:00 SHR.U32 tid128, tid128, 3;
+-:-:-:-:00 BFE.U32 readIs, tid, 0x301; // 3 bits at position 1
+-:-:-:-:00 LOP.OR  readIs, readIs, tid128;
+-:-:-:-:00 ISCADD  readIs, readIs, 4x<szShareF>, 3;
+
+-:-:-:-:00 @P0 BRA.U END_SETUP;
+
+-:-:-:-:00 MOV rst,        tid;
+-:-:-:-:00 MOV lutStore2,  RZ;
+-:-:-:-:00 MOV lutSize,    RZ;
+-:-:-:-:00 MOV32I warp_count, 32;
+
+// m  = blkMPQ / PQ
+// pq = blkMPQ % PQ
+-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ;
+-:-:-:-:00 SHR.U32   m,  m,      param_shift_PQ;
+-:-:-:-:00 IMAD pq,  m,  param_PQ, RZ;
+-:-:-:-:00 IADD pq, -pq, blkMPQ;
+// p = pq / Q
+// q = pq % Q
+-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ;
+-:-:-:-:00 SHR.U32   p, p,  param_shift_Q;
+-:-:-:-:00 IMAD  q,  p, param_Q, RZ;
+-:-:-:-:00 IADD  q, -q, pq;
+
+// dep_thd_mask = -1
+-:-:-:-:00 MOV32I dep_thd_mask, -1;
+
+// if p is odd
+// set q = param_Q - q - 1
+// if p is even
+// q = q
+-:-:-:-:00 LOP.AND p_and, p, 1;
+-:-:-:-:00 ISETP.NE.AND P1, PT, p_and, RZ, PT;
+-:-:-:-:00 @P1 IADD q, -q, param_Q;
+-:-:-:-:00 @P1 IADD q, q, dep_thd_mask;
+
+-:-:-:-:00 STS.128 [RZ + addr_m], m;
+
+// mt = m * w - pad_d
+// pr = p * u - pad_h
+// qs = q * v - pad_w
+-:-:-:-:00 IMAD qs, q,   param_str_w, RZ;
+-:-:-:-:00 IMAD pr, p,   param_str_h, RZ;
+-:-:-:-:00 IMAD mt, m,   param_str_d, RZ;
+-:-:-:-:00 IADD qs, qs, -param_pad_w;
+-:-:-:-:00 IADD pr, pr, -param_pad_h;
+-:-:-:-:00 IADD mt, mt, -param_pad_d;
+
+// mask_shr = 32 - tid
+// dep_thd_mask = dep_thd_mask >> mask_shr
+-:-:-:-:00 IADD    mask_shr, -tid, 32;
+-:-:-:-:00 SHR.U32 dep_thd_mask, dep_thd_mask, mask_shr;
+
+LUT_LOOP:
+
+// warp synchronous loop while warp_count < RST
+-:-:-:-:00 ISETP.LT.AND P0, PT, warp_count, param_RST, PT;
+-:-:-:-:00 IADD warp_count, warp_count, 32;
+// t =  rst / RS
+// rs = rst % RS
+-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, RZ;
+-:-:-:-:00 SHR.U32   t, t, param_shift_RS;
+-:-:-:-:00 IMAD  rs, t, param_RS, RZ;
+-:-:-:-:00 IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ;
+-:-:-:-:00 SHR.U32   r, r, param_shift_S;
+-:-:-:-:00 IMAD   s, r, param_S, RZ;
+-:-:-:-:00 IADD   s, -s, rs;
+// x = qs + s
+// y = pr + r
+// z = mt + t
+-:-:-:-:00 IADD x, qs, s;
+-:-:-:-:00 IADD y, pr, r;
+-:-:-:-:00 IADD z, mt, t;
+-:-:-:-:00 ISETP.GE.AND  P4, PT, x, RZ, PT;
+-:-:-:-:00 ISETP.GE.AND  P5, PT, y, RZ, PT;
+-:-:-:-:00 ISETP.GE.AND  P6, PT, z, RZ, PT;
+-:-:-:-:00 ISETP.LT.AND  P4, PT, x, param_W, P4;
+-:-:-:-:00 ISETP.LT.AND  P5, PT, y, param_H, P5;
+-:-:-:-:00 ISETP.LT.AND  P6, PT, z, param_D, P6;
+-:-:-:-:00 PSETP.AND.AND P1, PT, P4, P5, P6;
+
+// sliceI = z*HWN + y*WN + x*N
+// rst                    N
+// --------------         --------------
+// --------------         --------------
+// --------------         --------------
+// -------------- K * rst --------------
+// --------------         --------------
+// --------------         --------------
+-:-:-:-:00 IMAD sliceI, x, param_N,   RZ;
+-:-:-:-:00 IMAD.U32.U32 sliceI, y, param_WN, sliceI;
+-:-:-:-:00 IMAD.U32.U32 sliceI, z, param_HWN, sliceI;
+
+// sliceF = rst * K
+-:-:-:-:00 IMAD sliceF, rst, param_K, RZ;
+
+// Get a mask of all valid slices in the warp
+-:-:-:-:00 VOTE.ANY ballot, PT, P1;
+// Count the total valid slices
+-:-:-:-:00 POPC warp_slices, ballot, ballot;
+// Prepare lutStore for this and next loop
+// lutStore = lutStore2
+// lutStore2 = warp_slices * 8 + lutStore2
+-:-:-:-:00 @P1 MOV lutStore, lutStore2;
+-:-:-:-:00 ISCADD lutStore2, warp_slices, lutStore2, 3;
+// Count all the valid slices below this threadid
+// bit(dep_thd_mask) = tid
+// bit(ballot) = valid tid
+// dep_thd_cnt = number of bit below ballot
+-:-:-:-:00 @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
+-:-:-:-:00 @P1 POPC dep_thd_cnt, dep_thd_bits, dep_thd_bits;
+// use the rst increment to space the barrier sync
+// rst = rst + 32
+-:-:-:-:00 IADD rst, rst, 32;
+// Update the lutStore address from this count
+// lutStore = dep_thd_cnt * 8 + lutStore
+-:-:-:-:00 @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
+// Store both slice offsets in the lut
+-:-:-:-:00 @P1 STS.64 [lutStore + addr_lut], sliceIF;
+
+// Keep track of the total size of the lut
+// lutSize = lutSize + warp_slices
+-:-:-:-:00 IADD lutSize, lutSize, warp_slices;
+
+-:-:-:-:00 @P0 BRA.U LUT_LOOP;
+
+// Share the lut size with the other warp
+-:-:-:-:00 STS [RZ + addr_szLut], lutSize;
+
+// if tid >= 32, directly enter it
+END_SETUP:
+
+-:-:-:-:00 BAR.SYNC 0;
+
+// Grab the calculated lut size and get its reciprocal
+// Get the total reduction depth
+-:-:-:-:00 LDS lutSize, [RZ + addr_szLut];
+
+// endCRST = lutSize * param_C (channel)
+-:-:-:-:00 IMAD endCRST, lutSize, param_C, RZ;
+// lutSizeRcp = 1 / lutSize
+-:-:-:-:00 I2F.F32.S32 lutSizeRcp, lutSize;
+-:-:-:-:00 MUFU.RCP lutSizeRcp, lutSizeRcp;
+
+// posCRST = endCRST - tidY - 1
+-:-:-:-:00 IADD posCRST, endCRST, -1;
+-:-:-:-:00 IADD posCRST, posCRST, -tidY;
+
+// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 8 then make a full 8 line fetch.
+-:-:-:-:00 LOP.AND partial, endCRST, 7;
+-:-:-:-:00 ISETP.EQ.AND P1, PT, RZ, partial, PT;
+// If partial == 0
+-:-:-:-:00 @P1 MOV32I partial, 8;
+// channel = lower(posCRST / lutSize)
+// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it
+-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST;
+-:-:-:-:00 FMUL channel, posCRSTf, lutSizeRcp;
+-:-:-:-:00 FFMA channel, channel, 5.9604644775390625e-08, channel;
+-:-:-:-:00 F2I.S32.F32.TRUNC channel, channel;
+
+// lutOffset = (posCRST % lutSize) * 8
+-:-:-:-:00 IMAD tmp_param0, channel, lutSize, RZ;
+-:-:-:-:00 IADD lutOffset, -tmp_param0, posCRST;
+-:-:-:-:00 SHL lutOffset, lutOffset, 3;
+// P1 = tidY < partial
+-:-:-:-:00 ISETP.LT.AND P1, PT, tidY, partial, PT;
+
+// offsetIC = channel * DHWN
+// offsetFC = channel * KRST
+-:-:-:-:00 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ;
+-:-:-:-:00 IMAD offsetFc, channel, param_KRST, RZ;
+// posCRST -= partial
+-:-:-:-:00 IADD posCRST, posCRST, -partial;
+-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut];
+
+// trackF = offsetFK + offsetFC + sliceF + param_F
+-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc;
+-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF;
+// trackI = offsetIN + offsetIC + sliceI + param_I
+-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc;
+-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI;
+
+//-:-:-:-:00 @P1 LEA      trackF0.CC, offsetF, param_F[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_F[0];
+-:-:-:-:00 MOV tmp_param1, param_F[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2;
+-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 @P1 IADD.X trackF1, RZ, tmp_param1;
+//-:-:-:-:00 @P1 LEA      trackI0.CC, offsetI, param_I[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_I[0];
+-:-:-:-:00 MOV tmp_param1, param_I[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2;
+-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 @P1 IADD.X trackI1, RZ, tmp_param1;
+
+-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF];
+-:-:-:-:00 @!P1 LDS.128 loadF0, [RZ + addr_zero];
+
+-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI];
+-:-:-:-:00 @!P1 LDS.128 loadI0, [RZ + addr_zero];
+
+-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT;
+
+-:-:-:-:00 STS.64 [writeS], loadF0;
+-:-:-:-:00 STS.64 [writeS + 4x<64>], loadF2;
+-:-:-:-:00 STS.64 [writeS + 4x<szShareF>], loadI0;
+-:-:-:-:00 STS.64 [writeS + 4x<szShareF+64>], loadI2;
+
+-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST;
+
+-:-:-:-:00 BAR.SYNC 0;
+-:-:-:-:00 LOP.XOR writeS, writeS, 4x<szShareF + szShareI>;
+
+-:-:-:-:00 LDS.64 j0Ix0, [readIs + 4x<0*128 + 00>];
+-:-:-:-:00 LDS.64 j0Ix2, [readIs + 4x<0*128 + 64>];
+-:-:-:-:00 LDS.64 j0Fy0, [readFs + 4x<0*128 + 00>];
+-:-:-:-:00 LDS.64 j0Fy2, [readFs + 4x<0*128 + 64>];
+
+-:-:-:-:00 LDS.64 j0Ix4, [readIs + 4x<0*128 + 16>];
+-:-:-:-:00 LDS.64 j0Ix6, [readIs + 4x<0*128 + 80>];
+-:-:-:-:00 LDS.64 j0Fy4, [readFs + 4x<0*128 + 32>];
+-:-:-:-:00 LDS.64 j0Fy6, [readFs + 4x<0*128 + 96>];
+
+// channel = posCRST / lutSize
+-:-:-:-:00 @P1 FMUL channel, posCRSTf, lutSizeRcp;
+-:-:-:-:00 @P1 FFMA channel, channel, 5.9604644775390625e-08, channel;
+-:-:-:-:00 @P1 F2I.S32.F32.TRUNC channel, channel;
+// lutOffset = (posCRST % lutSize) * 8
+-:-:-:-:00 @P1 IMAD tmp_param0, channel, lutSize, RZ;
+-:-:-:-:00 @P1 IADD lutOffset, -tmp_param0, posCRST;
+-:-:-:-:00 @P1 SHL lutOffset, lutOffset, 3;
+// offsetIC = channel * DHWN
+// offsetFC = channel * KRST
+-:-:-:-:00 @P1 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ;
+-:-:-:-:00 @P1 IMAD      offsetFc, channel, param_KRST, RZ;
+
+-:-:-:-:00 IADD posCRST, posCRST, -8;
+-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut];
+
+// trackI = offsetIN + offsetIC + sliceI + param_I
+// trackF = offsetFK + offsetFC + sliceF + param_F
+-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc;
+-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF;
+-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc;
+-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI;
+
+//-:-:-:-:00 @P1 LEA      trackF0.CC, offsetF, param_F[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_F[0];
+-:-:-:-:00 MOV tmp_param1, param_F[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2;
+-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 @P1 IADD.X trackF1, RZ, tmp_param1;
+
+//-:-:-:-:00 @P1 LEA      trackI0.CC, offsetI, param_I[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_I[0];
+-:-:-:-:00 MOV tmp_param1, param_I[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2;
+-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 @P1 IADD.X trackI1, RZ, tmp_param1;
+
+-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF];
+-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI];
+
+-:-:-:-:00 MOV addressF0, param_F[0];
+-:-:-:-:00 MOV addressF1, param_F[1];
+-:-:-:-:00 MOV addressI0, param_I[0];
+-:-:-:-:00 MOV addressI1, param_I[1];
+
+LOOP:
+
+<CODE>
+    # Non-FFMA instructions that are woven into the unrolled main loop.
+    # A key "j<J>c<C>" places its instruction directly after FFMA number C
+    # (0..63) of unroll step J (0..7); the emit loop looks the key up as
+    # $insert{"j${j}c$c"}.  Slots without an entry get either nothing or a NOP.
+    #
+    # Predicates used by the inserted code:
+    #   P1 = posCRST >=  0  -> guards the address math and the next global loads
+    #   P0 = posCRST >= -8  -> guards the shared stores, buffer flips, barrier
+    #                          and the branch back to LOOP
+    my %insert =
+    (
+        # j0: refresh both predicates, then start channel = posCRST / lutSize
+        # (computed in float as posCRSTf * lutSizeRcp).
+        j0c47 => "-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c53 => "-:-:-:-:00 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+        #warps
+        j0c62 => "-:-:-:-:00 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+        #2
+        j0c63 => "-:-:-:-:00 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+
+        # j1: add a relative epsilon (channel * 2^-24) before truncating to int.
+        j1c47 => "-:-:D:-:05 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j1c63 => "-:-:-:-:00 \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        # j2: lutOffset = (posCRST - channel * lutSize) * 8, per-channel base
+        # offsets for F and I, then step posCRST back by 8.
+        j2c47 => "-:-:-:-:00 \@P1 IMAD lutOffset, -channel, lutSize, posCRST;\n",
+        j2c53 => "-:-:-:-:00 \@P1 IMAD offsetF, channel, param_KRST, offsetFk;\n",
+        j2c61 => "-:-:-:-:00 \@P1 IMAD offsetI, channel, param_DHWN, offsetIn;\n",
+        j2c62 => "-:-:-:-:00 \@P1 SHL lutOffset, lutOffset, 3;\n",
+        j2c63 => "-:-:-:-:00 IADD posCRST, posCRST, -8;\n",
+
+        # j3/j4: fetch the (sliceI, sliceF) pair from the LUT and write the
+        # previously loaded I/F data into the shared write buffer.
+        # NOTE(review): TEXDEPBAR presumably waits for the outstanding LDG loads
+        # before the STS instructions consume loadI/loadF -- confirm.
+        j3c47 => "-:-:-:-:00 \@P1 LDS.64 sliceIF, [lutOffset + addr_lut];\n",
+        j3c53 => "-:-:-:-:00 TEXDEPBAR 0x0;\n",
+        j3c61 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<szShareF>], loadI0;\n",
+        j3c62 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<szShareF + 64>], loadI2;\n",
+        j3c63 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<0>], loadF0;\n",
+
+        j4c47 => "-:-:D:S:00 \@P0 STS.64 [writeS + 4x<64>], loadF2;\n",
+        # j4/j5: add the slice offsets and build the 64-bit global addresses
+        # trackF / trackI = base address + (offset << 2), with carry into the
+        # high word via .CC / IADD.X.
+        j4c53 => "-:-:-:-:00 \@P1 IADD offsetF, offsetF, sliceF;\n",
+        #5
+        j4c61 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetF, 0x2;\n",
+        j4c62 => "-:-:-:-:00 \@P1 IADD offsetI, offsetI, sliceI;\n",
+        j4c63 => "-:-:-:-:00 \@P1 IADD trackF0.CC, tmp_shl, addressF0;\n",
+
+        j5c47 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetI, 0x2;\n",
+        j5c53 => "-:-:-:-:00 \@P1 IADD trackI0.CC, tmp_shl, addressI0;\n",
+        j5c61 => "-:-:-:-:00 \@P1 IADD.X trackF1, RZ, addressF1;\n",
+        j5c62 => "-:-:-:-:00 \@P1 IADD.X trackI1, RZ, addressI1;\n",
+
+        # j6: issue the next global loads, flip the shared read pointers to the
+        # other buffer and synchronise the block.
+        j6c47 => "-:G:D:-:00 \@P1 LDG.E.128 loadI0, [trackI];\n",
+        j6c53 => "-:G:D:-:00 \@P1 LDG.E.128 loadF0, [trackF];\n",
+        j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readIs, readIs, 4x<szShareF + szShareI>;\n",
+        j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readFs, readFs, 4x<szShareF + szShareI>;\n",
+        j6c63 => "T:-:D:S:00 \@P0 BAR.SYNC 0;\n",
+
+        # j7: flip the shared write pointer and loop while P0 holds.
+        j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<szShareF + szShareI>;\n",
+        j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n",
+    );
+
+    # FFMA issue order for one unroll step: entry c is the [x, y] pair naming
+    # the accumulator cx<x>y<y> that FFMA number c updates.  The 64 pairs cover
+    # the 8x8 accumulator tile exactly once.
+    my @cOrder = (
+        [0,0], [0,1], [1,1], [2,0], [1,0], [2,1],
+        [2,3], [2,2], [1,2], [0,3], [1,3], [0,2],
+        [0,4], [0,5], [1,5], [2,4], [1,4], [2,5],
+        [2,7], [2,6], [1,6], [0,7], [1,7], [0,6],
+        [3,6], [3,7], [4,7], [5,6], [4,6], [5,7],
+        [5,5], [5,4], [4,4], [3,5], [4,5], [3,4],
+        [3,2], [3,3], [4,3], [5,2], [4,2], [5,3],
+        [5,1], [5,0], [4,0], [3,1], [4,1], [3,0],
+        [6,0], [7,0], [7,1], [6,2], [6,1], [7,2],
+        [7,5], [6,5], [6,4], [7,3], [7,4], [6,3],
+        [6,6], [6,7], [7,7], [7,6],
+    );
+
+    # Emit the body of LOOP: 8 unroll steps (j) of 64 FFMAs (c) each, with the
+    # instructions from %insert placed after the FFMA slot named by their key.
+    # Returns the generated SASS text.
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        # The FFMAs of step j read the j<odd> register set while the LDS
+        # prefetches below fill the other set (j<nOdd>) for the next step.
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        # Shared-memory row (in units of 128 words) read for the next step;
+        # wraps to row 0 after the last step.
+        my $rsOffset = ($j + 1) % 8;
+        # Only the prefetch issued in the last step is predicated on P0.
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        # Per-step shared-memory prefetch of the next I and F operands, pinned
+        # to fixed FFMA slots (5, 11, 17, 23, 29, 35, 41 and 59).
+        $insert{"j${j}c5"}  = sprintf "-:G:D:-:00 %s LDS.64 j%dIx0, [readIs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c11"} = sprintf "-:G:D:-:00 %s LDS.64 j%dIx2, [readIs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c17"} = sprintf "-:G:D:-:00 %s LDS.64 j%dIx4, [readIs + 4x<%d*128 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c59"} = sprintf "-:G:D:-:00 %s LDS.64 j%dIx6, [readIs + 4x<%d*128 + 80>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c23"} = sprintf "-:G:D:-:00 %s LDS.64 j%dFy0, [readFs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c29"} = sprintf "-:G:D:-:00 %s LDS.64 j%dFy2, [readFs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c35"} = sprintf "-:G:D:-:00 %s LDS.64 j%dFy4, [readFs + 4x<%d*128 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c41"} = sprintf "-:G:D:-:00 %s LDS.64 j%dFy6, [readFs + 4x<%d*128 + 96>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            # Accumulator cx<x>y<y> updated by FFMA number c.
+            my ($x,$y) = @{$cOrder[$c]};
+
+            # Instruction scheduled after this FFMA, or '' if the slot is free.
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            # Placeholder only: $ctrl is always overwritten below.
+            my $ctrl   = "-:-:-:-:00";
+
+            # Every sixth slot starting at 5 (5, 11, ..., 59) and slot 63 must
+            # hold an instruction; pad with a NOP when nothing is scheduled.
+            if ((($c - 5) % 6 == 0 || $c == 63) && !$ins) {
+              $ins = "-:G:D:-:00 NOP;\n";   
+            }
+
+            # Remaining free slots at the end of the step (61, 62) are padded
+            # with a NOP carrying stall count 07.
+            if ($c > 60 && !$ins){
+              $ins = "-:-:D:-:07 NOP;\n";
+            }
+
+            # 04 and 05 are dual issued
+            # An FFMA followed by an inserted instruction always uses 04;
+            # otherwise slots with c % 6 == 1 or c % 6 == 3 use 04 and the
+            # rest use 05.
+            if($ins) {
+              $ctrl = "-:-:D:-:04";
+            } else {
+              if(($c - 1) % 6 == 0 || ($c - 3) % 6 == 0){
+                $ctrl = "-:-:D:-:04";
+              }
+              else{
+                $ctrl = "-:-:D:-:05";
+              }
+            }
+
+            # cx<x>y<y> += j<odd>Ix<x> * j<odd>Fy<y>, followed by the insert.
+            $out .= sprintf "%s FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+// ---- Epilogue setup: runs once after LOOP exits, before the results are
+// ---- scaled and written out through STORE_C.
+// Reload the 128-bit record that the setup code stored at addr_m
+// (STS.128 [RZ + addr_m], m).
+-:-:-:-:00 LDS.128 mpq, [RZ + addr_m];
+-:-:-:-:00 S2R tid,  SR_TID.X; // 0-127
+-:-:-:-:00 S2R blkI, SR_CTAID.Z; // N id
+-:-:-:-:00 S2R blkF, SR_CTAID.Y; // K id
+
+// tidOX = ((tid & 7) << 2) | ((tid & 128) >> 1)
+// tidOY = (tid & 127) >> 3
+-:-:-:-:00 LOP.AND tidOX,  tid,    7;
+-:-:-:-:00 SHL     tidOX,  tidOX,  2;
+-:-:-:-:00 LOP.AND tidOX2, tid,    128;
+-:-:-:-:00 SHR.U32 tidOX2, tidOX2, 1;
+-:-:-:-:00 LOP.OR  tidOX,  tidOX,  tidOX2;
+-:-:-:-:00 LOP.AND tidOY,  tid,    127;
+-:-:-:-:00 SHR.U32 tidOY,  tidOY,  3;
+
+// Double both shared read offsets, then keep only their low bits
+// (9 bits for readIs, 8 bits for readFs).
+// NOTE(review): presumably the masks strip the buffer base / double-buffer
+// bits so only the per-thread offset remains -- confirm against szShareF/szShareI.
+-:-:-:-:00 SHL readFs, readFs, 1;
+-:-:-:-:00 SHL readIs, readIs, 1;
+-:-:-:-:00 LOP.AND readIs, readIs, 0x1ff;
+-:-:-:-:00 LOP.AND readFs, readFs, 0x0ff;
+
+// Div by 4 here collapses k stride
+// writeCs = readFs * 32 + readIs;   (ISCADD shift of 5)
+-:-:-:-:00 ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+-:-:-:-:00 ISCADD readCs, tidOY, tidOX, 7;
+-:-:-:-:00 SHL    readCs, readCs, 2;
+
+// n = blkI * 128 + tidOX;
+-:-:-:-:00 ISCADD n, blkI, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = blkF * 128 + tidOY * 4
+-:-:-:-:00 SHL    tidOY,   tidOY, 2;
+-:-:-:-:00 ISCADD k, blkF, tidOY, 7;
+
+// o = k*MPQN + m*PQN + p*QN + q*N + n
+-:-:-:-:00 IMAD      to, q, param_N,    n;
+-:-:-:-:00 IMAD.U32.U32 to, p, param_QN,   to;
+-:-:-:-:00 IMAD.U32.U32 to, m, param_PQN,  to;
+-:-:-:-:00 IMAD.U32.U32 to, k, param_MPQN, to;
+// Out = param_O + (o << 2): 64-bit address built from the 32-bit element
+// offset, with the carry propagated into the high word.
+-:-:-:-:00 MOV tmp_param0, param_O[0];
+-:-:-:-:00 MOV tmp_param1, param_O[1];
+-:-:-:-:00 SHL tmp_shl, to, 0x2;
+-:-:-:-:00 IADD Out0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X Out1, RZ, tmp_param1;
+
+// Byte strides along k used while storing:
+// MPQN1  = MPQN * 4               (one k row of floats)
+// MPQN4  = MPQN * 16
+// MPQN60 = MPQN * 256 - MPQN * 16 = MPQN * 240   (60 k rows of floats)
+-:-:-:-:00 MOV  MPQN,  param_MPQN;
+-:-:-:-:00 SHL  MPQN1, MPQN, 2;
+-:-:-:-:00 SHL  MPQN4, MPQN, 4;
+-:-:-:-:00 ISCADD MPQN60, MPQN, -MPQN4, 8;
+
+// P0 / P1 are the n-bounds predicates that STORE_C combines with k < K.
+-:-:-:-:00 ISETP.LT.AND P0, PT, n, param_N, PT; // n +  0 < N
+-:-:-:-:00 IADD n, n, 32;
+-:-:-:-:00 ISETP.LT.AND P1, PT, n, param_N, PT; // n + 32 < N
+
+-:-:-:-:00 MOV alpha, param_alpha;
+
+-:-:-:-:00 BAR.SYNC 0;
+
+<CODE>
+
+    # Emit the output phase: for each of the 8 accumulator rows y, scale
+    # cx0..7y<y> by alpha into cs0..7 and call STORE_C.  Before row 4 the
+    # output pointer and k jump ahead by 60 rows; the IADD.X that finishes the
+    # 64-bit pointer add is emitted after the FMULs.
+    my $out;
+    foreach my $y (0..7)
+    {
+        # The eight alpha-scaling FMULs for row y, one line per x.
+        my $scale = join '', map { "-:-:-:-:00 FMUL cs$_, cx${_}y$y, alpha;\n" } 0 .. 7;
+
+        if ($y == 4)
+        {
+            $out .= "-:-:-:-:00 IADD Out0.CC, Out0, MPQN60;\n"
+                  . "-:-:-:-:00 IADD k, k, 60;\n"
+                  . $scale
+                  . "-:-:-:-:00 IADD.X Out1, Out1, RZ;\n\n";
+        }
+        else
+        {
+            $out .= $scale . "\n";
+        }
+
+        $out .= "-:-:-:-:00 CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+-:-:-:-:00 EXIT;
+
+// STORE_C: write one row of scaled results (cs0-cs7) to global memory.
+// Reached via CAL from the generated output code, once per accumulator row.
+// Inputs : cs0-cs7, k, Out, writeCs, readCs, MPQN1, P0 (n + 0 < N), P1 (n + 32 < N)
+// Effects: k is incremented by 1 and Out is advanced by MPQN1 bytes so the
+//          next call targets the following k row; P2 and P3 are overwritten.
+STORE_C:
+
+-:-:-:-:00 ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n +  0 < N
+-:-:-:-:00 ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n + 32 < N
+-:-:-:-:00 IADD k, k, 1;
+
+// Round-trip through shared memory (write at writeCs, read back at readCs)
+// to drop the awkward readAs/readBs mapping
+-:-:-:-:00 STS.128 [writeCs + 4x<00>], cs0;
+-:-:-:-:00 STS.128 [writeCs + 4x<32>], cs4;
+-:-:-:-:00 LDS.128 cs0, [readCs + 4x<00>];
+-:-:-:-:00 LDS.128 cs4, [readCs + 4x<32>];
+
+// Store results back to global
+-:-:-:-:00 @P2 ST.E.128 [Out + 4x<00>], cs0;
+-:-:-:-:00 @P3 ST.E.128 [Out + 4x<32>], cs4;
+
+// Advance the 64-bit output pointer by one k row (MPQN1 bytes), with carry.
+-:-:-:-:00 IADD   Out0.CC, Out0, MPQN1;
+-:-:-:-:00 IADD.X Out1,    Out1, RZ;
+
+-:-:-:-:00 RET;
+
diff --git a/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.cu b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.cu
new file mode 100644
index 0000000..ebfa963
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.cu
@@ -0,0 +1,48 @@
+// Placeholder CUDA definition of sconv_fprop_K64_N64.
+// The parameter list fixes the kernel signature: its order matches the
+// param_* constant-bank offsets listed in the CONSTANT_MAPPING section of
+// sconv_fprop_K64_N64.sass (param_test at c[0x0][0x140], param_O at 0x148, ...).
+// NOTE(review): presumably this file is compiled only to obtain a binary with
+// the right signature and shared-memory size, and the trivial body below is
+// then replaced by the assembled SASS -- confirm against the build scripts.
+extern "C"
+__global__ void sconv_fprop_K64_N64 (
+    float* param_test,
+    float *param_O,
+    const float *param_I,
+    const float *param_F,
+    float param_alpha,
+    int param_N,
+    int param_K,
+    int param_D,
+    int param_H,
+    int param_W,
+    int param_WN,
+    int param_HWN,
+    int param_DHWN,
+    int param_C,
+    int param_KRST,
+    int param_RST,
+    int param_RS,
+    int param_magic_RS,
+    int param_shift_RS,
+    int param_S,
+    int param_magic_S,
+    int param_shift_S,
+    int param_pad_d,
+    int param_pad_h,
+    int param_pad_w,
+    int param_str_d,
+    int param_str_h,
+    int param_str_w,
+    int param_Q,
+    int param_PQ,
+    int param_QN,
+    int param_PQN,
+    int param_MPQN,
+    int param_magic_Q,
+    int param_shift_Q,
+    int param_magic_PQ,
+    int param_shift_PQ) {
+      // 64*8*4 floats plus 8 extra words; the .sass maps addr_zero, addr_m,
+      // addr_p, addr_q and addr_szLut into those extra words.
+      // NOTE(review): addr_lut is mapped to word 64*8*4 + 8, i.e. just past
+      // the end of this array -- confirm where the LUT storage is reserved.
+      __shared__ float share[64 * 8 * 4 + 8];
+
+      int tid = threadIdx.x;
+
+      // Touch shared memory and both output pointers so they are referenced.
+      // NOTE(review): there is no barrier between the write and the reads, and
+      // 63 - tid is negative for tid > 63; harmless only if this body never
+      // actually runs -- confirm.
+      share[tid] = 1;
+
+      *param_O = share[63-tid];
+      *param_test = share[63-tid];
+    }
diff --git a/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.sass b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.sass
new file mode 100644
index 0000000..8db0438
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.sass
@@ -0,0 +1,782 @@
+# Kernel: sconv_fprop_K64_N64
+// debug:
+// mode1
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//-:-:-:-:00 SHL tmp_shl, tid, 0x2;
+//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0;
+//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1;
+//-:-:-:-:00 I2F.F32.U32 rst, rst;
+//-:-:-:-:00 ST.E [tmp_param00], rst;
+//-:-:-:-:00 EXIT;
+
+// mode2
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//
+//-:-:-:-:00 MOV32I k, 0x40000000;
+//-:-:-:-:00 ST.E [tmp_param0], k;
+//-:-:-:-:00 EXIT;
+
+// modify steps:
+// XMAD->IMAD
+// shared memory addresses->RZ
+// LDG->LD
+// LEA->MOV, IADD, SHL
+// XMAD.LO2C->IMAD.U32.U32
+// XMAD.PSL->IMAD.U32.U32
+// VMAD->IMAD, IADD
+// MOV->MOV32I
+// IADD3->IADD, IADD
+// POPC
+// ST.CG->ST
+// control code
+// comments
+// LDS.U->LDS
+// register<0-7>->register<0-3>, register<4-7>
+// avoid register conflicts
+
+// optimization steps:
+// alexnet2
+// initial->1200
+// bank conflict->1288
+// alignment+dual issue+reuse->1600
+// half ldg.128->1700
+// all ldg.128->1777
+// control codes->1900
+// scheduling->1937
+// reduce unnecessary instructions->2100
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<64*8*4 + 0>
+    addr_m     : 4x<64*8*4 + 4>
+    addr_p     : 4x<64*8*4 + 5>
+    addr_q     : 4x<64*8*4 + 6>
+    addr_szLut : 4x<64*8*4 + 7>
+    addr_lut   : 4x<64*8*4 + 8>
+
+    param_test[0]  : c[0x0][0x140]
+    param_test[1]  : c[0x0][0x144]
+    param_O[0]     : c[0x0][0x148]
+    param_O[1]     : c[0x0][0x14c]
+    param_I[0]     : c[0x0][0x150]
+    param_I[1]     : c[0x0][0x154]
+    param_F[0]     : c[0x0][0x158]
+    param_F[1]     : c[0x0][0x15c]
+    param_alpha    : c[0x0][0x160]
+    param_N        : c[0x0][0x164]
+    param_K        : c[0x0][0x168]
+    param_D        : c[0x0][0x16c]
+    param_H        : c[0x0][0x170]
+    param_W        : c[0x0][0x174]
+    param_WN       : c[0x0][0x178]
+    param_HWN      : c[0x0][0x17c]
+    param_DHWN     : c[0x0][0x180]
+    param_C        : c[0x0][0x184]
+    param_KRST     : c[0x0][0x188]
+    param_RST      : c[0x0][0x18c]
+    param_RS       : c[0x0][0x190]
+    param_magic_RS : c[0x0][0x194]
+    param_shift_RS : c[0x0][0x198]
+    param_S        : c[0x0][0x19c]
+    param_magic_S  : c[0x0][0x1a0]
+    param_shift_S  : c[0x0][0x1a4]
+    param_pad_d    : c[0x0][0x1a8]
+    param_pad_h    : c[0x0][0x1ac]
+    param_pad_w    : c[0x0][0x1b0]
+    param_str_d    : c[0x0][0x1b4]
+    param_str_h    : c[0x0][0x1b8]
+    param_str_w    : c[0x0][0x1bc]
+    param_Q        : c[0x0][0x1c0]
+    param_PQ       : c[0x0][0x1c4]
+    param_QN       : c[0x0][0x1c8]
+    param_PQN      : c[0x0][0x1cc]
+    param_MPQN     : c[0x0][0x1d0]
+    param_magic_Q  : c[0x0][0x1d4]
+    param_shift_Q  : c[0x0][0x1d8]
+    param_magic_PQ : c[0x0][0x1dc]
+    param_shift_PQ : c[0x0][0x1e0]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-67 : mpq<0-3>
+    64-67 : m, p, q, tidY
+    68-72 : blkF, blkI, blkMPQ, tid1, tidX
+    73-95 ~ pq, mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST
+
+    0-63 : czero<00-63>
+
+   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
+   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
+   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
+   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
+   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
+  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
+  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
+  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7
+
+      64-67 : j0Fy<0-3>
+      68-71 : j0Ix<0-3>
+      72-75 : j0Fy<4-7>
+      76-79 : j0Ix<4-7>
+      80-83 : j1Fy<0-3>
+      84-87 : j1Ix<0-3>
+      88-91 : j1Fy<4-7>
+      92-95 : j1Ix<4-7>
+
+      96-97 : trackI<0-1>
+      98-99 : trackF<0-1>
+
+    100-103 : loadI<0-3>
+    104-107 : loadF<0-3>
+    108-111 : loadI<4-7>
+    112-115 : loadF<4-7>
+
+    117 : readFs
+    116 : readIs
+    118-127 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI
+
+    128 : tmp_shl
+    129 : tmp_data
+
+    130-131 : tmp_param<0-1>
+    132 : p_and
+    133 : tid
+    134-135 : sliceI, sliceF
+    134-135 : sliceIF<0-1>
+    136-139 ~ offsetF, offsetIc, offsetFc
+    140-143 ~ addressF0, addressF1, addressI0, addressI1
+    144-145 : tmp_param0<0-1>
+
+    72-79 : cs<0-7>
+    80-81 : Out<0-1>
+
+    82-125  ~ writeCs, readCs, alpha, tidOX, tidOY, to, k, n, MPQN1, MPQN28, MPQN, MPQN4
+
+</REGISTER_MAPPING>
+
+-:-:-:-:00 S2R tid,    SR_TID.X;
+-:-:-:-:00 S2R blkF,   SR_CTAID.Y; #K64
+-:-:-:-:00 S2R blkI,   SR_CTAID.Z; #N64
+-:-:-:-:00 S2R blkMPQ, SR_CTAID.X; # m,p,q stored in x index
+
+// if tid >= 32
+//    P0 = true
+-:-:-:-:00 ISETP.GE.AND P0, PT, tid, 32, PT;
+
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+<CODE>
+    # Clear all 64 accumulators: 16 LDS.128 loads from the zeroed addr_zero
+    # slot, each targeting every fourth czero register (czero00 .. czero60).
+    my @zeroLoads = map { sprintf "-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", 4 * $_ } 0 .. 15;
+    return join '', @zeroLoads;
+</CODE>
+
+// tid <= 63
+// tidX = (tid & 7) << 2
+// tidX = 0 : 4 : 28
+// tidY = tid >> 3
+// tidY = 0 : 1 : 7
+-:-:-:-:00 LOP.AND tidX, tid,  7;
+-:-:-:-:00 SHL     tidX, tidX, 2;
+-:-:-:-:00 SHR.U32 tidY, tid,  3;
+
+// offsetFk += blkF * 64 + tidX
+// K64
+// blkF ---- trackF
+//      tidX
+-:-:-:-:00 ISCADD  offsetFk, blkF, tidX, 6;
+
+// offsetIn += blkI * 64 + tidX
+// N64
+// blkI ---- trackI
+//      tidX
+-:-:-:-:00 ISCADD  offsetIn, blkI, tidX, 6;
+
+// writeS = (64 * tidY + tidX) * 4
+// tidY = 0 : 1 : 7
+// tidX = 0 : 4 : 28
+// ----------------
+// ---------------- tidY 0 : 1 : 7
+// ---- writeS
+// tidX
+-:-:-:-:00 ISCADD  writeS, tidY, tidX, 6;
+-:-:-:-:00 SHL     writeS, writeS, 2;
+
+// readFs  = (((tid & 0x30) >> 3) | (tid & 1)) << 4;
+// [0][5][4] * 16;
+-:-:-:-:00 LOP.AND tid1,   tid,    1;
+-:-:-:-:00 LOP.AND readFs, tid,    0x30;
+-:-:-:-:00 SHR.U32 readFs, readFs, 3;
+-:-:-:-:00 LOP.OR  readFs, readFs, tid1;
+-:-:-:-:00 SHL     readFs, readFs, 4;
+
+// readIs = ((tid >> 1) & 7) << 4 + 4x<8*64>;
+// [3][2][1] * 16 + 512 * 4;
+-:-:-:-:00 BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+-:-:-:-:00 ISCADD  readIs, readIs, 4x<8*64>, 4;
+-:-:-:-:00 MOV32I tmp_data, 128;
+
+-:-:-:-:00 @P0 BRA.U END_SETUP;
+
+-:-:-:-:00 MOV rst,        tid;
+-:-:-:-:00 MOV lutStore2,  RZ;
+-:-:-:-:00 MOV lutSize,    RZ;
+-:-:-:-:00 MOV32I warp_count, 32;
+
+// m  = blkMPQ / PQ
+// pq = blkMPQ % PQ
+-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ;
+-:-:-:-:00 SHR.U32   m,  m,      param_shift_PQ;
+-:-:-:-:00 IMAD pq,  m,  param_PQ, RZ;
+-:-:-:-:00 IADD pq, -pq, blkMPQ;
+// p = pq / Q
+// q = pq % Q
+-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ;
+-:-:-:-:00 SHR.U32   p, p,  param_shift_Q;
+-:-:-:-:00 IMAD  q,  p, param_Q, RZ;
+-:-:-:-:00 IADD  q, -q, pq;
+
+// dep_thd_mask = -1
+-:-:-:-:00 MOV32I dep_thd_mask, -1;
+
+// if p is odd
+// set q = param_Q - q - 1
+// if p is even
+// q = q
+-:-:-:-:00 LOP.AND p_and, p, 1;
+-:-:-:-:00 ISETP.NE.AND P1, PT, p_and, RZ, PT;
+-:-:-:-:00 @P1 IADD q, -q, param_Q;
+-:-:-:-:00 @P1 IADD q, q, dep_thd_mask;
+
+-:-:-:-:00 STS.128 [RZ + addr_m], m;
+
+// mt = m * w - pad_d
+// pr = p * u - pad_h
+// qs = q * v - pad_w
+-:-:-:-:00 IMAD qs, q,   param_str_w, RZ;
+-:-:-:-:00 IMAD pr, p,   param_str_h, RZ;
+-:-:-:-:00 IMAD mt, m,   param_str_d, RZ;
+-:-:-:-:00 IADD qs, qs, -param_pad_w;
+-:-:-:-:00 IADD pr, pr, -param_pad_h;
+-:-:-:-:00 IADD mt, mt, -param_pad_d;
+
+// mask_shr = 32 - tid
+// dep_thd_mask = dep_thd_mask >> mask_shr
+-:-:-:-:00 IADD    mask_shr, -tid, 32;
+-:-:-:-:00 SHR.U32 dep_thd_mask, dep_thd_mask, mask_shr;
+
+LUT_LOOP:
+
+// warp synchronous loop while warp_count < RST
+-:-:-:-:00 ISETP.LT.AND P0, PT, warp_count, param_RST, PT;
+-:-:-:-:00 IADD warp_count, warp_count, 32;
+// t =  rst / RS
+// rs = rst % RS
+-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, RZ;
+-:-:-:-:00 SHR.U32   t, t, param_shift_RS;
+-:-:-:-:00 IMAD  rs, t, param_RS, RZ;
+-:-:-:-:00 IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ;
+-:-:-:-:00 SHR.U32   r, r, param_shift_S;
+-:-:-:-:00 IMAD   s, r, param_S, RZ;
+-:-:-:-:00 IADD   s, -s, rs;
+// x = qs + s
+// y = pr + r
+// z = mt + t
+-:-:-:-:00 IADD x, qs, s;
+-:-:-:-:00 IADD y, pr, r;
+-:-:-:-:00 IADD z, mt, t;
+-:-:-:-:00 ISETP.GE.AND  P4, PT, x, RZ, PT;
+-:-:-:-:00 ISETP.GE.AND  P5, PT, y, RZ, PT;
+-:-:-:-:00 ISETP.GE.AND  P6, PT, z, RZ, PT;
+-:-:-:-:00 ISETP.LT.AND  P4, PT, x, param_W, P4;
+-:-:-:-:00 ISETP.LT.AND  P5, PT, y, param_H, P5;
+-:-:-:-:00 ISETP.LT.AND  P6, PT, z, param_D, P6;
+-:-:-:-:00 PSETP.AND.AND P1, PT, P4, P5, P6;
+
+// sliceI = z*HWN + y*WN + x*N
+// rst                    N
+// --------------         --------------
+// --------------         --------------
+// --------------         --------------
+// -------------- K * rst --------------
+// --------------         --------------
+// --------------         --------------
+-:-:-:-:00 IMAD sliceI, x, param_N,   RZ;
+-:-:-:-:00 IMAD.U32.U32 sliceI, y, param_WN, sliceI;
+-:-:-:-:00 IMAD.U32.U32 sliceI, z, param_HWN, sliceI;
+
+// sliceF = rst * K
+-:-:-:-:00 IMAD sliceF, rst, param_K, RZ;
+
+// Get a mask of all valid slices in the warp
+-:-:-:-:00 VOTE.ANY ballot, PT, P1;
+// Count the total valid slices
+-:-:-:-:00 POPC warp_slices, ballot, ballot;
+// Prepare lutStore for this and next loop
+// lutStore = lutStore2
+// lutStore2 = warp_slices * 8 + lutStore2
+-:-:-:-:00 @P1 MOV lutStore, lutStore2;
+-:-:-:-:00 ISCADD lutStore2, warp_slices, lutStore2, 3;
+// Count all the valid slices below this threadid
+// bit(dep_thd_mask) = tid
+// bit(ballot) = valid tid
+// dep_thd_cnt = number of bit below ballot
+-:-:-:-:00 @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
+-:-:-:-:00 @P1 POPC dep_thd_cnt, dep_thd_bits, dep_thd_bits;
+// use the rst increment to space the barrier sync
+// rst = rst + 32
+-:-:-:-:00 IADD rst, rst, 32;
+// Update the lutStore address from this count
+// lutStore = dep_thd_cnt * 8 + lutStore
+-:-:-:-:00 @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
+// Store both slice offsets in the lut
+-:-:-:-:00 @P1 STS.64 [lutStore + addr_lut], sliceIF;
+
+// Keep track of the total size of the lut
+// lutSize = lutSize + warp_slices
+-:-:-:-:00 IADD lutSize, lutSize, warp_slices;
+
+-:-:-:-:00 @P0 BRA.U LUT_LOOP;
+
+// Share the lut size with the other warp
+-:-:-:-:00 STS [RZ + addr_szLut], lutSize;
+
+// if tid >= 32, directly enter it
+END_SETUP:
+
+-:-:-:-:00 BAR.SYNC 0;
+
+// Grab the calculated lut size and get its reciprocal
+// Get the total reduction depth
+-:-:-:-:00 LDS lutSize, [RZ + addr_szLut];
+
+// endCRST = lutSize * param_C (channel)
+-:-:-:-:00 IMAD endCRST, lutSize, param_C, RZ;
+// lutSizeRcp = 1 / lutSize
+-:-:-:-:00 I2F.F32.S32 lutSizeRcp, lutSize;
+-:-:-:-:00 MUFU.RCP lutSizeRcp, lutSizeRcp;
+
+// posCRST = endCRST - tidY - 1
+-:-:-:-:00 IADD posCRST, endCRST, -1;
+-:-:-:-:00 IADD posCRST, posCRST, -tidY;
+
+// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 8 then make a full 8 line fetch.
+-:-:-:-:00 LOP.AND partial, endCRST, 7;
+-:-:-:-:00 ISETP.EQ.AND P1, PT, RZ, partial, PT;
+// If partial == 0
+-:-:-:-:00 @P1 MOV32I partial, 8;
+// channel = lower(posCRST / lutSize)
+// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it
+-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST;
+-:-:-:-:00 FMUL channel, posCRSTf, lutSizeRcp;
+-:-:-:-:00 FFMA channel, channel, 5.9604644775390625e-08, channel;
+-:-:-:-:00 F2I.S32.F32.TRUNC channel, channel;
+
+// lutOffset = (posCRST % lutSize) * 8
+-:-:-:-:00 IMAD tmp_param0, channel, lutSize, RZ;
+-:-:-:-:00 IADD lutOffset, -tmp_param0, posCRST;
+-:-:-:-:00 SHL lutOffset, lutOffset, 3;
+// P1 = tidY < partial
+-:-:-:-:00 ISETP.LT.AND P1, PT, tidY, partial, PT;
+
+// offsetIC = channel * DHWN
+// offsetFC = channel * KRST
+-:-:-:-:00 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ;
+-:-:-:-:00 IMAD offsetFc, channel, param_KRST, RZ;
+// posCRST -= partial
+-:-:-:-:00 IADD posCRST, posCRST, -partial;
+-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut];
+
+// trackF = offsetFK + offsetFC + sliceF + param_F
+-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc;
+-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF;
+// trackI = offsetIN + offsetIC + sliceI + param_I
+-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc;
+-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI;
+
+//-:-:-:-:00 @P1 LEA      trackF0.CC, offsetF, param_F[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;
+-:-:-:-:00 MOV addressF0, param_F[0];
+-:-:-:-:00 MOV addressF1, param_F[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2;
+-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, addressF0;
+-:-:-:-:00 @P1 IADD.X trackF1, RZ, addressF1;
+//-:-:-:-:00 @P1 LEA      trackI0.CC, offsetI, param_I[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;
+-:-:-:-:00 MOV addressI0, param_I[0];
+-:-:-:-:00 MOV addressI1, param_I[1];
+-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2;
+-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, addressI0;
+-:-:-:-:00 @P1 IADD.X trackI1, RZ, addressI1;
+
+-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF + 4x<0>];
+-:-:-:-:00 @P1 LD.E.128 loadF4, [trackF + 4x<32>];
+-:-:-:-:00 @!P1 LDS.128 loadF0, [RZ + addr_zero];
+-:-:-:-:00 @!P1 LDS.128 loadF4, [RZ + addr_zero];
+
+-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI + 4x<0>];
+-:-:-:-:00 @P1 LD.E.128 loadI4, [trackI + 4x<32>];
+-:-:-:-:00 @!P1 LDS.128 loadI0, [RZ + addr_zero];
+-:-:-:-:00 @!P1 LDS.128 loadI4, [RZ + addr_zero];
+
+-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST, RZ, PT;
+
+-:-:-:-:00 STS.128 [writeS + 4x<0*64 +  0>], loadF0;
+-:-:-:-:00 STS.128 [writeS + 4x<0*64 + 32>], loadF4;
+
+-:-:-:-:00 STS.128 [writeS + 4x<8*64 +  0>], loadI0;
+-:-:-:-:00 STS.128 [writeS + 4x<8*64 + 32>], loadI4;
+
+-:-:-:-:00 I2F.F32.S32 posCRSTf, posCRST;
+
+-:-:-:-:00 BAR.SYNC 0;
+-:-:-:-:00 LOP.XOR writeS, writeS, 4x<64*8*2>;
+
+-:-:-:-:00 LDS.128 j0Ix0, [readIs + 4x<0*64 + 00>];
+-:-:-:-:00 LDS.128 j0Fy0, [readFs + 4x<0*64 + 00>];
+
+-:-:-:-:00 LDS.128 j0Ix4, [readIs + 4x<0*64 + 32>];
+-:-:-:-:00 LDS.128 j0Fy4, [readFs + 4x<0*64 + 32>];
+
+// channel = posCRST / lutSize
+-:-:-:-:00 @P1 FMUL channel, posCRSTf, lutSizeRcp;
+-:-:-:-:00 @P1 FFMA channel, channel, 5.9604644775390625e-08, channel;
+-:-:-:-:00 @P1 F2I.S32.F32.TRUNC channel, channel;
+// lutOffset = (posCRST % lutSize) * 8
+-:-:-:-:00 @P1 IMAD tmp_param0, channel, lutSize, RZ;
+-:-:-:-:00 @P1 IADD lutOffset, -tmp_param0, posCRST;
+-:-:-:-:00 @P1 SHL lutOffset, lutOffset, 3;
+// offsetIC = channel * DHWN
+// offsetFC = channel * KRST
+-:-:-:-:00 @P1 IMAD.U32.U32 offsetIc, channel, param_DHWN, RZ;
+-:-:-:-:00 @P1 IMAD      offsetFc, channel, param_KRST, RZ;
+
+-:-:-:-:00 IADD posCRST, posCRST, -8;
+-:-:-:-:00 @P1 LDS.64 sliceIF, [lutOffset + addr_lut];
+
+// trackI = offsetIN + offsetIC + sliceI + param_I
+// trackF = offsetFK + offsetFC + sliceF + param_F
+-:-:-:-:00 @P1 IADD offsetF, offsetFk, offsetFc;
+-:-:-:-:00 @P1 IADD offsetF, offsetF, sliceF;
+-:-:-:-:00 @P1 IADD offsetI, offsetIn, offsetIc;
+-:-:-:-:00 @P1 IADD offsetI, offsetI, sliceI;
+
+//-:-:-:-:00 @P1 LEA      trackF0.CC, offsetF, param_F[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;
+-:-:-:-:00 @P1 SHL tmp_shl, offsetF, 0x2;
+-:-:-:-:00 @P1 IADD trackF0.CC, tmp_shl, addressF0;
+-:-:-:-:00 @P1 IADD.X trackF1, RZ, addressF1;
+
+//-:-:-:-:00 @P1 LEA      trackI0.CC, offsetI, param_I[0],     2;
+//-:-:-:-:00 @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;
+-:-:-:-:00 @P1 SHL tmp_shl, offsetI, 0x2;
+-:-:-:-:00 @P1 IADD trackI0.CC, tmp_shl, addressI0;
+-:-:-:-:00 @P1 IADD.X trackI1, RZ, addressI1;
+
+-:-:-:-:00 @P1 LD.E.128 loadF0, [trackF + 4x< 0>];
+-:-:-:-:00 @P1 LD.E.128 loadF4, [trackF + 4x<32>];
+-:-:-:-:00 @P1 LD.E.128 loadI0, [trackI + 4x< 0>];
+-:-:-:-:00 @P1 LD.E.128 loadI4, [trackI + 4x<32>];
+
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+
+LOOP:
+
+<CODE>
+    # Generates the body of LOOP: eight steps ($j), each made of 64 FFMAs.
+    # After some FFMAs one or more extra instructions are emitted; the extra
+    # text for FFMA number $c of step $j is kept in $schedule[$j]{$c}.
+    my @schedule =
+    (
+        # step 0
+        {
+            47 => "-:-:-:-:00 ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+            53 => "-:-:-:-:00 \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+            61 => "-:-:-:-:00 \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+            62 => "-:-:-:-:00 ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+        },
+        # step 1
+        {
+            47 => "-:-:-:-:00 \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+            53 => "-:-:-:-:00 \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+            61 => "-:-:-:-:00 \@P1 IMAD lutOffset, -channel, lutSize, posCRST;\n",
+            62 => "-:-:-:-:00 \@P1 IMAD offsetF, channel, param_KRST, offsetFk;\n",
+            63 => join('',
+                "-:-:-:-:00 NOP;\n",
+                "-:-:-:-:00 NOP;\n",
+                "-:-:-:-:00 \@P1 SHL lutOffset, lutOffset, 3;\n",
+                "-:-:-:-:00 NOP;\n",
+                "-:-:-:-:00 NOP;\n",
+                "-:-:-:-:00 NOP;\n",
+                "-:-:-:-:00 IADD posCRST, posCRST, -8;\n",
+                "-:G:D:-:00 \@P1 LDS.64 sliceIF, [lutOffset + addr_lut];\n"),
+        },
+        # step 2
+        {
+            47 => "-:-:-:-:00 \@P1 IMAD.U32.U32 offsetI, channel, param_DHWN, offsetIn;\n",
+            53 => "-:-:-:-:00 TEXDEPBAR 0x0;\n",
+            61 => "-:-:-:-:00 \@P1 IADD offsetF, offsetF, sliceF;\n",
+            62 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<8*64 +  0>], loadI0;\n",
+            63 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<8*64 + 32>], loadI4;\n",
+        },
+        # step 3
+        {
+            47 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetF, 0x2;\n",
+            53 => "-:-:-:-:00 \@P1 IADD offsetI, offsetI, sliceI;\n",
+            61 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<0*64 +  0>], loadF0;\n",
+            62 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<0*64 + 32>], loadF4;\n",
+            63 => "-:-:-:-:00 \@P1 IADD trackF0.CC, tmp_shl, addressF0;\n",
+        },
+        # step 4
+        {
+            47 => "-:-:-:-:00 \@P1 IADD.X trackF1, RZ, addressF1;\n",
+            53 => "-:-:-:-:00 \@P1 IADD tmp_param0.CC, tmp_data, trackF0;\n",
+            61 => "-:-:-:-:00 \@P1 IADD.X tmp_param1, RZ, trackF1;\n",
+        },
+        # step 5
+        {
+            47 => "-:-:-:-:00 \@P1 SHL tmp_shl, offsetI, 0x2;\n",
+            61 => "-:-:-:-:00 \@P1 IADD trackI0.CC, tmp_shl, addressI0;\n",
+            62 => "-:G:D:-:00 \@P1 LDG.E.128 loadF0, [trackF + 4x< 0>];\n",
+            63 => "-:G:D:-:00 \@P1 LDG.E.128 loadF4, [tmp_param];\n",
+        },
+        # step 6
+        {
+            47 => "-:-:-:-:00 \@P1 IADD.X trackI1, RZ, addressI1;\n",
+            53 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n",
+            61 => "-:-:-:-:00 \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n",
+            62 => "-:-:-:-:00 \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n",
+            63 => "T:-:D:S:00 \@P0 BAR.SYNC 0;\n",
+        },
+        # step 7
+        {
+            47 => "-:-:-:-:00 \@P1 IADD tmp_param0.CC, tmp_data, trackI0;\n",
+            53 => "-:-:-:-:00 \@P1 IADD.X tmp_param1, RZ, trackI1;\n",
+            61 => "-:G:D:-:00 \@P1 LDG.E.128 loadI0, [trackI];\n",
+            62 => "-:G:D:-:00 \@P1 LDG.E.128 loadI4, [tmp_param];\n",
+            63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n",
+        },
+    );
+
+    # Issue order of the 64 FFMAs of one step: [x, y] selects cx<x>y<y>.
+    my @cOrder =
+    (
+        [0,0], [0,1], [1,1], [2,0], [1,0], [2,1],
+        [2,3], [2,2], [1,2], [0,3], [1,3], [0,2],
+        [0,4], [0,5], [1,5], [2,4], [1,4], [2,5],
+        [2,7], [2,6], [1,6], [0,7], [1,7], [0,6],
+        [3,6], [3,7], [4,7], [5,6], [4,6], [5,7],
+        [5,5], [5,4], [4,4], [3,5], [4,5], [3,4],
+        [3,2], [3,3], [4,3], [5,2], [4,2], [5,3],
+        [5,1], [5,0], [4,0], [3,1], [4,1], [3,0],
+        [6,0], [7,0], [7,1], [6,2], [6,1], [7,2],
+        [7,5], [6,5], [6,4], [7,3], [7,4], [6,3],
+        [6,6], [6,7], [7,7], [7,6],
+    );
+
+    # Shared-memory reads emitted in every step:
+    # [FFMA slot, destination register suffix, base address register, column].
+    my @ldsSlots =
+    (
+        [  5, 'Ix0', 'readIs', '00' ],
+        [ 11, 'Ix2', 'readIs', '2'  ],
+        [ 17, 'Ix4', 'readIs', '32' ],
+        [ 59, 'Ix6', 'readIs', '34' ],
+        [ 23, 'Fy0', 'readFs', '00' ],
+        [ 29, 'Fy2', 'readFs', '2'  ],
+        [ 35, 'Fy4', 'readFs', '32' ],
+        [ 41, 'Fy6', 'readFs', '34' ],
+    );
+
+    my @lines;
+    for my $j (0 .. 7)
+    {
+        # FFMAs of this step read register set j<$cur>; the LDS.64s fill the
+        # other set (j<$next>) from row ($j + 1) % 8 and are predicated on P0
+        # only in the last step.
+        my $cur  = $j & 1;
+        my $next = $cur ? 0 : 1;
+        my $row  = ($j + 1) % 8;
+        my $pred = $j == 7 ? '@P0' : '   ';
+
+        for my $lds (@ldsSlots)
+        {
+            my ($slot, $reg, $base, $col) = @$lds;
+            $schedule[$j]{$slot} = sprintf "-:G:D:-:01 %s LDS.64 j%d%s, [%s + 4x<%d*64 + %s>];\n",
+                $pred, $next, $reg, $base, $row, $col;
+        }
+
+        for my $c (0 .. 63)
+        {
+            my ($x, $y) = @{ $cOrder[$c] };
+            my $phase   = $c % 6;
+            my $extra   = $schedule[$j]{$c} || '';
+
+            # Slots that would otherwise stay empty get a filler NOP: every
+            # sixth slot (5, 11, ...) and slot 63 first, then slots 61 and 62.
+            unless ($extra)
+            {
+                if ($phase == 5 || $c == 63)
+                {
+                    $extra = "-:G:D:-:00 NOP;\n";
+                }
+                elsif ($c > 60)
+                {
+                    $extra = "-:-:D:-:07 NOP;\n";
+                }
+            }
+
+            # 04 and 05 are dual issued
+            my $ctrl = ($extra || $phase == 1 || $phase == 3) ? "-:-:D:-:04" : "-:-:D:-:05";
+
+            push @lines, "$ctrl FFMA cx${x}y${y}, j${cur}Ix${x}, j${cur}Fy${y}, cx${x}y${y};\n$extra";
+        }
+    }
+    return join '', @lines;
+
+</CODE>
+
+-:-:-:-:00 LDS.128 mpq, [RZ + addr_m];
+-:-:-:-:00 S2R tid,  SR_TID.X; // 0-64
+-:-:-:-:00 S2R blkI, SR_CTAID.Z; // N id
+-:-:-:-:00 S2R blkF, SR_CTAID.Y; // K id
+
+// tidOX = (tid & 7) << 2
+// tidOY = tid >> 3
+-:-:-:-:00 LOP.AND tidOX, tid,   7;
+-:-:-:-:00 SHL     tidOX, tidOX, 2;
+-:-:-:-:00 SHR.U32 tidOY, tid,   3;
+
+-:-:-:-:00 LOP.AND readIs, readIs, 0x7ff;
+-:-:-:-:00 LOP.AND readFs, readFs, 0x7ff;
+
+// Div by 4 here collapses k stride
+// writeCs = readFs * 16 + readIs;
+-:-:-:-:00 ISCADD  writeCs, readFs, readIs, 4;
+
+// readCs  = 4 * (tidOX + (tidOY * 64))
+-:-:-:-:00 ISCADD readCs, tidOY, tidOX, 6;
+-:-:-:-:00 SHL    readCs, readCs, 2;
+
+// n = blkI*64 + tidOX;
+-:-:-:-:00 ISCADD n, blkI, tidOX, 6;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*64 + tidOY * 4
+-:-:-:-:00 SHL    tidOY,   tidOY, 2;
+-:-:-:-:00 ISCADD k, blkF, tidOY, 6;
+
+// o = k*MPQN + m*PQN + p*QN + q*N + n
+-:-:-:-:00 IMAD      to, q, param_N,    n;
+-:-:-:-:00 IMAD.U32.U32 to, p, param_QN,   to;
+-:-:-:-:00 IMAD.U32.U32 to, m, param_PQN,  to;
+-:-:-:-:00 IMAD.U32.U32 to, k, param_MPQN, to;
+-:-:-:-:00 MOV tmp_param0, param_O[0];
+-:-:-:-:00 MOV tmp_param1, param_O[1];
+-:-:-:-:00 SHL tmp_shl, to, 0x2;
+-:-:-:-:00 IADD Out0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X Out1, RZ, tmp_param1;
+
+-:-:-:-:00 MOV  MPQN,  param_MPQN;
+-:-:-:-:00 SHL  MPQN1, MPQN, 2;
+-:-:-:-:00 SHL  MPQN4, MPQN, 4;
+-:-:-:-:00 ISCADD MPQN28, MPQN, -MPQN4, 7;
+
+-:-:-:-:00 ISETP.LT.AND P0, PT, n, param_N, PT; // n +  0 < N
+-:-:-:-:00 IADD n, n, 32;
+-:-:-:-:00 ISETP.LT.AND P1, PT, n, param_N, PT; // n + 32 < N
+
+-:-:-:-:00 MOV alpha, param_alpha;
+
+-:-:-:-:00 BAR.SYNC 0;
+
+<CODE>
+
+    # For each of the eight y rows: scale cx0y<y>..cx7y<y> by alpha into
+    # cs0..cs7, then call STORE_C. Before row 4 the output pointer is advanced
+    # by MPQN28 and k by 28; the carry into Out1 is added after the FMULs.
+    my @asm;
+    for my $row (0 .. 7)
+    {
+        my $scale = join '', map { "-:-:-:-:00 FMUL cs$_, cx${_}y$row, alpha;\n" } 0 .. 7;
+
+        if ($row == 4)
+        {
+            push @asm,
+                "-:-:-:-:00 IADD Out0.CC, Out0, MPQN28;\n",
+                "-:-:-:-:00 IADD k, k, 28;\n",
+                $scale,
+                "-:-:-:-:00 IADD.X Out1, Out1, RZ;\n\n";
+        }
+        else
+        {
+            push @asm, $scale, "\n";
+        }
+
+        push @asm, "-:-:-:-:00 CAL STORE_C;\n\n";
+    }
+    return join '', @asm;
+
+</CODE>
+
+-:-:-:-:00 EXIT;
+
+// STORE_C: subroutine reached through "CAL STORE_C" from the generated
+// epilogue, once per y row. On entry cs0-cs7 hold one row of alpha-scaled
+// results, k is the output row index, and P0/P1 hold the n + 0 < N and
+// n + 32 < N bounds checks. It advances k by 1 and Out by MPQN1, then returns.
+STORE_C:
+
+// P2/P3 combine the k < K check with the two n bounds checks.
+-:-:-:-:00 ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n +  0 < N
+-:-:-:-:00 ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n + 32 < N
+-:-:-:-:00 IADD k, k, 1;
+
+// Exchange the values through shared memory: written at writeCs, read back at
+// readCs, which drops the awkward readAs/readBs mapping.
+-:-:-:-:00 STS.128 [writeCs + 4x<00>], cs0;
+-:-:-:-:00 STS.128 [writeCs + 4x<32>], cs4;
+-:-:-:-:00 LDS.128 cs0, [readCs + 4x<00>];
+-:-:-:-:00 LDS.128 cs4, [readCs + 4x<32>];
+
+// Store results back to global (each 128-bit store is skipped when its predicate is false)
+-:-:-:-:00 @P2 ST.E.128 [Out + 4x<00>], cs0;
+-:-:-:-:00 @P3 ST.E.128 [Out + 4x<32>], cs4;
+
+// 64-bit pointer increment: the low word sets the carry, the high word consumes it.
+-:-:-:-:00 IADD   Out0.CC, Out0, MPQN1;
+-:-:-:-:00 IADD.X Out1,    Out1, RZ;
+
+-:-:-:-:00 RET;
+
diff --git a/Kernel/Convolution/Kepler/sconv_fprop_K64_N64_template.cubin b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64_template.cubin
new file mode 100644
index 0000000..999e19e
Binary files /dev/null and b/Kernel/Convolution/Kepler/sconv_fprop_K64_N64_template.cubin differ
diff --git a/Kernel/Convolution/Kepler/sconv_update.cu b/Kernel/Convolution/Kepler/sconv_update.cu
new file mode 100644
index 0000000..ab88d60
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_update.cu
@@ -0,0 +1,149 @@
+#include "sconv.h"
+
+// Host-side launcher for the sconv_update_C128_K128 kernel.
+//
+// Derives the dimension products and the magic-number division constants the
+// kernel takes as arguments, launches it through the driver API with 256
+// threads per block, waits for completion and prints the first 256 floats of
+// the debug buffer that is passed as the kernel's first argument.
+//
+// I, F, O     device pointers forwarded to the kernel (O is its third pointer
+//             argument, named param_E on the kernel side).
+// N .. Q      problem dimensions; str_* and pad_* are strides and padding.
+//
+// Returns true when the launch and the debug read-back succeeded, false on
+// any allocation, launch, synchronisation or copy failure. The debug buffer
+// is released on every path.
+bool update(const float *I, float *F, const float *O,
+  unsigned int N, unsigned int C, unsigned int K,
+  unsigned int D, unsigned int H, unsigned int W,
+  unsigned int R, unsigned int S, unsigned int T,
+  unsigned int M, unsigned int P, unsigned int Q,
+  unsigned int str_d, unsigned int str_h, unsigned int str_w,
+  unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) {
+  const size_t test_floats = 1024;  // floats in the device debug buffer
+  const size_t dump_floats = 256;   // floats copied back and printed
+  float alpha = 1.0f;
+  unsigned int DHW, WN, HW, HWN, DHWN, CRST, RST, RS;
+  unsigned int PQ, QN, PQN, MPQN;
+  unsigned int magic_HW, magic_W;
+  unsigned int shift_HW, shift_W;
+  unsigned int magic_RST, magic_RS, magic_S;
+  unsigned int shift_RST, shift_RS, shift_S;
+  unsigned int magic_PQu, shift_PQu;
+  unsigned int magic_Qu, shift_Qu;
+  unsigned int grid_P = 1;
+  unsigned int grid_Q = 1;
+  unsigned int grid_PQ = grid_P * grid_Q;
+  unsigned int grid_PQM = grid_PQ * M;
+  // input
+  WN = W * N;
+  HW = H * W;
+  HWN = H * WN;
+  DHW = D * HW;
+  DHWN = D * HWN;
+  // filter
+  RS = R * S;
+  RST = T * RS;
+  // CRST covers all three filter dimensions; C * RS was only right for T == 1.
+  CRST = C * RST;
+  // output
+  // PQ is handed to the kernel below, so it has to be computed first.
+  PQ = P * Q;
+  QN = Q * N;
+  PQN = P * QN;
+  MPQN = M * PQN;
+  // magic numbers
+  magic32(CRST, RST, magic_RST, shift_RST);
+  magic32(RST + 32, RS, magic_RS, shift_RS);
+  magic32(RS + 32, S, magic_S, shift_S);
+  magic32(DHW, HW, magic_HW, shift_HW);
+  magic32(HW, W, magic_W, shift_W);
+  magic32(grid_PQM, grid_PQ, magic_PQu, shift_PQu);
+  magic32(grid_PQ, grid_Q, magic_Qu, shift_Qu);
+  std::cout << "CRST: " << CRST << std::endl;
+  // test param set up
+  float *test_param = NULL;
+  cudaError_t cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * test_floats);
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " malloc error: " << cuda_error << std::endl;
+    return false;
+  }
+  cuda_error = cudaMemset(test_param, 0, sizeof(float) * test_floats);
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " memset error: " << cuda_error << std::endl;
+    cudaFree(test_param);
+    return false;
+  }
+  // One entry per kernel parameter, in the order of the kernel's signature.
+  void *args[43] = {&test_param, &F, &I, &O, &alpha,
+    &N, &K, &D, &H, &W, &WN, &HWN, &DHWN,
+    &C, &CRST,
+    &RST, &magic_RST, &shift_RST,
+    &RS, &magic_RS, &shift_RS,
+    &S, &magic_S, &shift_S,
+    &pad_d, &pad_h, &pad_w,
+    &str_d, &str_h, &str_w,
+    &P, &Q, &PQ, &QN, &PQN, &MPQN,
+    &magic_Qu, &shift_Qu,
+    &magic_PQu, &shift_PQu,
+    &grid_P, &grid_Q, &grid_PQ};
+  // Grid: one block per m, and one per 128-wide tile of CRST and of K (rounded up).
+  int gridX = grid_PQM;
+  int gridY = CRST / 128 + (CRST % 128 != 0);
+  int gridZ = K / 128 + (K % 128 != 0);
+  std::string kernel_name = "sconv_update_C128_K128";
+  bool ok = false;
+  float *h_test = NULL;
+  CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], gridX, gridY, gridZ, 256, 1, 1,
+    0, 0, args, NULL);
+  if (res != CUDA_SUCCESS) {
+    std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl;
+  } else if ((res = cuCtxSynchronize()) != CUDA_SUCCESS) {
+    // Failures raised while the kernel runs are only reported here.
+    std::cerr << "Line " << __LINE__ << " error synchronizing after kernel " << kernel_name << " " << res << std::endl;
+  } else if ((h_test = (float *)malloc(sizeof(float) * dump_floats)) == NULL) {
+    std::cerr << "Line " << __LINE__ << " host malloc error" << std::endl;
+  } else if ((cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * dump_floats,
+                                      cudaMemcpyDeviceToHost)) != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl;
+  } else {
+    for (size_t i = 0; i < dump_floats; ++i) {
+      std::cout << h_test[i] << " ";
+    }
+    std::cout << std::endl;
+    ok = true;
+  }
+  // free test_param (free(NULL) is a no-op, so every path can share this)
+  free(h_test);
+  cudaFree(test_param);
+  return ok;
+}
+
+int main() {
+  cudaFree(0);
+  float *d_I, *d_F, *d_O;
+  unsigned int N = 128, C = 3, K = 128, D = 1, H = 224, W = 224, T = 1, R = 11, S = 11;
+  unsigned int str_d = 1, str_h = 4, str_w = 4;
+  unsigned int pad_d = 0, pad_h = 3, pad_w = 3;
+  unsigned int M, P, Q;
+  cudaError_t cuda_error;
+  M = (D - T + 2 * pad_d) / str_d + 1;
+  P = (H - R + 2 * pad_h) / str_h + 1;
+  Q = (W - S + 2 * pad_w) / str_w + 1;
+  float *h_O = (float *)malloc(K * M * P * Q * N * sizeof(float));
+  for (int i = 0; i < K * M * P * Q * N; ++i) {
+    h_O[i] = 1;
+  }
+  float *h_I = (float *)malloc(C * D * H * W * N * sizeof(float));
+  for (int i = 0; i < C * D * H * W * N; ++i) {
+    h_I[i] = 1;
+  }
+  float* h_F = (float *)malloc(sizeof(float) * C * R * S * T * K);
+  // device memory
+  cudaMalloc((void**)&d_I, sizeof(float) * C * D * H * W * N);
+  cudaMalloc((void**)&d_F, sizeof(float) * C * R * S * T * K);
+  cudaMalloc((void**)&d_O, sizeof(float) * K * M * P * Q * N);
+  // memcpy h_I, h_O
+  cudaMemcpy(d_I, h_I, sizeof(float) * C * D * H * W * N,
+    cudaMemcpyHostToDevice);
+  cudaMemcpy(d_O, h_O, sizeof(float) * K * M * P * Q * N,
+    cudaMemcpyHostToDevice);
+  // load kernels 
+  if (!load_kernels("./")) {
+    std::cerr << "Couldn't load all kernels" << std::endl;
+    exit(1);
+  }
+  // launch kernel
+  if (!update(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) {
+    std::cerr << "Launch error" << std::endl;
+    exit(1);
+  }
+  // output
+  std::cout << "result" << std::endl;
+  cuda_error = cudaMemcpy(h_F, d_F, sizeof(float) * C * R * S * T * K, cudaMemcpyDeviceToHost);
+  if (cuda_error != cudaSuccess) {
+    std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl;
+    exit(1);
+  }
+  for (int i = 0; i < 128; ++i) {
+    std::cout << h_F[i] << " ";
+  }
+  std::cout << std::endl;
+  // free memory
+  free(h_O);
+  free(h_I);
+  free(h_F);
+  cudaFree(d_I);
+  cudaFree(d_F);
+  cudaFree(d_O);
+  // run successfully
+  std::cout << "finish" << std::endl;
+  return 0;
+}
diff --git a/Kernel/Convolution/Kepler/sconv_update_C128_K128.cu b/Kernel/Convolution/Kepler/sconv_update_C128_K128.cu
new file mode 100644
index 0000000..c8f3e35
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_update_C128_K128.cu
@@ -0,0 +1,54 @@
+// CUDA-C definition of sconv_update_C128_K128 with its 43-parameter signature.
+// The body only touches shared memory and writes one float through param_F
+// and param_test; it performs no convolution.
+// NOTE(review): presumably this is a placeholder compiled into a template
+// cubin whose code is then replaced by sconv_update_C128_K128.sass - confirm
+// against the build scripts before changing the body or the signature.
+extern "C"
+__global__ void sconv_update_C128_K128 (
+    float* param_test,
+    float* param_F,
+    const float* param_I,
+    const float* param_E,
+    float param_alpha,
+    int param_N,
+    int param_K,
+    int param_D,
+    int param_H,
+    int param_W,
+    int param_WN,
+    int param_HWN,
+    int param_DHWN,
+    int param_C,
+    int param_CRST,
+    int param_RST,
+    int param_magic_RST,
+    int param_shift_RST,
+    int param_RS,
+    int param_magic_RS,
+    int param_shift_RS,
+    int param_S,
+    int param_magic_S,
+    int param_shift_S,
+    int param_pad_d,
+    int param_pad_h,
+    int param_pad_w,
+    int param_str_d,
+    int param_str_h,
+    int param_str_w,
+    int param_P,
+    int param_Q,
+    int param_PQ,
+    int param_QN,
+    int param_PQN,
+    int param_MPQN,
+    int param_magic_Q,
+    int param_shift_Q,
+    int param_magic_PQ,
+    int param_shift_PQ,
+    int param_part_P,
+    int param_part_Q,
+    int param_part_PQ) {
+      // Static shared array of (128 * 16 + 32) * 4 + 6 floats.
+      __shared__ float share[(128 * 16 + 32) * 4 + 6];
+
+      int tid = threadIdx.x;
+
+      // Each thread writes its own slot ...
+      share[tid] = 1;
+
+      // ... and reads the mirrored slot 255 - tid. There is no barrier between
+      // the write above and these reads, and every thread stores to the same
+      // two addresses.
+      *param_F = share[255 - tid];
+      *param_test = share[255 - tid];
+    }
diff --git a/Kernel/Convolution/Kepler/sconv_update_C128_K128.sass b/Kernel/Convolution/Kepler/sconv_update_C128_K128.sass
new file mode 100644
index 0000000..38f0810
--- /dev/null
+++ b/Kernel/Convolution/Kepler/sconv_update_C128_K128.sass
@@ -0,0 +1,720 @@
+# Kernel: sconv_update_C128_K128
+// debug:
+// mode1
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//-:-:-:-:00 SHL tmp_shl, tid, 0x2;
+//-:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0;
+//-:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1;
+//-:-:-:-:00 I2F.F32.U32 rst, rst;
+//-:-:-:-:00 ST.E [tmp_param00], rst;
+//-:-:-:-:00 EXIT;
+
+// mode2
+//-:-:-:-:00 MOV tmp_param0, param_test[0];
+//-:-:-:-:00 MOV tmp_param1, param_test[1];
+//
+//-:-:-:-:00 MOV32I k, 0x40000000;
+//-:-:-:-:00 ST.E [tmp_param0], k;
+//-:-:-:-:00 EXIT;
+
+// modify steps:
+// XMAD->IMAD
+// shared memory addresses->RZ
+// LDG->LD
+// LEA->MOV, IADD, SHL
+// XMAD.LO2C->IMAD.U32.U32
+// XMAD.PSL->IMAD.U32.U32
+// VMAD->IMAD, IADD
+// MOV->MOV32I
+// IADD3->IADD, IADD
+// POPC
+// ST.CG->ST
+// control code
+// comments
+// LDS.U->LDS
+// LOP3
+// register<0-7>->register<0-3>, register<4-7>
+// avoid register conflicts
+// PT: 0xffffff
+
+// initial->1200
+// bank conflict->1288
+// alignment+dual issue+reuse->1600
+// half ldg.128->1700
+// all ldg.128->1777
+// control codes->1900
+// scheduling->1937
+// reduce unnecessary instructions->2100
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128 * 16 + 32) * 4 + 0>
+    addr_m    : 4x<(128 * 16 + 32) * 4 + 4>
+    addr_q    : 4x<(128 * 16 + 32) * 4 + 5>
+    szBuf     : (128 * 16 + 32)
+
+    param_test[0]   : c[0x0][0x140]
+    param_test[1]   : c[0x0][0x144]
+    param_F[0]      : c[0x0][0x148]
+    param_F[1]      : c[0x0][0x14c]
+    param_I[0]      : c[0x0][0x150]
+    param_I[1]      : c[0x0][0x154]
+    param_E[0]      : c[0x0][0x158]
+    param_E[1]      : c[0x0][0x15c]
+    param_alpha     : c[0x0][0x160]
+    param_N         : c[0x0][0x164]
+    param_K         : c[0x0][0x168]
+    param_D         : c[0x0][0x16c]
+    param_H         : c[0x0][0x170]
+    param_W         : c[0x0][0x174]
+    param_WN        : c[0x0][0x178]
+    param_HWN       : c[0x0][0x17c]
+    param_DHWN      : c[0x0][0x180]
+    param_C         : c[0x0][0x184]
+    param_CRST      : c[0x0][0x188]
+    param_RST       : c[0x0][0x18c]
+    param_magic_RST : c[0x0][0x190]
+    param_shift_RST : c[0x0][0x194]
+    param_RS        : c[0x0][0x198]
+    param_magic_RS  : c[0x0][0x19c]
+    param_shift_RS  : c[0x0][0x1a0]
+    param_S         : c[0x0][0x1a4]
+    param_magic_S   : c[0x0][0x1a8]
+    param_shift_S   : c[0x0][0x1ac]
+    param_pad_d     : c[0x0][0x1b0]
+    param_pad_h     : c[0x0][0x1b4]
+    param_pad_w     : c[0x0][0x1b8]
+    param_str_d     : c[0x0][0x1bc]
+    param_str_h     : c[0x0][0x1c0]
+    param_str_w     : c[0x0][0x1c4]
+    param_P         : c[0x0][0x1c8]
+    param_Q         : c[0x0][0x1cc]
+    param_PQ        : c[0x0][0x1d0]
+    param_QN        : c[0x0][0x1d4]
+    param_PQN       : c[0x0][0x1d8]
+    param_MPQN      : c[0x0][0x1dc]
+    param_magic_Q   : c[0x0][0x1e0]
+    param_shift_Q   : c[0x0][0x1e4]
+    param_magic_PQ  : c[0x0][0x1e8]
+    param_shift_PQ  : c[0x0][0x1ec]
+    param_grid_P    : c[0x0][0x1f0]
+    param_grid_Q    : c[0x0][0x1f4]
+    param_grid_PQ   : c[0x0][0x1f8]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-67 ~ blkI, blkE
+    68-111 ~ tidX, tidY, tid1, tid7, tid128, shiftX, blkMPQ, pq, m
+
+    64-95 ~ tidYY, mm, mt, pr, y, z, y0, yH, z0, zD, bounds_yz, c, r, t, rs, rst
+    64-95 ~ qs, x, x0, xW, bounds_x, ti, te, Q
+
+    64-67 : j0Ex<0-3>
+    68-71 : j0Iy<0-3>
+    72-75 : j0Ex<4-7>
+    76-79 : j0Iy<4-7>
+    80-83 : j1Ex<0-3>
+    84-87 : j1Iy<0-3>
+    88-91 : j1Ex<4-7>
+    92-95 : j1Iy<4-7>
+
+    96-99   : loadI<0-3>
+    100-103 : loadE<0-3>
+    104-107 : loadI<4-7>
+    108-111 : loadE<4-7>
+
+    112-115 : trackI<0-1>, trackE<0-1>
+
+    116-124 ~ writeS, loopN, e, i, p, q, k, crst, s
+    125-127 : swapBuf, readIs, readEs
+    128-129 : tmp_data, tmp_shl
+    130-131 : tmp_param0, tmp_param1
+    132 : p_and
+    133 : tid
+    134-135 : tmp_param0<0-1>
+
+     68-83  : c<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
+    84-124  ~ writeCs, readCs, K1, K60, crst<00|04|08|12>, alpha, K, K4, tid31, tid96, kk, tf, t128
+
+</REGISTER_MAPPING>
+
+-:-:-:-:00 S2R tid,    SR_TID.X;
+-:-:-:-:00 S2R blkMPQ, SR_CTAID.X; // M
+-:-:-:-:00 S2R blkI,   SR_CTAID.Y; // CRST / 128
+-:-:-:-:00 S2R blkE,   SR_CTAID.Z; // K / 128
+
+// tidX = tid >> 1
+// tidX = 0 : 1 : 127
+// tidY = (tid & 1) << 2
+// tidY = 0, 4
+// shiftX = (tid & 1) << 4
+// shiftX = 0, 16
+-:-:-:-:00 LOP.AND tid1,   tid,  1;
+-:-:-:-:00 SHR.U32 tidX,   tid,  1;
+-:-:-:-:00 SHL     tidY,   tid1, 2;
+-:-:-:-:00 SHL     shiftX, tid1, 4;
+
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+
+<CODE>
+    # Emit sixteen LDS.128 loads, one per register quad czero00, czero04, ...,
+    # czero60, each reading the 128-bit value stored at addr_zero.
+    my $asm = '';
+    for my $quad (0 .. 15)
+    {
+        $asm .= sprintf "-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", 4 * $quad;
+    }
+    return $asm;
+</CODE>
+
+-:-:-:-:00 PSETP.AND.AND P0, PT, PT, PT, PT;
+
+// m  = blkMPQ / PQ
+// pq = blkMPQ % PQ
+// m = 0 : 1 : M - 1;
+// PQ = 1;
+// pq = 0
+-:-:-:-:00 IMAD.U32.U32 m, blkMPQ, param_magic_PQ, RZ;
+-:-:-:-:00 SHR.U32   m, m, param_shift_PQ;
+-:-:-:-:00 IMAD pq,  m, param_grid_PQ, RZ;
+-:-:-:-:00 IADD pq, -pq, blkMPQ;
+// p = pq / Q
+// q = pq % Q
+// p = 0
+// q = 0
+-:-:-:-:00 IMAD.U32.U32 p, pq, param_magic_Q, RZ;
+-:-:-:-:00 SHR.U32   p, p, param_shift_Q;
+-:-:-:-:00 IMAD  q,  p, param_grid_Q, RZ;
+-:-:-:-:00 IADD  q, -q, pq;
+
+// We need to be able to restore m and q at each P iteration
+// Register spill to shared
+-:-:-:-:00 STS [RZ + addr_m], m;
+-:-:-:-:00 STS [RZ + addr_q], q;
+
+// tidX = 0 : 1 : 127
+// tidY = 0, 4
+// shiftX = 0, 16
+// writeS <= (512 + 128 + 16) * 4
+// if tidY > 512, shiftX = 16
+// writeS = (tidY * 128 + tidX + shiftX) * 4 + szBuf * 8
+//      --------------------
+//      --------------------
+// 0, 4 --------------------
+// tidY --------------------
+//      ---- tidX = 0 : 1 : 127
+
+-:-:-:-:00 ISCADD writeS, tidY, tidX, 7;
+-:-:-:-:00 IADD   writeS, writeS, shiftX;
+-:-:-:-:00 ISCADD writeS, writeS, 4x<szBuf * 2>, 2;
+
+// readIs = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+// [6][5][4][0] * 16;
+// readIs = 0 : 4 : 63
+-:-:-:-:00 LOP.AND readIs, tid,    0x70;
+-:-:-:-:00 SHR.U32 readIs, readIs, 3;
+-:-:-:-:00 LOP.OR  readIs, readIs, tid1;
+-:-:-:-:00 SHL     readIs, readIs, 4;
+
+// readEs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + szBuf;
+// [7][3][2][1] * 16 + szBuf * 4;
+// readEs = 0 : 4 : 63
+-:-:-:-:00 LOP.AND tid128, tid,    128;
+-:-:-:-:00 BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+-:-:-:-:00 SHR.U32 readEs, tid128, 4;
+-:-:-:-:00 LOP.OR  readEs, readEs, tid7;
+-:-:-:-:00 ISCADD  readEs, readEs, 4x<szBuf>, 4;
+
+-:-:-:-:00 MOV32I swapBuf, -4x<szBuf * 2>;
+
+// crst = blockI * 128 + tidX
+-:-:-:-:00 ISCADD crst, blkI, tidX, 7;
+
+// k = blockE * 128 + tidX
+-:-:-:-:00 ISCADD k, blkE, tidX, 7;
+
+// loopN = N 
+-:-:-:-:00 MOV loopN, param_N;
+
+NEXT_P:
+
+// tidYY = 0 : 1 : 255
+-:-:-:-:00 S2R tidYY, SR_TID.X;
+-:-:-:-:00 LDS mm, [RZ + addr_m];
+-:-:-:-:00 LDS q, [RZ + addr_q];
+
+// c   = crst / RST
+// rst = crst % RST
+-:-:-:-:00 IMAD.U32.U32 c, crst, param_magic_RST, RZ;
+-:-:-:-:00 SHR.U32   c, c, param_shift_RST;
+-:-:-:-:00 IMAD rst, c, param_RST, RZ;
+-:-:-:-:00 IADD rst, -rst, crst;
+
+// t  = rst / RS
+// rs = rst % RS
+-:-:-:-:00 IMAD.U32.U32 t, rst, param_magic_RS, RZ;
+-:-:-:-:00 SHR.U32   t, t, param_shift_RS;
+-:-:-:-:00 IMAD  rs, t, param_RS, RZ;
+-:-:-:-:00 IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+-:-:-:-:00 IMAD.U32.U32 r, rs, param_magic_S, RZ;
+-:-:-:-:00 SHR.U32 r, r, param_shift_S;
+-:-:-:-:00 IMAD s, r, param_S, RZ;
+-:-:-:-:00 IADD s, -s, rs;
+// y = p * u - pad_h + r
+// z = m * w - pad_d + t
+-:-:-:-:00 IMAD pr, p,  param_str_h, RZ;
+-:-:-:-:00 IMAD mt, mm, param_str_d, RZ;
+-:-:-:-:00 IADD y, pr, -param_pad_h;
+-:-:-:-:00 IADD y, y, r;
+-:-:-:-:00 IADD z, mt, -param_pad_d;
+-:-:-:-:00 IADD z, z, t;
+// e = k * MPQN + m * PQN + p * QN + tidYY
+// tidYY = 0, 4
+-:-:-:-:00 LOP.AND tidYY, tidYY, 1;
+-:-:-:-:00 SHL     tidYY, tidYY, 2;
+-:-:-:-:00 IMAD.U32.U32 e, p,  param_QN, tidYY;
+-:-:-:-:00 IMAD.U32.U32 e, mm, param_PQN, e;
+-:-:-:-:00 IMAD.U32.U32 e, k,  param_MPQN, e;
+// i = c * DHWN + z * HWN + y * WN + tidYY
+-:-:-:-:00 IMAD.U32.U32 i, y, param_WN,   tidYY;
+-:-:-:-:00 IMAD.U32.U32 i, z, param_HWN,  i;
+-:-:-:-:00 IMAD.U32.U32 i, c, param_DHWN, i;
+// mode1
+// -:-:-:-:00 MOV tmp_param0, param_test[0];
+// -:-:-:-:00 MOV tmp_param1, param_test[1];
+// -:-:-:-:00 SHL tmp_shl, tid, 0x2;
+// -:-:-:-:00 IADD tmp_param00.CC, tmp_shl, tmp_param0;
+// -:-:-:-:00 IADD.X tmp_param01, RZ, tmp_param1;
+// -:-:-:-:00 I2F.F32.U32 i, i;
+// -:-:-:-:00 ST.E [tmp_param00], i;
+// -:-:-:-:00 EXIT;
+// bounds_yz = y < 0 || y >= H || z < 0 || z >= D ? -1 : 0
+-:-:-:-:00 ISET.LT.AND y0, y, RZ, PT;
+-:-:-:-:00 ISET.GE.AND yH, y, param_H, PT;
+-:-:-:-:00 ISET.LT.AND z0, z, RZ, PT;
+-:-:-:-:00 ISET.GE.AND zD, z, param_D, PT;
+-:-:-:-:00 LOP.OR bounds_yz, y0, yH;
+-:-:-:-:00 LOP.OR bounds_yz, bounds_yz, z0;
+-:-:-:-:00 LOP.OR bounds_yz, bounds_yz, zD;
+// doLoadCRST = crst < CRST && bounds_yz == 0
+-:-:-:-:00 ISETP.LT.AND P4, PT, crst, param_CRST, PT;
+-:-:-:-:00 ISETP.EQ.AND P4, PT, bounds_yz, RZ, P4;
+// p += grid_P
+// p = p + 1
+-:-:-:-:00 IADD p, p, param_grid_P;
+
+-:-:-:-:00 ISETP.LT.AND P6, PT, p, param_P, PT;
+
+NEXT_Q:
+
+// Zigzag q but only if grid_P < P
+-:-:-:-:00 LOP.AND p_and, p, 1;
+// useless?
+-:-:-:-:00 ISETP.NE.AND P1, PT, RZ, p, PT;
+// Q = 1
+-:-:-:-:00 MOV Q, param_grid_P;
+// P1 = (p != 0) && (grid_P < param_P); P1 ? Q = param_Q - q - 1 : Q = q
+-:-:-:-:00 ISETP.LT.AND P1, PT, Q, param_P, P1;
+-:-:-:-:00 MOV32I Q, -1;
+-:-:-:-:00 @P1 IADD tmp_data, -q, param_Q;
+-:-:-:-:00 @P1 IADD Q, tmp_data, Q;
+-:-:-:-:00 @!P1 MOV Q, q;
+// k < K
+-:-:-:-:00 ISETP.LT.AND P3, PT, k, param_K, PT;
+// qs = q * v - pad_w
+// x = qs + s
+-:-:-:-:00 IMAD qs, Q, param_str_w, RZ;
+-:-:-:-:00 IADD x, qs, -param_pad_w;
+-:-:-:-:00 IADD x, x, s;
+// bounds_x = x < 0 || x >= W ? -1 : 0
+-:-:-:-:00 ISET.LT.AND x0, x, RZ, PT;
+-:-:-:-:00 ISET.GE.AND xW, x, param_W, PT;
+-:-:-:-:00 LOP.OR bounds_x, x0, xW;
+// doLoad = crst < CRST && bounds_yz == 0 && bounds_x == 0
+-:-:-:-:00 ISETP.EQ.AND P2, PT, bounds_x, RZ, P4;
+// trackI = I + i + x * N
+-:-:-:-:00 IMAD ti, x, param_N, i;
+//-:-:-:-:00 LEA      trackI0.CC, ti, param_I[0],     2;
+//-:-:-:-:00 LEA.HI.X trackI1,    ti, param_I[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_I[0];
+-:-:-:-:00 MOV tmp_param1, param_I[1];
+-:-:-:-:00 SHL tmp_shl, ti, 0x2;
+-:-:-:-:00 IADD trackI0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackI1, RZ, tmp_param1;
+// trackE = E + e + q * N
+-:-:-:-:00 IMAD te, Q, param_N, e;
+//-:-:-:-:00 LEA      trackE0.CC, te, param_E[0],     2;
+//-:-:-:-:00 LEA.HI.X trackE1,    te, param_E[1], RZ, 2;
+-:-:-:-:00 MOV tmp_param0, param_E[0];
+-:-:-:-:00 MOV tmp_param1, param_E[1];
+-:-:-:-:00 SHL tmp_shl, te, 0x2;
+-:-:-:-:00 IADD trackE0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackE1, RZ, tmp_param1;
+// q += grid_Q
+// q = q + 1
+-:-:-:-:00 IADD q, q, param_grid_Q;
+-:-:-:-:00 ISETP.LT.AND P5, PT, q, param_Q, PT;
+
+-:-:-:-:00 @!P0 IADD loopN, loopN, param_N;
+
+-:-:-:-:00 @!P0 BRA.U NEXT_PQ;
+
+-:-:-:-:00 PSETP.AND.AND P0, PT, PT, PT, !PT;
+
+-:-:-:-:00 @P2 LD.E.CI.128 loadI0, [trackI + 4x<0>];
+-:-:-:-:00 @P2 LD.E.CI.128 loadI4, [trackI + 4x<8>];
+-:-:-:-:00 @!P2 LDS.128 loadI0, [RZ + addr_zero];
+-:-:-:-:00 @!P2 LDS.128 loadI4, [RZ + addr_zero];
+
+-:-:-:-:00 ISETP.LE.AND P1, PT, loopN, 32, PT;
+
+-:-:-:-:00 @P3 LD.E.CI.128 loadE0, [trackE + 4x<0>];
+-:-:-:-:00 @P3 LD.E.CI.128 loadE4, [trackE + 4x<8>];
+-:-:-:-:00 @!P3 LDS.128 loadE0, [RZ + addr_zero];
+-:-:-:-:00 @!P3 LDS.128 loadE4, [RZ + addr_zero];
+
+-:-:-:-:00 STS [writeS + 4x< 0*128>], loadI0;
+-:-:-:-:00 STS [writeS + 4x< 1*128>], loadI1;
+-:-:-:-:00 STS [writeS + 4x< 2*128>], loadI2;
+-:-:-:-:00 STS [writeS + 4x< 3*128>], loadI3;
+
+-:-:-:-:00 STS [writeS + 4x< 8*128 + 16>], loadI4;
+-:-:-:-:00 STS [writeS + 4x< 9*128 + 16>], loadI5;
+-:-:-:-:00 STS [writeS + 4x<10*128 + 16>], loadI6;
+-:-:-:-:00 STS [writeS + 4x<11*128 + 16>], loadI7;
+
+-:-:-:-:00 IADD   trackI0.CC, trackI0, 4x<16>;
+-:-:-:-:00 PSETP.AND.AND P5, PT, P1, P5, PT;
+
+-:-:-:-:00 STS [writeS + 4x< 0*128 + szBuf>], loadE0;
+-:-:-:-:00 STS [writeS + 4x< 1*128 + szBuf>], loadE1;
+-:-:-:-:00 STS [writeS + 4x< 2*128 + szBuf>], loadE2;
+-:-:-:-:00 STS [writeS + 4x< 3*128 + szBuf>], loadE3;
+
+-:-:-:-:00 PSETP.AND.AND P6, PT, P1, P6, PT;
+
+-:-:-:-:00 STS [writeS + 4x< 8*128 + szBuf + 16>], loadE4;
+-:-:-:-:00 STS [writeS + 4x< 9*128 + szBuf + 16>], loadE5;
+-:-:-:-:00 STS [writeS + 4x<10*128 + szBuf + 16>], loadE6;
+-:-:-:-:00 STS [writeS + 4x<11*128 + szBuf + 16>], loadE7;
+
+-:-:-:-:00 IADD.X trackI1, trackI1, RZ;
+
+-:-:-:-:00 IADD trackE0.CC, trackE0, 4x<16>;
+
+-:-:-:-:00 IADD readEs,  readEs, -swapBuf;
+-:-:-:-:00 IADD readIs,  readIs, -swapBuf;
+-:-:-:-:00 BAR.SYNC 0;
+-:-:-:-:00 IADD writeS, writeS, swapBuf;
+-:-:-:-:00 IADD swapBuf, RZ, -swapBuf;
+
+-:-:-:-:00 IADD.X trackE1, trackE1, RZ;
+
+-:-:-:-:00 @P2 LD.E.CI.128 loadI0, [trackI + 4x<0>];
+-:-:-:-:00 @P2 LD.E.CI.128 loadI4, [trackI + 4x<8>];
+-:-:-:-:00 @P3 LD.E.CI.128 loadE0, [trackE + 4x<0>];
+-:-:-:-:00 @P3 LD.E.CI.128 loadE4, [trackE + 4x<8>];
+
+-:-:-:-:00 @P2 IADD   trackI0.CC, trackI0, 4x<16>;
+-:-:-:-:00 @P2 IADD.X trackI1, trackI1, RZ;
+-:-:-:-:00 @P3 IADD   trackE0.CC, trackE0, 4x<16>;
+-:-:-:-:00 @P3 IADD.X trackE1, trackE1, RZ;
+
+-:-:-:-:00 @P5 BRA.U NEXT_Q;
+-:-:-:-:00 @P6 BRA.U NEXT_P;
+
+-:-:-:-:00 ISETP.LT.AND P5, PT, q, param_Q, PT;
+-:-:-:-:00 ISETP.LT.AND P6, PT, p, param_P, PT;
+
+NEXT_PQ:
+
+-:-:-:-:00 LDS.128 j0Ex0, [readEs + 4x<0*128 + 00>];
+-:-:-:-:00 LDS.128 j0Iy0, [readIs + 4x<0*128 + 00>];
+-:-:-:-:00 LDS.128 j0Ex4, [readEs + 4x<0*128 + 64>];
+-:-:-:-:00 LDS.128 j0Iy4, [readIs + 4x<0*128 + 64>];
+
+// P0 loop N
+// P2 bounds I
+// P3 bounds E
+// P4 bounds yz
+// P5 loop Q
+// P6 loop P
+
+//loop = N >= 16 && (N >= 32 || (!p5 && !p6))
+
+NEXT_16N:
+
+<CODE>
+
+    # Build the body of the NEXT_16N loop and return it as a single string.
+    #
+    # The loop is unrolled into 16 j-steps of 64 FFMAs each. %insert maps a
+    # slot name "j<J>c<C>" to the instruction text that is emitted right after
+    # FFMA number C of step J; this is how the shared stores, global loads,
+    # pointer updates, predicate updates and branches are interleaved with
+    # the math.
+    my %insert =
+    (
+        j0c8   => "-:-:-:-:00 IADD loopN, loopN, -16;\n",
+        j0c14  => "-:-:-:-:00 ISETP.GE.AND P0, PT, loopN, 16, PT;\n",
+
+        j4c8   => "-:-:-:-:00 \@P0 STS [writeS + 4x< 0*128>], loadI0;\n",
+        j4c10  => "-:-:-:-:00 \@P0 STS [writeS + 4x< 1*128>], loadI1;\n",
+        j4c12  => "-:-:-:-:00 \@P0 STS [writeS + 4x< 2*128>], loadI2;\n",
+        j4c14  => "-:-:-:-:00 \@P0 STS [writeS + 4x< 3*128>], loadI3;\n",
+
+        j5c8   => "-:-:-:-:00 \@P0 STS [writeS + 4x< 8*128 + 16>], loadI4;\n",
+        j5c10  => "-:-:-:-:00 \@P0 STS [writeS + 4x< 9*128 + 16>], loadI5;\n",
+        j5c12  => "-:-:-:-:00 \@P0 STS [writeS + 4x<10*128 + 16>], loadI6;\n",
+        j5c14  => "-:-:-:-:00 \@P0 STS [writeS + 4x<11*128 + 16>], loadI7;\n",
+
+        j5c16  => "-:-:-:-:00 ISETP.GE.AND P2, PT, loopN, 32, P2;\n",
+
+        j5c60  => "-:-:-:-:00 \@P2 LD.E.CI.128 loadI0, [trackI + 4x<0>];\n",
+        j5c62  => "-:-:-:-:00 \@P2 LD.E.CI.128 loadI4, [trackI + 4x<8>];\n",
+
+        j6c16  => "-:-:-:-:00 \@!P2 LDS.128 loadI0, [RZ + addr_zero];\n",
+        j7c16  => "-:-:-:-:00 \@!P2 LDS.128 loadI4, [RZ + addr_zero];\n",
+
+        j10c57 => "-:-:-:-:00 \@P2 IADD   trackI0.CC, trackI0, 4x<16>;\n",
+        j10c62 => "-:-:-:-:00 \@P2 IADD.X trackI1,    trackI1, RZ;\n",
+
+        j12c8  => "-:-:-:-:00 \@P0 STS [writeS + 4x<0*128 + szBuf>], loadE0;\n",
+        j12c10 => "-:-:-:-:00 \@P0 STS [writeS + 4x<1*128 + szBuf>], loadE1;\n",
+        j12c12 => "-:-:-:-:00 \@P0 STS [writeS + 4x<2*128 + szBuf>], loadE2;\n",
+        j12c14 => "-:-:-:-:00 \@P0 STS [writeS + 4x<3*128 + szBuf>], loadE3;\n",
+
+        j13c8  => "-:-:-:-:00 \@P0 STS [writeS + 4x<8*128 + szBuf + 16>], loadE4;\n",
+        j13c10 => "-:-:-:-:00 \@P0 STS [writeS + 4x<9*128 + szBuf + 16>], loadE5;\n",
+        j13c12 => "-:-:-:-:00 \@P0 STS [writeS + 4x<10*128 + szBuf + 16>], loadE6;\n",
+        j13c14 => "-:-:-:-:00 \@P0 STS [writeS + 4x<11*128 + szBuf + 16>], loadE7;\n",
+
+        j13c16 => "-:-:-:-:00 ISETP.GE.AND P3, PT, loopN, 32, P3;\n",
+
+        j13c60 => "-:-:-:-:00 \@P3 LD.E.CI.128 loadE0, [trackE + 4x<0>];\n",
+        j13c62 => "-:-:-:-:00 \@P3 LD.E.CI.128 loadE4, [trackE + 4x<8>];\n",
+
+        # The "@" is escaped here like in every other entry; the emitted
+        # text is still the literal predicate "@!P3".
+        j14c16 => "-:-:-:-:00 \@!P3 LDS.128 loadE0, [RZ + addr_zero];\n",
+        j15c16 => "-:-:-:-:00 \@!P3 LDS.128 loadE4, [RZ + addr_zero];\n",
+
+        j15c57 => "-:-:-:-:00 \@P3 IADD   trackE0.CC, trackE0, 4x<16>;\n",
+        j15c62 => "-:-:-:-:00 \@P3 IADD.X trackE1,    trackE1, RZ;\n",
+
+        j14c63 => "-:-:-:-:00 \@P0 BAR.SYNC 0;\n" .
+                  "-:-:-:-:00 \@P0 IADD readEs, readEs, -swapBuf;\n" .
+                  "-:-:-:-:00 \@P0 IADD readIs, readIs, -swapBuf;\n" .
+                  "-:-:-:-:00 \@P0 IADD writeS, writeS,  swapBuf;\n" .
+                  "-:-:-:-:00 \@P0 IADD swapBuf, RZ,    -swapBuf;\n",
+
+        j15c24 => "-:-:-:-:00 ISETP.GT.AND P1, PT, loopN, 32, PT;\n",
+        j15c37 => "-:-:-:-:00 PSETP.AND.OR P1, PT, !P5, !P6, P1;\n",
+        j15c50 => "-:-:-:-:00 PSETP.AND.AND P0, PT, P0, P1, PT;\n",
+
+        j15c63 => "-:-:-:-:00 \@P0 BRA.U NEXT_16N;\n" .
+                  "-:-:-:-:00 \@P5 BRA.U NEXT_Q;\n" .
+                  "-:-:-:-:00 \@P6 BRA.U NEXT_P;\n",
+    );
+
+    # Visit order of the 8x8 accumulator tile. x advances in pairs (0,2,4,6),
+    # y walks (0,1,4,5) and reverses direction for every new x pair, and each
+    # (x,y) seed expands to four cells through the @swirl offsets, giving 64
+    # entries in total.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    # Every instruction emitted by this block carries the same fixed control
+    # prefix, so no per-slot stall, yield or wait value is computed.
+    my $ctrl = "-:-:-:-:00";
+
+    my $out = '';
+    foreach my $j (0 .. 15)
+    {
+        # FFMAs of step j read the j<odd> operand registers while the LDS
+        # instructions inserted in this step fill the other (j<nOdd>) set.
+        my $odd      = $j & 1;
+        my $nOdd     = 1 - $odd;
+
+        # Row fetched for the following step; wraps to row 0 on the last
+        # step, where the loads are predicated on P0.
+        my $rsOffset = ($j + 1) & 15;
+        my $rsPred   = $j == 15 ? '@P0' : '   ';
+
+        # Extra offset, in units of 16, added to the shared read address:
+        # 0 for rows 0-3, 1 for rows 4-11, 2 for rows 12-15.
+        my $shift    = $rsOffset < 4 ? 0 : $rsOffset < 12 ? 1 : 2;
+
+        $insert{"j${j}c0"} = sprintf "-:-:-:-:00 %s LDS.128 j%dEx0, [readEs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c2"} = sprintf "-:-:-:-:00 %s LDS.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c4"} = sprintf "-:-:-:-:00 %s LDS.128 j%dEx4, [readEs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c6"} = sprintf "-:-:-:-:00 %s LDS.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            # Text scheduled right after this FFMA, if any.
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            $out .= sprintf "%s FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+// Output-stage setup: re-read the thread/block indices, derive the shared
+// offsets used to shuffle the accumulators (writeCs/readCs), the crst row
+// indices, the k column index and the four 64-bit track pointers into F.
+-:-:-:-:00 S2R tid,  SR_TID.X;
+-:-:-:-:00 S2R blkI, SR_CTAID.Y;
+-:-:-:-:00 S2R blkE, SR_CTAID.Z;
+
+// Rebase the shared read offsets: readEs always drops the szBuf offset, and
+// both read offsets additionally drop swapBuf when swapBuf is positive.
+-:-:-:-:00 ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+-:-:-:-:00 IADD readEs,  readEs, -4x<szBuf>;
+-:-:-:-:00 @P0 IADD readIs,  readIs, -swapBuf;
+-:-:-:-:00 @P0 IADD readEs,  readEs, -swapBuf;
+
+// writeCs = (readIs / 4) * 128 + readEs;
+-:-:-:-:00 ISCADD  writeCs, readIs, readEs, 5;
+
+// Split tid into its low five bits, bits 5-6 and bit 7.
+-:-:-:-:00 LOP.AND tid31,  tid,  31;
+-:-:-:-:00 LOP.AND tid96,  tid,  96;
+-:-:-:-:00 LOP.AND t128,   tid, 128;
+
+// kk = tid31 | (t128 >> 2);
+-:-:-:-:00 SHR.U32  kk, t128, 2;
+-:-:-:-:00 LOP.OR   kk, tid31,  kk;
+
+// readCs = ((tid96 << 4) | kk) << 2;
+-:-:-:-:00 SHL     readCs, tid96,  4;
+-:-:-:-:00 LOP.OR  readCs, readCs, kk;
+-:-:-:-:00 SHL     readCs, readCs, 2;
+
+// kk += blkE*128;
+-:-:-:-:00 ISCADD  kk, blkE, kk, 7;
+
+// crst = blkI*128 + (tid96 >> 1)
+// crst04/08/12 are the same index offset by 4, 8 and 12.
+-:-:-:-:00 SHR.U32 crst00, tid96, 1;
+-:-:-:-:00 ISCADD  crst00, blkI, crst00, 7;
+-:-:-:-:00 IADD    crst04, crst00,  4;
+-:-:-:-:00 IADD    crst08, crst00,  8;
+-:-:-:-:00 IADD    crst12, crst00,  12;
+
+// K1 = K * 4, K4 = K * 16, K60 = K * 256 - K4 = K * 240
+-:-:-:-:00 MOV K, param_K;
+-:-:-:-:00 SHL K1, K, 2;
+-:-:-:-:00 SHL K4, K, 4;
+-:-:-:-:00 ISCADD K60, K, -K4, 8;
+
+// trackF += crst*K + k;
+-:-:-:-:00 IMAD tmp_param0, crst00, K, RZ;
+-:-:-:-:00 IADD tf, tmp_param0, kk;
+//-:-:-:-:00 LEA      track00F0.CC, tf, param_F[0],     0x2;
+//-:-:-:-:00 LEA.HI.X track00F1,    tf, param_F[1], RZ, 0x2;
+// Replacement for the two LEA instructions above: track00F = param_F + (tf << 2).
+// NOTE(review): the shift is done in 32 bits and only the carry of the low
+// add reaches track00F1, so bits shifted out of tf are not added to the high
+// word as LEA.HI.X would - confirm tf * 4 always fits in 32 bits.
+-:-:-:-:00 MOV tmp_param0, param_F[0];
+-:-:-:-:00 MOV tmp_param1, param_F[1];
+-:-:-:-:00 SHL tmp_shl, tf, 0x2;
+-:-:-:-:00 IADD track00F0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X track00F1, RZ, tmp_param1;
+
+// kk < K
+// P5 = kk < K, P6 = kk + 64 < K
+-:-:-:-:00 ISETP.LT.AND P5, PT, kk, param_K, PT;
+-:-:-:-:00 IADD kk, kk, 64;
+-:-:-:-:00 ISETP.LT.AND P6, PT, kk, param_K, PT;
+
+-:-:-:-:00 MOV alpha, param_alpha;
+
+// track04F, track08F and track12F each sit K4 bytes after the previous pointer.
+-:-:-:-:00 IADD   track04F0.CC, track00F0, K4;
+-:-:-:-:00 IADD.X track04F1,    track00F1, RZ;
+-:-:-:-:00 IADD   track08F0.CC, track04F0, K4;
+-:-:-:-:00 IADD.X track08F1,    track04F1, RZ;
+-:-:-:-:00 IADD   track12F0.CC, track08F0, K4;
+-:-:-:-:00 IADD.X track12F1,    track08F1, RZ;
+
+-:-:-:-:00 BAR.SYNC 0;
+
+<CODE>
+
+    # Emit the output stage for the eight accumulator rows y = 0..7.
+    #
+    # Each row scales cx0y<row>..cx7y<row> by alpha into c0..c7 and then
+    # calls STORE_C. Before row 4, the four track pointers are advanced by
+    # K60 and the four crst counters by 60.
+    my @lines;
+    for my $row (0 .. 7)
+    {
+        if ($row == 4)
+        {
+            for my $t (qw(00 04 08 12))
+            {
+                push @lines,
+                    "-:-:-:-:00 IADD   track${t}F0.CC, track${t}F0, K60;\n",
+                    "-:-:-:-:00 IADD   crst${t},       crst${t},     60;\n",
+                    "-:-:-:-:00 IADD.X track${t}F1,    track${t}F1,  RZ;\n";
+            }
+            push @lines, "\n";
+        }
+
+        # One FMUL per x column of this row.
+        push @lines, "-:-:-:-:00 FMUL c$_, cx${_}y$row, alpha;\n" for 0 .. 7;
+
+        push @lines, "-:-:-:-:00 CAL STORE_C;\n\n";
+    }
+    return join '', @lines;
+
+</CODE>
+
+-:-:-:-:00 EXIT;
+
+// STORE_C: accumulate one row of scaled results (c0-c7) into F.
+// Reads P5/P6 (column bounds) and the crstNN counters, clobbers P0-P3 and
+// c0-c7, and on return has advanced every crstNN by 1 and every track
+// pointer by K1.
+STORE_C:
+
+// P0-P3 gate the four row groups; each crstNN is bumped for the next call.
+-:-:-:-:00 ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K
+-:-:-:-:00 IADD         crst00, crst00, 1;
+-:-:-:-:00 ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K
+-:-:-:-:00 IADD         crst04, crst04, 1;
+-:-:-:-:00 ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K
+-:-:-:-:00 IADD         crst08, crst08, 1;
+-:-:-:-:00 ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K
+-:-:-:-:00 IADD         crst12, crst12, 1;
+
+// Warp shuffle to drop the awkward readAs/readBs mapping
+// (done with STS.128 to writeCs followed by LDS from readCs, i.e. through
+// shared memory rather than with a shuffle instruction)
+-:-:-:-:00 STS.128 [writeCs+4x<00>], c0;
+-:-:-:-:00 STS.128 [writeCs+4x<64>], c4;
+
+-:-:-:-:00 LDS c0, [readCs + 4x<0*128 + 00>];
+-:-:-:-:00 LDS c2, [readCs + 4x<1*128 + 00>];
+-:-:-:-:00 LDS c4, [readCs + 4x<2*128 + 00>];
+-:-:-:-:00 LDS c6, [readCs + 4x<3*128 + 00>];
+
+// Add the first column into F, then narrow each predicate with P6 so the
+// second column (64 elements further on) is only written when it is in range.
+-:-:-:-:00 @P0 RED.E.ADD.F32.FTZ.RN [track00F], c0;
+-:-:-:-:00 PSETP.AND.AND P0, PT, P0, P6, PT;
+-:-:-:-:00 @P1 RED.E.ADD.F32.FTZ.RN [track04F], c2;
+-:-:-:-:00 PSETP.AND.AND P1, PT, P1, P6, PT;
+-:-:-:-:00 @P2 RED.E.ADD.F32.FTZ.RN [track08F], c4;
+-:-:-:-:00 PSETP.AND.AND P2, PT, P2, P6, PT;
+-:-:-:-:00 @P3 RED.E.ADD.F32.FTZ.RN [track12F], c6;
+-:-:-:-:00 PSETP.AND.AND P3, PT, P3, P6, PT;
+
+-:-:-:-:00 LDS c1, [readCs + 4x<0*128 + 64>];
+-:-:-:-:00 LDS c3, [readCs + 4x<1*128 + 64>];
+-:-:-:-:00 LDS c5, [readCs + 4x<2*128 + 64>];
+-:-:-:-:00 LDS c7, [readCs + 4x<3*128 + 64>];
+
+-:-:-:-:00 @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<64>], c1;
+-:-:-:-:00 @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<64>], c3;
+-:-:-:-:00 @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<64>], c5;
+-:-:-:-:00 @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<64>], c7;
+
+// Step all four 64-bit track pointers forward by K1 for the next call.
+-:-:-:-:00 IADD   track00F0.CC, track00F0, K1;
+-:-:-:-:00 IADD.X track00F1,    track00F1, RZ;
+-:-:-:-:00 IADD   track04F0.CC, track04F0, K1;
+-:-:-:-:00 IADD.X track04F1,    track04F1, RZ;
+-:-:-:-:00 IADD   track08F0.CC, track08F0, K1;
+-:-:-:-:00 IADD.X track08F1,    track08F1, RZ;
+-:-:-:-:00 IADD   track12F0.CC, track12F0, K1;
+-:-:-:-:00 IADD.X track12F1,    track12F1, RZ;
+
+-:-:-:-:00 RET;
diff --git a/Kernel/Convolution/Maxwell/hconv_bprop_C1_N64.sass b/Kernel/Convolution/Maxwell/hconv_bprop_C1_N64.sass
new file mode 100644
index 0000000..fb00d82
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/hconv_bprop_C1_N64.sass
@@ -0,0 +1,663 @@
+# Kernel: hconv_bprop_C32_N64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    # Pick the conversion instruction and the 16-bit load type once, based on
+    # the $int16 switch, and expose both through accessor subs for use in the
+    # inline template sections.
+    our $int16;
+    our ($convert, $dtype) = $int16
+        ? ('I2F.F32.S16', 'S16')
+        : ('F2F.F32.F16', 'U16');
+
+    sub convert { return $convert; }
+    sub dtype   { return $dtype;   }
+-]
+
+<CONSTANT_MAPPING>
+    addr_lut : 4x<64*4>
+
+    param_I[0]         : c[0x0][0x140]
+    param_I[1]         : c[0x0][0x144]
+    param_E[0]         : c[0x0][0x148]
+    param_E[1]         : c[0x0][0x14c]
+    param_F[0]         : c[0x0][0x150]
+    param_F[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_N            : c[0x0][0x15c]
+    param_K            : c[0x0][0x160]
+    param_D            : c[0x0][0x164]
+    param_H            : c[0x0][0x168]
+    param_W            : c[0x0][0x16c]
+    param_WN           : c[0x0][0x170]
+    param_HWN          : c[0x0][0x174]
+    param_DHWN         : c[0x0][0x178]
+    param_C            : c[0x0][0x17c]
+    param_CRST         : c[0x0][0x180]
+    param_RST          : c[0x0][0x184]
+    param_magic_RST    : c[0x0][0x188]
+    param_shift_RST    : c[0x0][0x18c]
+    param_RS           : c[0x0][0x190]
+    param_magic_RS     : c[0x0][0x194]
+    param_shift_RS     : c[0x0][0x198]
+    param_S            : c[0x0][0x19c]
+    param_magic_S      : c[0x0][0x1a0]
+    param_shift_S      : c[0x0][0x1a4]
+    param_pad_d        : c[0x0][0x1a8]
+    param_pad_h        : c[0x0][0x1ac]
+    param_pad_w        : c[0x0][0x1b0]
+    param_str_d        : c[0x0][0x1b4]
+    param_str_h        : c[0x0][0x1b8]
+    param_str_w        : c[0x0][0x1bc]
+    param_Q            : c[0x0][0x1c0]
+    param_PQ           : c[0x0][0x1c4]
+    param_QN           : c[0x0][0x1c8]
+    param_PQN          : c[0x0][0x1cc]
+    param_MPQN         : c[0x0][0x1d0]
+    param_magic_Q      : c[0x0][0x1d4]
+    param_shift_Q      : c[0x0][0x1d8]
+    param_magic_PQ     : c[0x0][0x1dc]
+    param_shift_PQ     : c[0x0][0x1e0]
+    param_CRST8        : c[0x0][0x1e4]
+    param_MPQN8        : c[0x0][0x1e8]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+      64-67 ~ tid, blkE, blkF, blkMPQ
+
+     68-119 ~ k<0|4>, tidFX, tidEX, tid1, tid7, m, p, q, crst, n, tf<0|4>, te, te<0|4>, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+      64-79 : j0Ex<0-7>, j0Fy<0-7>
+      80-95 : j1Ex<0-7>, j1Fy<0-7>
+
+     96-103 : load0F<0-3>, load4F<0-3>
+     96-103 : store0F<0-3>, store4F<0-3>
+
+    104-107 : load0E<0-3>
+    104-107 : store0E<0-3>
+    112-115 : store0E<4-7>
+
+    108-111 : load4E<0-3>
+    108-111 : store4E<0-3>
+    112-115 : store4E<4-7>
+
+    116-119 : track0F<0-1>, track4F<0-1>
+    120-123 : track0E<0-1>, track4E<0-1>
+
+    124-127 ~ writeEs, writeFs, swapBuf, K
+    128-132 ~ readEs, readFs, mt, pr, qs
+
+     68-71  ~ lutStore, sliceI
+     72-132 ~ warp_cnt, rst, rs, t, r, s, x, y, z, x0, xW, y0, yH, z0, zD
+
+     72-93  : c<0-7>, cs<0-3>, trackI<0-1>, track00I<0-1>, track04I<0-1>, track08I<0-1>, track12I<0-1>
+     94-127 ~ crst<00|04|08|12>, c<00|04|08|12>, lut<00|04|08|12>, chan<00|04|08|12>, img<00|04|08|12>, writeCs, readCs, RST, DHWN1, alpha, nn, tid31
+
+</REGISTER_MAPPING>
+
+// Read the thread index and the three block indices.
+--:-:1:-:1      S2R tid,    SR_TID.X;
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X;
+--:-:3:-:1      S2R blkF,   SR_CTAID.Y;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+// tidFX  = (tid & 7) << 2
+// tidEX  = (tid & 7) << 3
+// k      = tid >> 3
+// k0 = k, k4 = k + 4
+01:-:-:-:1      LOP.AND tid7,  tid,  7;
+--:-:-:-:1      SHL     tidFX, tid7, 2;
+--:-:-:-:1      SHL     tidEX, tid7, 3;
+--:-:-:-:1      SHR.U32 k0,    tid,  3;
+--:-:-:-:1      IADD    k4,    k0,   4;
+
+--:-:-:-:1      MOV K, param_K;
+
+// Zero the accumulators: store a zero vector at shared address 0 and load
+// it back into czero00, czero04, ... czero60 (registers 0-63, which the
+// register mapping also assigns to the cx<x>y<y> accumulators).
+--:-:-:-:1      STS.128 [RZ], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [RZ];\n", $_ * 4), 0..15;
+</CODE>
+
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_PQ;
+
+// P1/P2 select the multiply-then-shift division path; when the magic value
+// is 1 only the shift is applied.
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT;
+
+// m = blkMPQ / PQ
+02:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ % PQ
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = blockPQ / Q
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = blockPQ % Q
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+
+// mt = m * w - pad_d
+// pr = p * u - pad_h
+// qs = q * v - pad_w
+--:-:-:-:1      XMAD mt, m,   param_str_d, RZ;
+--:-:-:-:1      XMAD pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD qs, q,   param_str_w, RZ;
+--:-:-:-:1      IADD mt, mt, -param_pad_d;
+--:-:-:-:1      IADD pr, pr, -param_pad_h;
+--:-:-:-:1      IADD qs, qs, -param_pad_w;
+
+// crst = blkF*32 + tidFX
+// n    = blkE*64 + tidEX
+04:-:-:-:1      ISCADD crst, blkF, tidFX, 5;
+08:-:-:-:1      ISCADD n,    blkE, tidEX, 6;
+
+// trackF = k*CRST + crst
+// (element index scaled by 2 and added to the 64-bit base param_F)
+--:-:-:-:1      XMAD     tf0, k0, param_CRST, crst;
+--:-:-:-:1      XMAD     tf4, k4, param_CRST, crst;
+--:-:-:-:1      LEA      track0F0.CC, tf0, param_F[0],     1;
+--:-:-:-:1      LEA.HI.X track0F1,    tf0, param_F[1], RZ, 1;
+--:-:-:-:1      LEA      track4F0.CC, tf4, param_F[0],     1;
+--:-:-:-:1      LEA.HI.X track4F1,    tf4, param_F[1], RZ, 1;
+
+// trackE = k*MPQN + m*PQN + p*QN + q*N + n
+// (element index scaled by 2 and added to the 64-bit base param_E)
+--:-:-:-:1      XMAD      te,  q,  param_N,    n;
+--:-:-:-:1      XMAD.LO2C te,  p,  param_QN,   te;
+--:-:-:-:1      XMAD.LO2C te,  m,  param_PQN,  te;
+--:-:-:-:1      XMAD.LO2C te0, k0, param_MPQN, te;
+--:-:-:-:1      XMAD.LO2C te4, k4, param_MPQN, te;
+--:-:-:-:1      LEA       track0E0.CC, te0, param_E[0],     1;
+--:-:-:-:1      LEA.HI.X  track0E1,    te0, param_E[1], RZ, 1;
+--:-:-:-:1      LEA       track4E0.CC, te4, param_E[0],     1;
+--:-:-:-:1      LEA.HI.X  track4E1,    te4, param_E[1], RZ, 1;
+
+// P1 = crst < CRST
+// P2 = n    < N
+// (no P3 is computed in this block; only P1 and P2 are set here)
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst, param_CRST, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, n,    param_N,    PT;
+
+// Remap the EX dim to avoid bank conflicts when storing to shared
+// We can unmap this in the output
+
+// writeFs = (32*k + tidFX) * 4
+--:-:-:-:1      ISCADD  writeFs, k0, tidFX, 5;
+--:-:-:-:1      SHL     writeFs, writeFs,   2;
+// writeEs = (64*k + tidFX) * 4 (tidFX here not a bug)
+// plus the constant byte offset added by the second ISCADD
+--:-:-:-:1      ISCADD  writeEs, k0, tidFX, 6;
+--:-:-:-:1      ISCADD  writeEs, writeEs, 4x<32*8>, 2;
+
+// readFs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readEs = ((tid >> 1) & 7) << 4
+// plus the same constant byte offset that is added to writeEs
+--:-:-:-:1      BFE.U32 readEs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<32*8>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<32*8 + 64*8>;
+</SCHEDULE_BLOCK>
+
+// Prologue: fetch the first slice of F and E, convert it to F32, store it to
+// shared memory, flip the write buffers, preload the j0 operands and issue
+// the loads for the second slice. K is decremented by 8 before each slice.
+--:-:-:-:0      IADD K, K, -8;
+
+// F: four 16-bit loads per track pointer, guarded by P1.
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F0, [track0F + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F1, [track0F + 2x<1>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F2, [track0F + 2x<2>];
+--:-:1:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F3, [track0F + 2x<3>];
+
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F0, [track4F + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F1, [track4F + 2x<1>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F2, [track4F + 2x<2>];
+--:-:2:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F3, [track4F + 2x<3>];
+
+// P1 stays set for the next slice only if K is still positive.
+--:-:-:-:0      ISETP.GT.AND P1, PT, K, RZ, P1;
+
+// E: one 128-bit load per track pointer, guarded by P2.
+--:-:3:-:1  @P2 LDG.E.128 load0E0, [track0E];
+--:-:4:-:1  @P2 LDG.E.128 load4E0, [track4E];
+
+// P2 stays set for the next slice only if K is still positive.
+--:-:-:-:0      ISETP.GT.AND P2, PT, K, RZ, P2;
+
+// Convert the F values, advance track0F by param_CRST8 and store to shared.
+01:-:-:-:1      [+ convert() +] store0F0, load0F0;
+--:-:-:-:1      [+ convert() +] store0F1, load0F1;
+--:-:-:-:1      [+ convert() +] store0F2, load0F2;
+--:-:1:-:1      [+ convert() +] store0F3, load0F3;
+--:-:-:-:6      IADD   track0F0.CC, track0F0, param_CRST8;
+--:-:-:-:0      IADD.X track0F1,    track0F1, RZ;
+01:-:-:-:1      STS.128 [writeFs + 4x<0*32>], store0F;
+
+02:-:-:-:1      [+ convert() +] store4F0, load4F0;
+--:-:-:-:1      [+ convert() +] store4F1, load4F1;
+--:-:-:-:1      [+ convert() +] store4F2, load4F2;
+--:-:2:-:1      [+ convert() +] store4F3, load4F3;
+--:-:-:-:6      IADD   track4F0.CC, track4F0, param_CRST8;
+--:-:-:-:0      IADD.X track4F1,    track4F1, RZ;
+02:-:-:-:1      STS.128 [writeFs + 4x<4*32>], store4F;
+
+// Convert both 16-bit halves of each loaded E register (eight values per
+// track pointer), advance track0E by param_MPQN8 and store to shared.
+04:-:-:-:1      [+ convert() +] store0E7, load0E3.H1;
+--:-:-:-:1      [+ convert() +] store0E6, load0E3.H0;
+--:-:-:-:1      [+ convert() +] store0E5, load0E2.H1;
+--:-:1:-:1      [+ convert() +] store0E4, load0E2.H0;
+--:-:-:-:1      [+ convert() +] store0E3, load0E1.H1;
+--:-:-:-:1      [+ convert() +] store0E2, load0E1.H0;
+--:-:-:-:1      [+ convert() +] store0E1, load0E0.H1;
+--:-:2:-:1      [+ convert() +] store0E0, load0E0.H0;
+--:-:-:-:6      IADD   track0E0.CC, track0E0, param_MPQN8;
+--:-:-:-:0      IADD.X track0E1,    track0E1, RZ;
+01:-:-:-:1      STS.128 [writeEs + 4x<0*64 + 32>], store0E4;
+02:1:-:-:2      STS.128 [writeEs + 4x<0*64 +  0>], store0E0;
+
+09:-:-:-:1      [+ convert() +] store4E7, load4E3.H1;
+--:-:-:-:1      [+ convert() +] store4E6, load4E3.H0;
+--:-:-:-:1      [+ convert() +] store4E5, load4E2.H1;
+--:-:1:-:1      [+ convert() +] store4E4, load4E2.H0;
+--:-:-:-:1      [+ convert() +] store4E3, load4E1.H1;
+--:-:-:-:1      [+ convert() +] store4E2, load4E1.H0;
+--:-:-:-:1      [+ convert() +] store4E1, load4E0.H1;
+--:-:2:-:1      [+ convert() +] store4E0, load4E0.H0;
+--:-:-:-:6      IADD   track4E0.CC, track4E0, param_MPQN8;
+--:-:-:-:0      IADD.X track4E1,    track4E1, RZ;
+01:-:-:-:1      STS.128 [writeEs + 4x<4*64 + 32>], store4E4;
+02:1:-:-:2      STS.128 [writeEs + 4x<4*64 +  0>], store4E0;
+
+
+// Point the write offsets at the other buffer and negate swapBuf so that the
+// next swap moves them back.
+01:-:-:-:1      IADD writeEs, writeEs, swapBuf;
+--:-:-:-:1      IADD writeFs, writeFs, swapBuf;
+--:-:-:-:2      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD K, K, -8;
+
+// Preload the j0 operand registers for the first step of the main loop.
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*32 + 16>];
+
+// Issue the global loads for the second slice; they are converted and
+// stored inside the NEXT_8K loop.
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F0, [track0F + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F1, [track0F + 2x<1>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F2, [track0F + 2x<2>];
+--:-:2:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F3, [track0F + 2x<3>];
+
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F0, [track4F + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F1, [track4F + 2x<1>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F2, [track4F + 2x<2>];
+--:-:3:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F3, [track4F + 2x<3>];
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, K, RZ, P1;
+
+--:-:4:-:1  @P2 LDG.E.128 load0E0, [track0E];
+--:-:5:-:1  @P2 LDG.E.128 load4E0, [track4E];
+
+--:-:-:-:2      ISETP.GT.AND P2, PT, K, RZ, P2;
+
+// Main loop: P0 is true while at least one more slice remains to be consumed.
+NEXT_8K:
+--:-:-:-:1      ISETP.GT.AND P0, PT, K, -8, PT;
+
+[+
+    # Build the body of the NEXT_8K loop and return it as a single string.
+    #
+    # The loop is unrolled into 8 j-steps of 64 FFMAs each. %insert maps a
+    # slot name "j<J>c<C>" to the instruction text emitted right after FFMA
+    # number C of step J, which interleaves the conversions, shared stores,
+    # global loads, pointer updates and the loop branch with the math.
+    our $convert;
+    our $dtype;
+    my %insert =
+    (
+        j0c8  => "--:-:-:-:1      IADD K, K, -8;\n",
+
+        j0c12 => "02:-:-:-:1  \@P0 $convert store0F0, load0F0;\n",
+        j0c16 => "--:-:-:-:1  \@P0 $convert store0F1, load0F1;\n",
+        j0c20 => "--:-:-:-:1  \@P0 $convert store0F2, load0F2;\n",
+        j0c24 => "--:-:2:-:1  \@P0 $convert store0F3, load0F3;\n",
+        j0c26 => "--:-:-:-:1  \@P0 IADD   track0F0.CC, track0F0, param_CRST8;\n",
+        j0c31 => "--:-:-:-:1  \@P0 IADD.X track0F1,    track0F1, RZ;\n",
+        j0c38 => "02:2:-:-:1  \@P0 STS.128 [writeFs + 4x<0*32>], store0F;\n",
+        j1c8  => "02:-:-:-:1  \@P1 LDG.E.CI.$dtype load0F0, [track0F + 2x<0>];\n",
+        j1c10 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype load0F1, [track0F + 2x<1>];\n",
+        j1c12 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype load0F2, [track0F + 2x<2>];\n",
+        j1c14 => "--:-:2:-:1  \@P1 LDG.E.CI.$dtype load0F3, [track0F + 2x<3>];\n",
+
+        j2c12 => "04:-:-:-:1  \@P0 $convert store4F0, load4F0;\n",
+        j2c16 => "--:-:-:-:1  \@P0 $convert store4F1, load4F1;\n",
+        j2c20 => "--:-:-:-:1  \@P0 $convert store4F2, load4F2;\n",
+        j2c24 => "--:-:3:-:1  \@P0 $convert store4F3, load4F3;\n",
+        j2c26 => "--:-:-:-:1  \@P0 IADD   track4F0.CC, track4F0, param_CRST8;\n",
+        j2c31 => "--:-:-:-:1  \@P0 IADD.X track4F1,    track4F1, RZ;\n",
+        j2c38 => "04:3:-:-:1  \@P0 STS.128 [writeFs + 4x<4*32>], store4F;\n",
+        j3c8  => "04:-:-:-:1  \@P1 LDG.E.CI.$dtype load4F0, [track4F + 2x<0>];\n",
+        j3c10 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype load4F1, [track4F + 2x<1>];\n",
+        j3c12 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype load4F2, [track4F + 2x<2>];\n",
+        j3c14 => "--:-:3:-:1  \@P1 LDG.E.CI.$dtype load4F3, [track4F + 2x<3>];\n",
+
+        j4c12 => "08:-:-:-:1  \@P0 $convert store0E7, load0E3.H1;\n",
+        j4c16 => "--:-:-:-:1  \@P0 $convert store0E6, load0E3.H0;\n",
+        j4c20 => "--:-:-:-:1  \@P0 $convert store0E5, load0E2.H1;\n",
+        j4c24 => "--:-:6:-:1  \@P0 $convert store0E4, load0E2.H0;\n",
+        j4c28 => "--:-:-:-:1  \@P0 $convert store0E3, load0E1.H1;\n",
+        j4c32 => "--:-:-:-:1  \@P0 $convert store0E2, load0E1.H0;\n",
+        j4c36 => "--:-:-:-:1  \@P0 $convert store0E1, load0E0.H1;\n",
+        j4c40 => "--:-:4:-:1  \@P0 $convert store0E0, load0E0.H0;\n",
+        j4c42 => "20:-:-:-:1  \@P0 STS.128 [writeEs + 4x<0*64 + 32>], store0E4;\n",
+        j4c44 => "--:-:-:-:1  \@P0 IADD   track0E0.CC, track0E0, param_MPQN8;\n",
+        j4c49 => "--:-:-:-:1  \@P0 IADD.X track0E1,    track0E1, RZ;\n",
+        j4c56 => "08:4:-:-:1  \@P0 STS.128 [writeEs + 4x<0*64 +  0>], store0E0;\n",
+        j5c8  => "08:-:4:-:1  \@P2 LDG.E.128 load0E0, [track0E];\n",
+
+        j5c12 => "10:-:-:-:1  \@P0 $convert store4E7, load4E3.H1;\n",
+        j5c16 => "--:-:-:-:1  \@P0 $convert store4E6, load4E3.H0;\n",
+        j5c20 => "--:-:-:-:1  \@P0 $convert store4E5, load4E2.H1;\n",
+        j5c24 => "--:-:6:-:1  \@P0 $convert store4E4, load4E2.H0;\n",
+        j5c28 => "--:-:-:-:1  \@P0 $convert store4E3, load4E1.H1;\n",
+        j5c32 => "--:-:-:-:1  \@P0 $convert store4E2, load4E1.H0;\n",
+        j5c36 => "--:-:-:-:1  \@P0 $convert store4E1, load4E0.H1;\n",
+        j5c40 => "--:-:5:-:1  \@P0 $convert store4E0, load4E0.H0;\n",
+        j5c42 => "20:-:-:-:1  \@P0 STS.128 [writeEs + 4x<4*64 + 32>], store4E4;\n",
+        j5c44 => "--:-:-:-:1  \@P0 IADD   track4E0.CC, track4E0, param_MPQN8;\n",
+        j5c49 => "--:-:-:-:1  \@P0 IADD.X track4E1,    track4E1, RZ;\n",
+        j5c56 => "10:5:-:-:1  \@P0 STS.128 [writeEs + 4x<4*64 +  0>], store4E0;\n",
+        j6c8  => "10:-:5:-:1  \@P2 LDG.E.128 load4E0, [track4E];\n",
+
+        j6c63 => "20:-:-:-:1  \@P0 IADD readEs,  readEs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeEs, writeEs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        # Keep the load predicates alive only while K is still positive.
+        # Both are chained through their previous value so the bounds set
+        # before the loop (P1: crst < CRST, P2: n < N) are preserved; the
+        # P2 update previously ANDed with PT and thereby re-enabled the E
+        # loads for lanes with n >= N.
+        j7c8  => "--:-:-:-:1      ISETP.GT.AND P1, PT, K, RZ, P1;\n",
+        j7c10 => "--:-:-:-:1      ISETP.GT.AND P2, PT, K, RZ, P2;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U NEXT_8K;\n",
+    );
+
+    # Visit order of the 8x8 accumulator tile. x advances in pairs (0,2,4,6),
+    # y walks (0,1,4,5) and reverses direction for every new x pair, and each
+    # (x,y) seed expands to four cells through the @swirl offsets, giving 64
+    # entries in total.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        # FFMAs of step j read the j<odd> operand registers while the LDS
+        # instructions inserted in this step fill the other (j<nOdd>) set.
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+
+        # Row fetched for the following step; wraps to row 0 on the last
+        # step, where the loads are predicated on P0.
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        # Step 6 tags its last LDS with the value 6 in the second control
+        # field; presumably this pairs with the wait mask 20 on the buffer
+        # swap in slot j6c63 (TODO confirm against the control-code docs).
+        my $barrier  = $j == 6 ? '6' : '-';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:%s:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*32 + 16>];\n", $barrier, $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            # Text scheduled right after this FFMA, if any.
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            # The FFMA gets stall 0 when the inserted text matches one of
+            # the listed mnemonics, and stall 1 otherwise.
+            my $stall  = $ins =~ /LDS|I2F|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            # Yield once per step, on FFMA 32, unless that slot has stall 0.
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            # The first FFMA of every step carries wait mask 01.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+--:-:-:-:0      MOV warp_cnt, 32;
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkF, SR_CTAID.Y;
+--:-:3:-:1      S2R blkE, SR_CTAID.Z;
+01:-:-:-:6      MOV rst,  tid;
+
+LUT_LOOP:
+
+<SCHEDULE_BLOCK>
+// warp synchronous loop while warp_cnt < RST (c=0)
+--:-:-:-:1      ISETP.LT.AND P0, PT, warp_cnt, param_RST, PT;
+--:-:-:-:1      IADD warp_cnt, warp_cnt, 32;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// x = qs + s
+// y = pr + r
+// z = mt + t
+--:-:-:-:1      IADD z, mt, t;
+--:-:-:-:1      IADD y, pr, r;
+--:-:-:-:1      IADD x, qs, s;
+// i = (z*HWN + y*WN + x*N) * 2
+20:-:-:-:1      XMAD.LO2C sliceI, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y, param_WN,  sliceI;
+--:-:-:-:1      XMAD      sliceI, x, param_N,   sliceI;
+--:-:-:-:1      SHL       sliceI, sliceI, 1;
+// Bounds check x and y, and make i negative if outside
+--:-:-:-:1      ISET.LT.AND x0, x, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW, x,  param_W, PT;
+--:-:-:-:1      ISET.LT.AND y0, y, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH, y,  param_H, PT;
+--:-:-:-:1      ISET.LT.AND z0, z, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD, z,  param_D, PT;
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, x0, xW, 0xfe;
+<ORDERED>
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, y0, yH, 0xfe;
+--:-:-:-:1      SHL lutStore, rst, 2;
+--:-:-:-:1      IADD rst, rst, 32;
+</ORDERED>
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, z0, zD, 0xfe;
+// Store i imgOffset into the shared lookup table
+--:6:-:-:1      STS [lutStore + addr_lut], sliceI;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P0 BRA.U LUT_LOOP;
+
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV RST,       param_RST;
+--:-:-:-:1      MOV DHWN1,     param_DHWN;
+--:-:-:-:1      SHL DHWN1,     DHWN1, 1;
+
+--:-:-:-:1      LOP.AND readEs, readEs, 0x7f;
+--:-:-:-:1      LOP.AND readFs, readFs, 0x3f;
+
+// Expand back out to undo our bank conflict avoiding stride
+--:-:-:-:1      SHL readEs, readEs, 1;
+
+// writeCs = ((readFs / 4) * 64 + readEs) / 2;
+--:-:-:-:1      ISCADD  writeCs, readFs, readEs, 4;
+--:-:-:-:1      SHR.U32 writeCs, writeCs, 1;
+
+// readCs = (tid & 31) << 2;
+--:-:-:-:1      LOP.AND tid31,  tid,   31;
+--:-:-:-:1      SHL     readCs, tid31, 2;
+
+// nn = blkE*64 + (tid31 << 1);
+--:-:-:-:1      SHL tid31, tid31, 1;
+04:-:-:-:1      ISCADD nn, blkE, tid31, 6;
+
+// crst = blkF*32
+02:-:-:-:1      SHL  crst00, blkF,   5;
+--:-:-:-:1      IADD crst04, crst00, 4;
+--:-:-:-:1      IADD crst08, crst00, 8;
+--:-:-:-:1      IADD crst12, crst00, 12;
+
+--:-:-:-:1      LEA      trackI0.CC, nn, param_I[0],     1;
+--:-:-:-:1      LEA.HI.X trackI1,    nn, param_I[1], RZ, 1;
+
+// n < N
+--:-:-:-:1      ISETP.LT.AND P5, PT, nn, param_N, PT;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+    # Output epilogue generator: for each accumulator row y, scale cx0y..cx7y by alpha into c0-c7, then CAL STORE_C.
+    my $out;
+    foreach my $y (0..7)
+    {
+        $out .=
+            "--:-:-:-:1      IADD crst00, crst00, 12;\n" .
+            "--:-:-:-:1      IADD crst04, crst04, 12;\n" .
+            "--:-:-:-:1      IADD crst08, crst08, 12;\n" .
+            "--:-:-:-:1      IADD crst12, crst12, 12;\n" if $y == 4;  # STORE_C adds 1 to each crstNN per call, so after rows 0-3 this brings the total advance to 16
+
+        $out .= sprintf(
+            "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c3, cx3y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c4, cx4y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c5, cx5y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c6, cx6y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL c7, cx7y%d, alpha;\n",
+            ($y) x 8);  # the same row index fills all eight %d slots
+
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+// Round nearest
+--:-:-:-:1      F2F.F16.F32 c0, c0;
+--:-:1:-:1      F2F.F16.F32 c1, c1;
+--:-:-:-:1      F2F.F16.F32 c2, c2;
+--:-:2:-:1      F2F.F16.F32 c3, c3;
+--:-:-:-:1      F2F.F16.F32 c4, c4;
+--:-:3:-:1      F2F.F16.F32 c5, c5;
+--:-:-:-:1      F2F.F16.F32 c6, c6;
+--:-:4:-:1      F2F.F16.F32 c7, c7;
+
+// Pack 2 16 bit values into 32 bit words
+11:-:-:-:2      BFI cs0, c1, 0x1010, c0;
+02:-:-:-:2      BFI cs1, c3, 0x1010, c2;
+24:-:-:-:2      BFI cs2, c5, 0x1010, c4;
+08:-:-:-:0      BFI cs3, c7, 0x1010, c6;
+
+// Undo the stride in the X dim (items spaced by 32 are actually spaced 4)
+--:-:-:-:4      STS.64 [writeCs+2x<0>], cs0;
+--:-:-:-:1      STS.64 [writeCs+2x<4>], cs2;
+--:-:-:-:1      LDS cs0, [readCs + 2x<0*64>];
+--:-:-:-:1      LDS cs1, [readCs + 2x<1*64>];
+--:-:-:-:1      LDS cs2, [readCs + 2x<2*64>];
+--:-:-:-:1      LDS cs3, [readCs + 2x<3*64>];
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5;
+
+--:-:-:-:1      XMAD.LO2C c00, crst00, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c04, crst04, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c08, crst08, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c12, crst12, param_magic_RST, RZ;
+
+--:-:-:-:1      SHR.U32 c00, c00, param_shift_RST;
+--:-:-:-:1      SHR.U32 c04, c04, param_shift_RST;
+--:-:-:-:1      SHR.U32 c08, c08, param_shift_RST;
+--:-:-:-:1      SHR.U32 c12, c12, param_shift_RST;
+
+--:-:-:-:1      VMAD.U16.U16 lut00, -c00, RST, crst00;
+--:-:-:-:1      VMAD.U16.U16 lut04, -c04, RST, crst04;
+--:-:-:-:1      VMAD.U16.U16 lut08, -c08, RST, crst08;
+--:-:-:-:1      VMAD.U16.U16 lut12, -c12, RST, crst12;
+
+--:-:-:-:1      SHL lut00, lut00, 2;
+--:-:-:-:1      SHL lut04, lut04, 2;
+--:-:-:-:1      SHL lut08, lut08, 2;
+--:-:-:-:1      SHL lut12, lut12, 2;
+
+--:-:-:-:1      XMAD.LO2 chan00, DHWN1, c00, RZ;
+--:-:-:-:1      XMAD.LO2 chan04, DHWN1, c04, RZ;
+--:-:-:-:1      XMAD.LO2 chan08, DHWN1, c08, RZ;
+--:-:-:-:1      XMAD.LO2 chan12, DHWN1, c12, RZ;
+
+--:-:-:-:1      IADD crst00, crst00, 1;
+--:-:-:-:1      IADD crst04, crst04, 1;
+--:-:-:-:1      IADD crst08, crst08, 1;
+--:-:-:-:1      IADD crst12, crst12, 1;
+
+--:-:1:-:1  @P0 LDS img00, [lut00 + addr_lut];
+--:-:2:-:1  @P1 LDS img04, [lut04 + addr_lut];
+--:-:3:-:1  @P2 LDS img08, [lut08 + addr_lut];
+--:-:4:-:1  @P3 LDS img12, [lut12 + addr_lut];
+
+</SCHEDULE_BLOCK>
+
+01:-:-:-:1      IADD3  track00I0.CC, trackI0, img00, chan00;
+--:-:-:-:5      ISETP.GE.AND P0, PT, img00, RZ, P0;
+--:-:-:-:1      IADD.X track00I1,    trackI1, RZ;
+
+02:-:-:-:1      IADD3  track04I0.CC, trackI0, img04, chan04;
+--:-:-:-:5      ISETP.GE.AND P1, PT, img04, RZ, P1;
+--:-:-:-:1      IADD.X track04I1,    trackI1, RZ;
+
+04:-:-:-:1      IADD3  track08I0.CC, trackI0, img08, chan08;
+--:-:-:-:5      ISETP.GE.AND P2, PT, img08, RZ, P2;
+--:-:-:-:1      IADD.X track08I1,    trackI1, RZ;
+
+08:-:-:-:1      IADD3  track12I0.CC, trackI0, img12, chan12;
+--:-:-:-:5      ISETP.GE.AND P3, PT, img12, RZ, P3;
+--:-:-:-:0      IADD.X track12I1,    trackI1, RZ;
+
+--:-:-:-:2  @P0 RED.E.ADD.F16x2.FTZ.RN [track00I], cs0;
+--:5:-:-:2  @P1 RED.E.ADD.F16x2.FTZ.RN [track04I], cs1;
+--:-:-:-:4  @P2 RED.E.ADD.F16x2.FTZ.RN [track08I], cs2;
+--:6:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [track12I], cs3;
+
+--:-:-:-:5      RET;
+
diff --git a/Kernel/Convolution/Maxwell/hconv_updat_C128_K128.sass b/Kernel/Convolution/Maxwell/hconv_updat_C128_K128.sass
new file mode 100644
index 0000000..d6c9c15
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/hconv_updat_C128_K128.sass
@@ -0,0 +1,775 @@
+# Kernel: hconv_updat_C128_K128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $int16;  # package variable, not assigned in this block; undef (false) unless set elsewhere - TODO confirm where
+    our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';  # conversion opcode: S16 source when $int16 is true, otherwise F16 source
+    sub convert {return $convert;}  # accessor returning the opcode chosen above, for inline call sites
+-]
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<(128*16 + 32)*4>
+    addr_blkIE : 4x<(128*16 + 32)*4 + 4>
+    addr_q     : 4x<(128*16 + 32)*4 + 6>
+    szBuf      : (128*16 + 32)
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_offset_K     : c[0x0][0x15c]
+    param_N            : c[0x0][0x160]
+    param_K            : c[0x0][0x164]
+    param_D            : c[0x0][0x168]
+    param_H            : c[0x0][0x16c]
+    param_W            : c[0x0][0x170]
+    param_WN           : c[0x0][0x174]
+    param_HWN          : c[0x0][0x178]
+    param_DHWN         : c[0x0][0x17c]
+    param_C            : c[0x0][0x180]
+    param_CRST         : c[0x0][0x184]
+    param_RST          : c[0x0][0x188]
+    param_magic_RST    : c[0x0][0x18c]
+    param_shift_RST    : c[0x0][0x190]
+    param_RS           : c[0x0][0x194]
+    param_magic_RS     : c[0x0][0x198]
+    param_shift_RS     : c[0x0][0x19c]
+    param_S            : c[0x0][0x1a0]
+    param_magic_S      : c[0x0][0x1a4]
+    param_shift_S      : c[0x0][0x1a8]
+    param_pad_d        : c[0x0][0x1ac]
+    param_pad_h        : c[0x0][0x1b0]
+    param_pad_w        : c[0x0][0x1b4]
+    param_str_d        : c[0x0][0x1b8]
+    param_str_h        : c[0x0][0x1bc]
+    param_str_w        : c[0x0][0x1c0]
+    param_dil_d        : c[0x0][0x1c4]
+    param_dil_h        : c[0x0][0x1c8]
+    param_dil_w        : c[0x0][0x1cc]
+    param_P            : c[0x0][0x1d0]
+    param_Q            : c[0x0][0x1d4]
+    param_PQ           : c[0x0][0x1d8]
+    param_QN           : c[0x0][0x1dc]
+    param_PQN          : c[0x0][0x1e0]
+    param_MPQN         : c[0x0][0x1e4]
+    param_magic_Q      : c[0x0][0x1e8]
+    param_shift_Q      : c[0x0][0x1ec]
+    param_magic_PQ     : c[0x0][0x1f0]
+    param_shift_PQ     : c[0x0][0x1f4]
+    param_grid_P       : c[0x0][0x1f8]
+    param_grid_Q       : c[0x0][0x1fc]
+    param_grid_PQ      : c[0x0][0x200]
+    param_CRSTK        : c[0x0][0x204]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-65   : one
+    64-65   : blkIE<0-1>
+    64-68   : blkI, blkE, tid, tidX, tidY
+    69-95   ~ blkMPQ, tid1, tid7, tid128, shiftX, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+    69-95   ~ c, z, y, x, k, te, mt, pr, qs, r, s, t, rs, rst, crst, ti, xw, xW, yh, yH, zd, zD, cC, nextP, nextQ, Q
+
+    64-79   : j0Ex<0-7>, j0Iy<0-7>
+    80-95   : j1Ex<0-7>, j1Iy<0-7>
+
+     96-99  : loadI<0-3>
+     96-99  : storeI<0-3>
+    100-103 : loadI<4-7>
+    112-115 : storeI<4-7>
+
+    104-107 : loadE<0-3>
+    104-107 : storeE<0-3>
+    108-111 : loadE<4-7>
+    112-115 : storeE<4-7>
+
+    116-119 : trackI<0-1>, trackE<0-1>
+
+    120-124 ~ writeS, loopN, m, p, q
+    125-127 ~ readIs, readEs, swapBuf
+
+     72-87  : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
+    88-124  ~ writeCs, readCs, K1, K60, crst<00|04|08|12>, alpha, K, K4, tid31, tid96, kk, tf, t128, blk_MPQ, CRSTK, xmad_determ
+
+
+</REGISTER_MAPPING>
+
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT; // P0 = (RST == 1)
+--:-:-:-:5  @P0 BRA.U CTAID1; // RST == 1 takes the alternate CTAID axis assignment below
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X; // RST != 1: MPQ from X, I from Y, E from Z
+--:-:3:-:1      S2R blkI,   SR_CTAID.Y;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
+--:-:-:-:5      BRA.U END_CTAID1;
+CTAID1:
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.Z; // RST == 1: MPQ from Z, I from X, E from Y
+--:-:3:-:1      S2R blkI,   SR_CTAID.X;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Y;
+END_CTAID1:
+
+<SCHEDULE_BLOCK>
+// tidX   = tid >> 1
+// tidY   = (tid & 1) << 3
+// shiftX = (tid & 1) << 4
+01:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      SHR.U32 tidX,   tid,  1;
+--:-:-:-:1      SHL     tidY,   tid1, 3;
+--:-:-:-:1      SHL     shiftX, tid1, 4;
+
+0c:-:-:-:1      STS.64 [addr_blkIE], blkIE;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;  # 16 loads from addr_zero into czero00, czero04, ... czero60
++]
+
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_grid_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_grid_PQ;
+
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT;
+
+// m = blkMPQ / PQ
+02:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ % PQ
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = blockPQ / Q
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = blockPQ % Q
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+
+--:-:-:-:1      STS [addr_q], q;
+
+// writeS = (tidY*128 + tidX + shiftX) * 4 + 4x<szBuf * 2>
+--:-:-:-:1      ISCADD writeS, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeS, writeS, shiftX;
+--:-:-:-:1      ISCADD writeS, writeS, 4x<szBuf * 2>, 2;
+
+// readIs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readIs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      LOP.OR  readIs, readIs, tid1;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// readEs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4x<szBuf>;
+--:-:-:-:1      LOP.AND tid128, tid,    128;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readEs, tid128, 4;
+--:-:-:-:1      LOP.OR  readEs, readEs, tid7;
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<szBuf>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szBuf * 2>;
+
+--:-:-:-:1      MOV loopN, RZ;
+
+// Flag for first load branch
+--:-:-:-:1      PSETP.AND.AND P0, PT, PT, PT, PT;
+
+</SCHEDULE_BLOCK>
+
+NEXT_PQ:
+
+--:-:2:-:1      S2R tid, SR_TID.X;
+--:-:3:-:1      LDS.U.64 blkIE, [addr_blkIE];
+
+<SCHEDULE_BLOCK>
+// Zigzag q but only if grid_P < P
+--:-:-:-:1      LOP.AND.NZ P1, RZ, p, 1;
+--:-:-:-:1      MOV Q, param_grid_P;
+--:-:-:-:1      ISETP.LT.AND P1, PT, Q, param_P, P1;
+--:-:-:-:1      MOV Q, -1;
+--:-:-:-:1  @P1 IADD3 Q, -q, param_Q, Q;
+--:-:-:-:1 @!P1 MOV Q, q;
+// tidX   = tid >> 1
+// tidY   = (tid & 1) << 3
+--:-:-:-:1      LOP.AND tidY,   tid,  1;
+02:-:-:-:1      SHR.U32 tidX,   tid,  1;
+--:-:-:-:1      SHL     tidY,   tidY, 3;
+// crst = blockI*128 + tidX
+04:-:-:-:1      ISCADD crst, blkI, tidX, 7;
+// k = blockE*128 + tid
+04:-:-:-:1      ISCADD k, blkE, tidX, 7;
+--:-:-:-:1      IADD   k, k, param_offset_K;
+
+// c   = crst / RST
+// rst = crst % RST
+--:-:-:-:1      XMAD.LO2C c, crst, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32   c, c, param_shift_RST;
+--:-:-:-:1      XMAD rst, c, param_RST, RZ;
+--:-:-:-:1      IADD rst, -rst, crst;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// x = q * v - pad_w + (s * dil_w)
+// y = p * u - pad_h + (r * dil_h)
+// z = m * w - pad_d + (t * dil_d)
+--:-:-:-:1      XMAD  qs, Q,  param_str_w, RZ;
+--:-:-:-:1      XMAD  pr, p,  param_str_h, RZ;
+--:-:-:-:1      XMAD  mt, m,  param_str_d, RZ;
+--:-:-:-:1      XMAD  x,  s,  param_dil_w, qs;
+--:-:-:-:1      XMAD  y,  r,  param_dil_h, pr;
+--:-:-:-:1      XMAD  z,  t,  param_dil_d, mt;
+--:-:-:-:1      IADD  x,  x, -param_pad_w;
+--:-:-:-:1      IADD  y,  y, -param_pad_h;
+--:-:-:-:1      IADD  z,  z, -param_pad_d;
+// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY
+--:-:-:-:1      XMAD      ti, x, param_N,    tidY;
+--:-:-:-:1      XMAD.LO2C ti, y, param_WN,   ti;
+--:-:-:-:1      XMAD.LO2C ti, z, param_HWN,  ti;
+--:-:-:-:1      XMAD.LO2C ti, c, param_DHWN, ti;
+--:-:-:-:1      LEA      trackI0.CC, ti, param_I[0],     1;
+--:-:-:-:1      LEA.HI.X trackI1,    ti, param_I[1], RZ, 1;
+// trackE = k*MPQN + m*PQN + p*QN + Q*N + tidY
+--:-:-:-:1      XMAD      te, Q, param_N,    tidY;
+--:-:-:-:1      XMAD.LO2C te, p, param_QN,   te;
+--:-:-:-:1      XMAD.LO2C te, m, param_PQN,  te;
+--:-:-:-:1      XMAD.LO2C te, k, param_MPQN, te;
+--:-:-:-:1      LEA      trackE0.CC, te, param_E[0],     1;
+--:-:-:-:1      LEA.HI.X trackE1,    te, param_E[1], RZ, 1;
+// Bounds check x,y,z,c for each I track.
+// If out of bounds, this will set the track address to -1
+--:-:-:-:1      ISET.GE.AND cC, c, param_C, PT;
+--:-:-:-:1      ISET.LT.AND zd, z, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD, z, param_D, PT;
+--:-:-:-:1      ISET.LT.AND yh, y, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH, y, param_H, PT;
+--:-:-:-:1      ISET.LT.AND xw, x, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW, x, param_W, PT;
+--:-:-:-:1      LOP.OR   trackI0, trackI0, cC;
+--:-:-:-:1      LOP3.LUT trackI0, trackI0, zd, zD, 0xfe;
+--:-:-:-:1      LOP3.LUT trackI0, trackI0, yh, yH, 0xfe;
+--:-:-:-:1      LOP3.LUT trackI0, trackI0, xw, xW, 0xfe;
+
+01:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+
+--:-:-:-:0      ISETP.NE.AND P2, PT, trackI0, -1, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, k, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, nextP, param_P, PT;
+--:-:-:-:1      IADD loopN, loopN, param_N;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:5  @P0 BRA.U FIRST_LOAD;
+
+INIT_LOOP:
+
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*128 + 64>];
+--:-:1:-:2      LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>];
+
+NEXT_16N:
+
+[+
+
+    our $convert;
+    my %insert =
+    (
+        j0c8   => "--:-:-:-:1      IADD loopN, loopN, -16;\n",
+
+        # p0 = (N & 16) != 0
+        # p1 = N >= 32 && p0
+        j0c14   => "--:-:-:-:1      LOP.AND.NZ P0, RZ, loopN, 16;\n",
+        j0c28   => "--:-:-:-:1      ISETP.GE.AND P1, PT, loopN, 32, P0;\n",
+
+        j2c34   => "--:-:-:-:1  \@P0 $convert storeI7, loadI7.H1;\n",
+        j2c38   => "--:-:-:-:1  \@P0 $convert storeI6, loadI7.H0;\n",
+        j2c42   => "--:-:-:-:1  \@P0 $convert storeI5, loadI6.H1;\n",
+        j2c46   => "--:-:-:-:1  \@P0 $convert storeI4, loadI6.H0;\n",
+        j2c50   => "--:-:-:-:1  \@P0 $convert storeI3, loadI5.H1;\n",
+        j2c54   => "--:-:-:-:1  \@P0 $convert storeI2, loadI5.H0;\n",
+        j2c58   => "--:-:-:-:1  \@P0 $convert storeI1, loadI4.H1;\n",
+        j2c62   => "--:-:-:-:1  \@P0 $convert storeI0, loadI4.H0;\n",
+
+        j3c34   => "02:-:-:-:1 \@!P0 $convert storeI7, loadI3.H1;\n",
+        j3c38   => "--:-:-:-:1 \@!P0 $convert storeI6, loadI3.H0;\n",
+        j3c42   => "--:-:-:-:1 \@!P0 $convert storeI5, loadI2.H1;\n",
+        j3c46   => "--:-:5:-:1 \@!P0 $convert storeI4, loadI2.H0;\n",
+        j3c50   => "--:-:-:-:1 \@!P0 $convert storeI3, loadI1.H1;\n",
+        j3c54   => "--:-:-:-:1 \@!P0 $convert storeI2, loadI1.H0;\n",
+        j3c58   => "--:-:-:-:1 \@!P0 $convert storeI1, loadI0.H1;\n",
+        j3c62   => "--:-:2:-:1 \@!P0 $convert storeI0, loadI0.H0;\n",
+
+        j4c8    => "10:-:-:-:1      STS [writeS + 4x<7*128>], storeI7;\n",
+        j4c10   => "--:-:-:-:1      STS [writeS + 4x<6*128>], storeI6;\n",
+        j4c12   => "--:-:-:-:1      STS [writeS + 4x<5*128>], storeI5;\n",
+        j4c14   => "--:-:-:-:1      STS [writeS + 4x<4*128>], storeI4;\n",
+        j4c16   => "02:-:-:-:1      STS [writeS + 4x<3*128>], storeI3;\n",
+        j4c18   => "--:-:-:-:1      STS [writeS + 4x<2*128>], storeI2;\n",
+        j4c20   => "--:-:-:-:1      STS [writeS + 4x<1*128>], storeI1;\n",
+        j4c22   => "--:2:-:-:1      STS [writeS + 4x<0*128>], storeI0;\n",
+
+        j4c24   => "--:-:-:-:1      ISETP.NE.AND P2, PT, trackI0, -1, P1;\n",
+        j4c26   => "--:-:-:-:1      ISETP.EQ.AND P3, PT, trackI0, -1, P1;\n",
+
+        j5c8    => "02:-:-:-:1  \@P2 LDG.E.CI.128 loadI0, [trackI + 2x< 0>];\n",
+        j5c10   => "--:5:2:-:1  \@P2 LDG.E.CI.128 loadI4, [trackI + 2x<16>];\n",
+
+        j6c8    => "--:-:-:-:1  \@P3 LDS.U.128 loadI0, [addr_zero];\n",
+        j7c8    => "--:-:-:-:1  \@P3 LDS.U.128 loadI4, [addr_zero];\n",
+
+        j7c57   => "10:-:-:-:1  \@P2 IADD   trackI0.CC, trackI0, 2x<32>;\n",
+        j7c63   => "--:-:-:-:1  \@P2 IADD.X trackI1,    trackI1, RZ;\n",
+
+
+        j10c34  => "--:-:-:-:1  \@P0 $convert storeE7, loadE7.H1;\n",
+        j10c38  => "--:-:-:-:1  \@P0 $convert storeE6, loadE7.H0;\n",
+        j10c42  => "--:-:-:-:1  \@P0 $convert storeE5, loadE6.H1;\n",
+        j10c46  => "--:-:-:-:1  \@P0 $convert storeE4, loadE6.H0;\n",
+        j10c50  => "--:-:-:-:1  \@P0 $convert storeE3, loadE5.H1;\n",
+        j10c54  => "--:-:-:-:1  \@P0 $convert storeE2, loadE5.H0;\n",
+        j10c58  => "--:-:-:-:1  \@P0 $convert storeE1, loadE4.H1;\n",
+        j10c62  => "--:-:-:-:1  \@P0 $convert storeE0, loadE4.H0;\n",
+
+        j11c34  => "04:-:-:-:1 \@!P0 $convert storeE7, loadE3.H1;\n",
+        j11c38  => "--:-:-:-:1 \@!P0 $convert storeE6, loadE3.H0;\n",
+        j11c42  => "--:-:-:-:1 \@!P0 $convert storeE5, loadE2.H1;\n",
+        j11c46  => "--:-:5:-:1 \@!P0 $convert storeE4, loadE2.H0;\n",
+        j11c50  => "--:-:-:-:1 \@!P0 $convert storeE3, loadE1.H1;\n",
+        j11c54  => "--:-:-:-:1 \@!P0 $convert storeE2, loadE1.H0;\n",
+        j11c58  => "--:-:-:-:1 \@!P0 $convert storeE1, loadE0.H1;\n",
+        j11c62  => "--:-:3:-:1 \@!P0 $convert storeE0, loadE0.H0;\n",
+
+        j12c8   => "10:-:-:-:1      STS [writeS + 4x<7*128 + szBuf>], storeE7;\n",
+        j12c10  => "--:-:-:-:1      STS [writeS + 4x<6*128 + szBuf>], storeE6;\n",
+        j12c12  => "--:-:-:-:1      STS [writeS + 4x<5*128 + szBuf>], storeE5;\n",
+        j12c14  => "--:-:-:-:1      STS [writeS + 4x<4*128 + szBuf>], storeE4;\n",
+        j12c16  => "04:-:-:-:1      STS [writeS + 4x<3*128 + szBuf>], storeE3;\n",
+        j12c18  => "--:-:-:-:1      STS [writeS + 4x<2*128 + szBuf>], storeE2;\n",
+        j12c20  => "--:-:-:-:1      STS [writeS + 4x<1*128 + szBuf>], storeE1;\n",
+        j12c22  => "--:3:-:-:1      STS [writeS + 4x<0*128 + szBuf>], storeE0;\n",
+
+        j12c24  => "--:-:-:-:1      PSETP.AND.AND P2, PT, P1, P4, PT;\n",
+
+        j13c8   => "04:-:-:-:1  \@P2 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];\n",
+        j13c10  => "--:5:3:-:1  \@P2 LDG.E.CI.128 loadE4, [trackE + 2x<16>];\n",
+
+        j15c57  => "10:-:-:-:1  \@P2 IADD   trackE0.CC, trackE0, 2x<32>;\n",
+        j15c62  => "--:-:-:-:1  \@P2 IADD.X trackE1,    trackE1, RZ;\n",
+
+        # p0 = N >= 16 and not (N == 32 and (p or q))
+        j14c8   => "--:-:-:-:1      ISETP.EQ.AND  P0, PT, loopN, 32, PT;\n",
+        j14c10  => "--:-:-:-:1      ISETP.GE.AND  P1, PT, loopN, 16, PT;\n",
+        j14c22  => "--:-:-:-:1      PSETP.OR.AND  P0, PT, P5, P6, P0;\n",
+        j14c35  => "--:-:-:-:1      PSETP.AND.AND P0, PT, !P0, P1, PT;\n",
+
+        j14c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                   "20:-:-:-:1      IADD readEs, readEs, -swapBuf;\n" .
+                   "--:-:-:-:1      IADD readIs, readIs, -swapBuf;\n" .
+                   "--:-:-:-:1      IADD writeS, writeS,  swapBuf;\n" .
+                   "--:-:-:-:1      IADD swapBuf, RZ,    -swapBuf;\n",
+
+        j15c63  => "--:-:-:Y:5  \@P0 BRA.U NEXT_16N;\n" .
+                   "--:-:-:-:0  \@P5 IADD q, q, param_grid_Q;\n" .
+                   "01:-:-:Y:5  \@P5 BRA.U NEXT_PQ;\n" .
+                   "--:-:1:-:1  \@P6 LDS q, [addr_q];\n" .
+                   "--:-:-:-:0  \@P6 IADD p, p, param_grid_P;\n" .
+                   "--:-:-:Y:5  \@P6 BRA.U NEXT_PQ;\n" .
+                   "--:-:-:Y:5      BRA.U FINISH;\n",
+    );
+
+    my @cOrder;  # receives 64 [x,y] accumulator coordinates (4 base columns x 4 base rows x 4 offsets)
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);  # the four [dx,dy] offsets applied, in this order, to each base (x,y)
+    my @y = (0,1,4,5);  # base rows; with dy of 0 or 2 they reach every row 0..7
+    foreach my $x (0,2,4,6)  # base columns; with dx of 0 or 1 they reach every column 0..7
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;  # walk the base rows in the opposite direction for the next column pair
+    }
+
+    my $out;
+    foreach my $j (0 .. 15)  # 16 unrolled steps, each emitting 64 FFMAs
+    {
+        my $odd      = $j & 1;  # register set (j0* or j1*) read by this step's FFMAs
+        my $nOdd     = 1 - $odd;  # the other register set, filled by this step's LDS
+        my $rsOffset = ($j + 1) & 15;  # line index loaded for the next step, wrapping 15 -> 0
+        my $rsPred   = $j == 15 ? '@P0' : '   ';  # the wrap-around loads of step 15 are predicated on P0
+        my $shift    = $rsOffset < 8 ? 0 : 1;  # line indices 8..15 get an extra 16 inside the 4x<> offset
+        my $barrier  = $j == 14 ? '6' : '-';  # step 14's last LDS also sets barrier 6 in the second control field
+        # Slots c0/c2/c4/c6 of every step load the next line of E and I from shared memory into the other register set.
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c6"} = sprintf "--:%s:1:-:1  %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift;
+        # Emit the 64 FFMAs of this step; each is followed by whatever text is registered for its slot key.
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};  # accumulator coordinate for slot c
+
+            my $ins    = $insert{"j${j}c$c"} || '';  # text appended after this FFMA, empty if none
+
+            my $stall  = $ins =~ /LDS|I2I|F2F|I2F|LDG|STS|BAR|BRA/ ? 0 : 1;  # FFMA stall field: 0 when the inserted text contains one of these opcodes, else 1
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';  # yield flag only on slot 32, and only when that FFMA has stall 1
+
+            my $wait   = $c == 0 ? '01' : '--';  # first FFMA of each step waits on barrier 1, which the c6 LDS sets
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
++]
+
+FIRST_LOAD:
+
+--:-:-:-:8      PSETP.AND.AND P0, PT, PT, PT, !PT;
+
+// p1 = N == 32 and (p or q)
+--:-:-:-:0      ISETP.EQ.AND  P1, PT, loopN, 32, PT;
+
+--:-:-:-:1  @P2 LDG.E.CI.128 loadI0, [trackI + 2x< 0>];
+--:-:1:-:1  @P2 LDG.E.CI.128 loadI4, [trackI + 2x<16>];
+--:-:-:-:1 @!P2 LDS.U.128    loadI0, [addr_zero];
+--:-:5:-:1 @!P2 LDS.U.128    loadI4, [addr_zero];
+
+--:-:-:-:1  @P4 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];
+--:-:2:-:1  @P4 LDG.E.CI.128 loadE4, [trackE + 2x<16>];
+--:-:-:-:1 @!P4 LDS.U.128    loadE0, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128    loadE4, [addr_zero];
+
+11:-:-:-:1      [+ convert() +] storeI7, loadI3.H1;
+--:-:-:-:1      [+ convert() +] storeI6, loadI3.H0;
+--:-:-:-:1      [+ convert() +] storeI5, loadI2.H1;
+--:-:1:-:1      [+ convert() +] storeI4, loadI2.H0;
+--:-:-:-:1      [+ convert() +] storeI3, loadI1.H1;
+--:-:-:-:1      [+ convert() +] storeI2, loadI1.H0;
+--:-:-:-:1      [+ convert() +] storeI1, loadI0.H1;
+--:-:5:-:1      [+ convert() +] storeI0, loadI0.H0;
+
+--:-:-:-:1      PSETP.OR.AND  P1, PT, P5, P6, P1;
+
+--:-:-:-:0  @P2 IADD   trackI0.CC, trackI0, 2x<32>;
+
+01:-:-:-:1      STS [writeS + 4x<7*128>], storeI7;
+--:-:-:-:1      STS [writeS + 4x<6*128>], storeI6;
+--:-:-:-:1      STS [writeS + 4x<5*128>], storeI5;
+--:-:-:-:1      STS [writeS + 4x<4*128>], storeI4;
+10:-:-:-:1      STS [writeS + 4x<3*128>], storeI3;
+--:-:-:-:1      STS [writeS + 4x<2*128>], storeI2;
+--:-:-:-:1      STS [writeS + 4x<1*128>], storeI1;
+--:1:-:-:2      STS [writeS + 4x<0*128>], storeI0;
+
+--:-:-:-:0  @P2 IADD.X trackI1,    trackI1, RZ;
+
+23:-:-:-:1      [+ convert() +] storeE7, loadE3.H1;
+--:-:-:-:1      [+ convert() +] storeE6, loadE3.H0;
+--:-:-:-:1      [+ convert() +] storeE5, loadE2.H1;
+--:-:2:-:1      [+ convert() +] storeE4, loadE2.H0;
+--:-:-:-:1      [+ convert() +] storeE3, loadE1.H1;
+--:-:-:-:1      [+ convert() +] storeE2, loadE1.H0;
+--:-:-:-:1      [+ convert() +] storeE1, loadE0.H1;
+--:-:6:-:1      [+ convert() +] storeE0, loadE0.H0;
+
+--:-:-:-:2      PSETP.AND.AND P5, PT, P5, P1, PT;
+--:-:-:-:1      PSETP.AND.AND P6, PT, P6, P1, PT;
+--:-:-:-:0  @P4 IADD   trackE0.CC, trackE0, 2x<32>;
+
+02:-:-:-:1      STS [writeS + 4x<7*128 + szBuf>], storeE7;
+--:-:-:-:1      STS [writeS + 4x<6*128 + szBuf>], storeE6;
+--:-:-:-:1      STS [writeS + 4x<5*128 + szBuf>], storeE5;
+--:-:-:-:1      STS [writeS + 4x<4*128 + szBuf>], storeE4;
+20:-:-:-:1      STS [writeS + 4x<3*128 + szBuf>], storeE3;
+--:-:-:-:1      STS [writeS + 4x<2*128 + szBuf>], storeE2;
+--:-:-:-:1      STS [writeS + 4x<1*128 + szBuf>], storeE1;
+--:1:-:-:1      STS [writeS + 4x<0*128 + szBuf>], storeE0;
+
+--:-:-:-:1  @P4 IADD.X trackE1,    trackE1, RZ;
+
+--:-:-:-:1      IADD readEs, readEs, -swapBuf;
+--:-:-:-:0      IADD readIs, readIs, -swapBuf;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeS, writeS,  swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ,    -swapBuf;
+
+--:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+
+--:-:-:-:0  @P5 IADD q, q, param_grid_Q;
+--:-:-:Y:5  @P5 BRA.U NEXT_PQ;
+--:-:-:-:0  @P6 IADD p, p, param_grid_P;
+--:-:-:Y:5  @P6 BRA.U NEXT_PQ;
+
+--:-:-:-:2      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
+--:-:-:-:0      ISETP.LT.AND P6, PT, nextP, param_P, PT;
+
+--:-:-:Y:5      BRA.U INIT_LOOP;
+
+
+// Epilogue entry point. Reloads tid and the three block indices.
+// P0 = (param_RST == 1) selects which SR_CTAID component feeds
+// blkI / blkE / blk_MPQ.
+FINISH:
+
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
+--:-:-:-:5  @P0 BRA.U CTAID2;
+// param_RST != 1: blkI <- CTAID.Y, blkE <- CTAID.Z, blk_MPQ <- CTAID.X
+--:-:2:-:1      S2R blkI,    SR_CTAID.Y;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Z;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.X;
+--:-:-:-:5      BRA.U END_CTAID2;
+CTAID2:
+// param_RST == 1: blkI <- CTAID.X, blkE <- CTAID.Y, blk_MPQ <- CTAID.Z
+--:-:2:-:1      S2R blkI,    SR_CTAID.X;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Y;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.Z;
+END_CTAID2:
+
+<SCHEDULE_BLOCK>
+
+// Rewind the shared-memory read pointers: readEs always drops 4x<szBuf>,
+// and both pointers additionally drop swapBuf when swapBuf > 0 (P0).
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readEs,  readEs, -4x<szBuf>;
+--:-:-:-:1  @P0 IADD readIs,  readIs, -swapBuf;
+--:-:-:-:1  @P0 IADD readEs,  readEs, -swapBuf;
+
+// writeCs = (readIs / 4) * 128 + readEs;
+--:-:-:-:1      ISCADD  writeCs, readIs, readEs, 5;
+
+// Split tid into bit fields: bits 0-4 (tid31), bits 5-6 (tid96), bit 7 (t128).
+--:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      LOP.AND tid96,  tid,  96;
+01:-:-:-:1      LOP.AND t128,   tid, 128;
+
+// kk = tid31 | (t128 >> 2);
+--:-:-:-:1      SHR.U32  kk, t128, 2;
+--:-:-:-:1      LOP.OR   kk, tid31,  kk;
+
+// readCs = ((tid96 << 4) | kk) << 2;
+--:-:-:-:1      SHL      readCs, tid96,  4;
+--:-:-:-:1      LOP.OR   readCs, readCs, kk;
+--:-:-:-:1      SHL      readCs, readCs, 2;
+
+// kk += blkE*128 + offset_K;
+04:-:-:-:1      ISCADD kk, blkE, kk, 7;
+--:-:-:-:1      IADD   kk, kk, param_offset_K;
+
+// crst00 = blkI*128 + (tid96 >> 1); crst04/08/12 are crst00 + 4/8/12
+--:-:-:-:1      SHR.U32 crst00, tid96, 1;
+02:-:-:-:1      ISCADD  crst00, blkI, crst00, 7;
+--:-:-:-:1      IADD    crst04, crst00,  4;
+--:-:-:-:1      IADD    crst08, crst00,  8;
+--:-:-:-:1      IADD    crst12, crst00,  12;
+
+
+// Byte strides derived from K: K1 = K*4, K4 = K*16, K60 = K*256 - K*16 = K*240.
+--:-:-:-:1      MOV K, param_K;
+--:-:-:-:1      SHL K1, K, 2;
+--:-:-:-:1      SHL K4, K, 4;
+--:-:-:-:1      ISCADD K60, K, -K4, 8;
+
+// trackF += crst*K + k;
+--:-:-:-:1      VMAD.U16.U16 tf, crst00, K, kk;
+[+
+    # Only when $determ is set: emit two extra instructions that fold
+    # blk_MPQ * CRSTK into tf (xmad_determ is the extra register operand).
+    # Otherwise nothing is emitted here.
+    our $determ;
+    if ($determ)
+    {
+        return q{
+--:-:-:-:1      MOV CRSTK, param_CRSTK;
+08:-:-:-:1      XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ;
+        };
+    }
+    return '';
++]
+// track00F = param_F + tf*4 (64-bit address in track00F0/track00F1)
+--:-:-:-:1      LEA      track00F0.CC, tf, param_F[0],     0x2;
+--:-:-:-:1      LEA.HI.X track00F1,    tf, param_F[1], RZ, 0x2;
+
+// P5 = kk < K, P6 = kk + 64 < K  (kk is left advanced by 64)
+--:-:-:-:1      ISETP.LT.AND P5, PT, kk, param_K, PT;
+--:-:-:-:1      IADD kk, kk, 64;
+--:-:-:-:1      ISETP.LT.AND P6, PT, kk, param_K, PT;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+</SCHEDULE_BLOCK>
+
+// Derive the other three output tracks, each K4 bytes past the previous one:
+// track04F = track00F + K4, track08F = track04F + K4, track12F = track08F + K4.
+// Each pair is a low-word add (.CC) followed by an .X add of RZ into the high word.
+--:-:-:-:6      IADD   track04F0.CC, track00F0, K4;
+--:-:-:-:1      IADD.X track04F1,    track00F1, RZ;
+--:-:-:-:6      IADD   track08F0.CC, track04F0, K4;
+--:-:-:-:1      IADD.X track08F1,    track04F1, RZ;
+--:-:-:-:6      IADD   track12F0.CC, track08F0, K4;
+--:-:-:-:0      IADD.X track12F1,    track08F1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+[+
+
+    # Emit the unrolled output sequence for accumulator rows y0..y7.
+    # Per row: scale cx0..cx7 of that row by alpha into f0..f7, then call
+    # STORE_C. Just before row 4, every track pointer is advanced by K60 and
+    # every crst counter by 60.
+    my @tracks = qw(00 04 08 12);
+    my $code   = '';
+
+    for my $row (0 .. 7)
+    {
+        if ($row == 4)
+        {
+            for my $t (@tracks)
+            {
+                $code .= "--:-:-:-:5      IADD   track${t}F0.CC, track${t}F0, K60;\n";
+                $code .= "--:-:-:-:1      IADD   crst${t},       crst${t},     60;\n";
+                $code .= "--:-:-:-:1      IADD.X track${t}F1,    track${t}F1,  RZ;\n";
+            }
+            $code .= "\n";
+        }
+
+        # The last FMUL of each row carries stall 0, the others stall 1.
+        for my $col (0 .. 7)
+        {
+            my $stall = $col == 7 ? 0 : 1;
+            $code .= sprintf "--:-:-:-:%d      FMUL f%d, cx%dy%d, alpha;\n",
+                $stall, $col, $col, $row;
+        }
+
+        $code .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $code;
+
++]
+
+--:-:-:-:5      EXIT;
+
+// STORE_C: subroutine (entered via CAL, leaves via RET) that writes one row of
+// scaled results f0..f7 to the four output tracks. On each call the crst
+// counters advance by 1 and the track pointers by K1.
+STORE_C:
+
+// P0..P3: per-track bounds predicates; each crst counter is bumped right after its test.
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K
+--:-:-:-:1      IADD         crst00, crst00, 1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K
+--:-:-:-:1      IADD         crst04, crst04, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K
+--:-:-:-:1      IADD         crst08, crst08, 1;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K
+--:-:-:-:0      IADD         crst12, crst12, 1;
+
+// Warp shuffle to drop the awkward readAs/readBs mapping
+// (done through shared memory: f0-f3 / f4-f7 are written at writeCs and
+// read back one word at a time from readCs).
+--:-:-:-:1      STS.128 [writeCs+4x<00>], f0;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], f4;
+
+--:-:1:-:1      LDS f0, [readCs + 4x<0*128 + 00>];
+--:-:2:-:1      LDS f2, [readCs + 4x<1*128 + 00>];
+--:-:3:-:1      LDS f4, [readCs + 4x<2*128 + 00>];
+--:-:4:-:a      LDS f6, [readCs + 4x<3*128 + 00>];
+
+[+
+    # First half (f0/f2/f4/f6): plain STG.E.CG stores when $determ is set,
+    # RED.E.ADD.F32 reductions otherwise. After each store the track's
+    # predicate is ANDed with P6, ready for the second-half store.
+    our $determ;
+    if ($determ)
+    {
+        return q{
+01:-:-:-:1  @P0 STG.E.CG [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 STG.E.CG [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 STG.E.CG [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 STG.E.CG [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
+    else
+    {
+        return q{
+01:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
++]
+
+--:-:1:-:1      LDS f1, [readCs + 4x<0*128 + 64>];
+--:-:2:-:1      LDS f3, [readCs + 4x<1*128 + 64>];
+--:-:3:-:1      LDS f5, [readCs + 4x<2*128 + 64>];
+--:-:4:-:a      LDS f7, [readCs + 4x<3*128 + 64>];
+
+[+
+    # Second half (f1/f3/f5/f7): same store/reduce choice, written 4x<64>
+    # past each track pointer.
+    our $determ;
+    if ($determ)
+    {
+        return q{
+01:1:-:-:1  @P0 STG.E.CG [track00F + 4x<64>], f1;
+02:2:-:-:1  @P1 STG.E.CG [track04F + 4x<64>], f3;
+04:3:-:-:1  @P2 STG.E.CG [track08F + 4x<64>], f5;
+08:4:-:-:1  @P3 STG.E.CG [track12F + 4x<64>], f7;
+        };
+    }
+    else
+    {
+        return q{
+01:1:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<64>], f1;
+02:2:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<64>], f3;
+04:3:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<64>], f5;
+08:4:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<64>], f7;
+        };
+    }
++]
+
+// Advance all four 64-bit track pointers by K1 for the next call.
+01:-:-:-:6      IADD   track00F0.CC, track00F0, K1;
+--:-:-:-:1      IADD.X track00F1,    track00F1, RZ;
+02:-:-:-:6      IADD   track04F0.CC, track04F0, K1;
+--:-:-:-:1      IADD.X track04F1,    track04F1, RZ;
+04:-:-:-:6      IADD   track08F0.CC, track08F0, K1;
+--:-:-:-:1      IADD.X track08F1,    track08F1, RZ;
+08:-:-:-:6      IADD   track12F0.CC, track12F0, K1;
+--:-:-:-:0      IADD.X track12F1,    track12F1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/Convolution/Maxwell/hconv_updat_C128_K64.sass b/Kernel/Convolution/Maxwell/hconv_updat_C128_K64.sass
new file mode 100644
index 0000000..a40fcb8
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/hconv_updat_C128_K64.sass
@@ -0,0 +1,860 @@
+# Kernel: hconv_updat_C128_K64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    # Choose the narrow-to-F32 conversion opcode once for the whole kernel:
+    # I2F.F32.S16 when $int16 is set, F2F.F32.F16 otherwise.
+    our $int16;
+    our $convert = 'F2F.F32.F16';
+    $convert     = 'I2F.F32.S16' if $int16;
+
+    # Accessor used by the inline [+ convert() +] template calls.
+    sub convert { return $convert }
+-]
+
+# Constant map: three shared-memory sizing expressions (addr_zero, szShareI,
+# szShareE) followed by the kernel parameters, which occupy consecutive 4-byte
+# slots of c[0x0] from 0x140 (param_F[0]) through 0x204 (param_CRSTK).
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*2 + (64*16 + 32)*2>
+    szShareI  : (128*16 + 32)
+    szShareE  : (64*16  + 32)
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_offset_K     : c[0x0][0x15c]
+    param_N            : c[0x0][0x160]
+    param_K            : c[0x0][0x164]
+    param_D            : c[0x0][0x168]
+    param_H            : c[0x0][0x16c]
+    param_W            : c[0x0][0x170]
+    param_WN           : c[0x0][0x174]
+    param_HWN          : c[0x0][0x178]
+    param_DHWN         : c[0x0][0x17c]
+    param_C            : c[0x0][0x180]
+    param_CRST         : c[0x0][0x184]
+    param_RST          : c[0x0][0x188]
+    param_magic_RST    : c[0x0][0x18c]
+    param_shift_RST    : c[0x0][0x190]
+    param_RS           : c[0x0][0x194]
+    param_magic_RS     : c[0x0][0x198]
+    param_shift_RS     : c[0x0][0x19c]
+    param_S            : c[0x0][0x1a0]
+    param_magic_S      : c[0x0][0x1a4]
+    param_shift_S      : c[0x0][0x1a8]
+    param_pad_d        : c[0x0][0x1ac]
+    param_pad_h        : c[0x0][0x1b0]
+    param_pad_w        : c[0x0][0x1b4]
+    param_str_d        : c[0x0][0x1b8]
+    param_str_h        : c[0x0][0x1bc]
+    param_str_w        : c[0x0][0x1c0]
+    param_dil_d        : c[0x0][0x1c4]
+    param_dil_h        : c[0x0][0x1c8]
+    param_dil_w        : c[0x0][0x1cc]
+    param_P            : c[0x0][0x1d0]
+    param_Q            : c[0x0][0x1d4]
+    param_PQ           : c[0x0][0x1d8]
+    param_QN           : c[0x0][0x1dc]
+    param_PQN          : c[0x0][0x1e0]
+    param_MPQN         : c[0x0][0x1e4]
+    param_magic_Q      : c[0x0][0x1e8]
+    param_shift_Q      : c[0x0][0x1ec]
+    param_magic_PQ     : c[0x0][0x1f0]
+    param_shift_PQ     : c[0x0][0x1f4]
+    param_grid_P       : c[0x0][0x1f8]
+    param_grid_Q       : c[0x0][0x1fc]
+    param_grid_PQ      : c[0x0][0x200]
+    param_CRSTK        : c[0x0][0x204]
+</CONSTANT_MAPPING>
+
+# Register map. Registers 0-63 hold the 8x8 accumulator tile under two names
+# (czeroNN and cx<0-7>y<0-7>). Several of the remaining ranges overlap on
+# purpose (e.g. 64-79 j0Ex/j0Iy vs 64-72 c/z/y/x/Q, and 68-83 f/track..F vs the
+# 68-99 setup temporaries), so the same registers are reused by different
+# phases of the kernel.
+# NOTE(review): ':' is presumed to pin names to registers in listed order and
+# '~' to let the assembler place names within the range - confirm against the
+# assembler documentation.
+<REGISTER_MAPPING>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-67   ~ tid, blkI, blkE, one
+    68-99   ~ blkMPQ, tidX, tid1, shiftX, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+    64-72   ~ c<0-1>, z<0-1>, y<0-1>, x<0-1>, Q
+    73-99   ~ mt, pr, qs, r<0-1>, s<0-1>, t<0-1>, rst<0-1>, rs<0-1>
+    73-99   ~ te, ti<0-1>, xw<0-1>, xW<0-1>, yh<0-1>, yH<0-1>, zd<0-1>, zD<0-1>, cC<0-1>, nextP, nextQ
+
+    64-79   : j0Ex<0-7>, j0Iy<0-7>
+    80-95   : j1Ex<0-7>, j1Iy<0-7>
+
+    100-131 : load0I<0-7>,  load1I<0-7>,  loadE<0-7>, storeX<0-7>
+    132-137 : track0I<0-1>, track1I<0-1>, trackE<0-1>
+
+    138-164 ~ writeIs, writeEs, loopN, m, p, q, qq, k, crst<0-1>, tidY
+    165-167 ~ readIs, readEs, swapBuf
+
+     68-83  : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
+     84-164 ~ K, K4, K1, K60, tid31, tid96, kk, tf, writeCs, readCs, crst<00|04|08|12>, alpha, blk_MPQ, CRSTK, xmad_determ
+
+</REGISTER_MAPPING>
+
+// Kernel entry: read tid and the block indices. P0 = (param_RST == 1)
+// selects which SR_CTAID component feeds blkMPQ / blkI / blkE.
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
+--:-:-:-:5  @P0 BRA.U CTAID1;
+// param_RST != 1: blkMPQ <- CTAID.X, blkI <- CTAID.Y, blkE <- CTAID.Z
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X;
+--:-:3:-:1      S2R blkI,   SR_CTAID.Y;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
+--:-:-:-:5      BRA.U END_CTAID1;
+CTAID1:
+// param_RST == 1: blkMPQ <- CTAID.Z, blkI <- CTAID.X, blkE <- CTAID.Y
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.Z;
+--:-:3:-:1      S2R blkI,   SR_CTAID.X;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Y;
+END_CTAID1:
+
+<SCHEDULE_BLOCK>
+// One-time setup: thread index split, accumulator zeroing, m/p/q decomposition
+// of blkMPQ, shared-memory read/write offsets, and the crst/k indices.
+
+// tidX   = tid >> 1
+// tidY   = (tid & 1) << 3
+// shiftX = (tid & 1) << 4
+01:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      SHR.U32 tidX,   tid,  1;
+--:-:-:-:1      SHL     tidY,   tid1, 3;
+--:-:-:-:1      SHL     shiftX, tid1, 4;
+
+// Store zeros at addr_zero, then load them back into czero00, czero04, ...,
+// czero60 (16 x 128-bit loads) to clear all 64 accumulator registers.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;
+</CODE>
+
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_grid_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_grid_PQ;
+
+// P1/P2: the magic multiplier is not 1, so the multiply path is needed;
+// otherwise only the shift is applied.
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT;
+
+// m = blkMPQ / PQ   (magic-number multiply + shift)
+02:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ - m*grid_PQ   (blkMPQ % PQ)
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = pq / Q   (magic-number multiply + shift)
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = pq - p*grid_Q   (pq % Q); qq keeps a copy of the initial q
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+--:-:-:-:1      MOV qq, q;
+
+// writeIs = (tidY*128 + tidX + shiftX) * 4 + 4x<szShareI + szShareE>
+--:-:-:-:1      ISCADD writeIs, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeIs, writeIs, shiftX;
+--:-:-:-:1      ISCADD writeIs, writeIs, 4x<szShareI + szShareE>, 2;
+
+// writeEs = (tidY*64 + tidX + shiftX) * 4 + 4x<szShareI*2 + szShareE>
+--:-:-:-:1      ISCADD writeEs, tidY, tidX, 6;
+--:-:-:-:1      IADD   writeEs, writeEs, shiftX;
+--:-:-:-:1      ISCADD writeEs, writeEs, 4x<szShareI*2 + szShareE>, 2;
+
+// readIs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND readIs, tid,   -16;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      LOP.OR  readIs, readIs, tid1;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+// readEs = (((tid >> 1) & 7) << 4) + 4x<szShareI>;
+--:-:-:-:1      BFE.U32 readEs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<szShareI>, 4;
+
+// swapBuf starts negative; the write pointers above start one
+// (szShareI + szShareE) buffer past the read pointers.
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareI + szShareE>;
+
+// crst0 = blkI*128 + tidX, crst1 = crst0 + 64
+04:-:-:-:1      ISCADD crst0, blkI, tidX, 7;
+--:-:-:-:1      IADD   crst1, crst0, 64;
+
+// k = blkE*64 + tidX + offset_K
+08:-:-:-:1      ISCADD k, blkE, tidX, 6;
+--:-:-:-:1      IADD   k, k, param_offset_K;
+
+--:-:-:-:1      MOV loopN, RZ;
+
+// Flag for first load branch
+--:-:-:-:1      PSETP.AND.AND P0, PT, PT, PT, PT;
+</SCHEDULE_BLOCK>
+
+// NEXT_PQ: entered once per (p, q) output position. Decomposes the two crst
+// indices handled by this thread (crst0, crst1) into c / t / r / s and derives
+// the input coordinates z / y / x for each of them.
+NEXT_PQ:
+
+<SCHEDULE_BLOCK>
+// Zigzag q but only if grid_P < P
+// P1 = (p is odd) && (grid_P < P); then Q = param_Q - q - 1, else Q = q.
+--:-:-:-:1      LOP.AND.NZ P1, RZ, p, 1;
+--:-:-:-:1      MOV Q, param_grid_P;
+--:-:-:-:1      ISETP.LT.AND P1, PT, Q, param_P, P1;
+--:-:-:-:1      MOV Q, -1;
+--:-:-:-:1  @P1 IADD3 Q, -q, param_Q, Q;
+--:-:-:-:1 @!P1 MOV Q, q;
+// c   = crst / RST
+// rst = crst % RST
+--:-:-:-:1      XMAD.LO2C  c0, crst0, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32    c0, c0, param_shift_RST;
+--:-:-:-:1      XMAD rst0, c0, param_RST, RZ;
+--:-:-:-:1      IADD rst0, -rst0, crst0;
+--:-:-:-:1      XMAD.LO2C  c1, crst1, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32    c1, c1, param_shift_RST;
+--:-:-:-:1      XMAD rst1, c1, param_RST, RZ;
+--:-:-:-:1      IADD rst1, -rst1, crst1;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C  t0, rst0, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32    t0, t0, param_shift_RS;
+--:-:-:-:1      XMAD  rs0, t0, param_RS, RZ;
+--:-:-:-:1      IADD  rs0, -rs0, rst0;
+--:-:-:-:1      XMAD.LO2C  t1, rst1, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32    t1, t1, param_shift_RS;
+--:-:-:-:1      XMAD  rs1, t1, param_RS, RZ;
+--:-:-:-:1      IADD  rs1, -rs1, rst1;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C  r0, rs0, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32    r0, r0, param_shift_S;
+--:-:-:-:1      XMAD   s0, r0, param_S, RZ;
+--:-:-:-:1      IADD   s0, -s0, rs0;
+--:-:-:-:1      XMAD.LO2C  r1, rs1, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32    r1, r1, param_shift_S;
+--:-:-:-:1      XMAD   s1, r1, param_S, RZ;
+--:-:-:-:1      IADD   s1, -s1, rs1;
+// z = m * str_d + t * dil_d - pad_d
+// y = p * str_h + r * dil_h - pad_h
+// x = Q * str_w + s * dil_w - pad_w
+// The strided bases mt/pr/qs are shared; each track (suffix 0 and 1) adds its
+// own dilated filter offset into its own z/y/x registers.
+--:-:-:-:1      XMAD  mt, m,   param_str_d, RZ;
+--:-:-:-:1      XMAD  pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD  qs, Q,   param_str_w, RZ;
+--:-:-:-:1      XMAD  z1, t1,  param_dil_d, mt;
+--:-:-:-:1      XMAD  y1, r1,  param_dil_h, pr;
+--:-:-:-:1      XMAD  x1, s1,  param_dil_w, qs;
+--:-:-:-:1      XMAD  z0, t0,  param_dil_d, mt;
+--:-:-:-:1      XMAD  y0, r0,  param_dil_h, pr;
+--:-:-:-:1      XMAD  x0, s0,  param_dil_w, qs;
+--:-:-:-:1      IADD  z1, z1, -param_pad_d;
+--:-:-:-:1      IADD  y1, y1, -param_pad_h;
+--:-:-:-:1      IADD  x1, x1, -param_pad_w;
+--:-:-:-:1      IADD  z0, z0, -param_pad_d;
+--:-:-:-:1      IADD  y0, y0, -param_pad_h;
+--:-:-:-:1      IADD  x0, x0, -param_pad_w;
+</SCHEDULE_BLOCK>
+
+// Split blocks to fit inside of 36 registers
+<SCHEDULE_BLOCK>
+
+// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY
+// (computed per track as an element index, then turned into a 64-bit address
+// with LEA shift 1, i.e. index * 2 added to param_I)
+--:-:-:-:1      XMAD.LO2C ti0, c0, param_DHWN, tidY;
+--:-:-:-:1      XMAD.LO2C ti0, z0, param_HWN,  ti0;
+--:-:-:-:1      XMAD.LO2C ti0, y0, param_WN,   ti0;
+--:-:-:-:1      XMAD      ti0, x0, param_N,    ti0;
+--:-:-:-:1      XMAD.LO2C ti1, c1, param_DHWN, tidY;
+--:-:-:-:1      XMAD.LO2C ti1, z1, param_HWN,  ti1;
+--:-:-:-:1      XMAD.LO2C ti1, y1, param_WN,   ti1;
+--:-:-:-:1      XMAD      ti1, x1, param_N,    ti1;
+--:-:-:-:1      LEA      track0I0.CC, ti0, param_I[0],     1;
+--:-:-:-:1      LEA.HI.X track0I1,    ti0, param_I[1], RZ, 1;
+--:-:-:-:1      LEA      track1I0.CC, ti1, param_I[0],     1;
+--:-:-:-:1      LEA.HI.X track1I1,    ti1, param_I[1], RZ, 1;
+
+// trackE = k*MPQN + m*PQN + p*QN + Q*N + tidY
+--:-:-:-:1      XMAD.LO2C te, k, param_MPQN, tidY;
+--:-:-:-:1      XMAD.LO2C te, m, param_PQN,  te;
+--:-:-:-:1      XMAD.LO2C te, p, param_QN,   te;
+--:-:-:-:1      XMAD      te, Q, param_N,    te;
+--:-:-:-:1      LEA      trackE0.CC, te, param_E[0],     1;
+--:-:-:-:0      LEA.HI.X trackE1,    te, param_E[1], RZ, 1;
+
+// Bounds check x,y,z,c for each I track.
+// If out of bounds, this will set the track address to -1
+// (each ISET result is ORed into the low address word; LOP3.LUT 0xfe is a
+// three-input OR).
+--:-:-:-:1      ISET.GE.AND cC0, c0, param_C, PT;
+--:-:-:-:1      ISET.LT.AND zd0, z0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD0, z0, param_D, PT;
+--:-:-:-:1      ISET.LT.AND yh0, y0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH0, y0, param_H, PT;
+--:-:-:-:1      ISET.LT.AND xw0, x0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW0, x0, param_W, PT;
+--:-:-:-:1      LOP.OR   track0I0, track0I0, cC0;
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, zd0, zD0, 0xfe;
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, yh0, yH0, 0xfe;
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, xw0, xW0, 0xfe;
+
+--:-:-:-:1      ISET.GE.AND cC1, c1, param_C, PT;
+--:-:-:-:1      ISET.LT.AND zd1, z1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD1, z1, param_D, PT;
+--:-:-:-:1      ISET.LT.AND yh1, y1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH1, y1, param_H, PT;
+--:-:-:-:1      ISET.LT.AND xw1, x1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW1, x1, param_W, PT;
+--:-:-:-:1      LOP.OR   track1I0, track1I0, cC1;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, zd1, zD1, 0xfe;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, yh1, yH1, 0xfe;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, xw1, xW1, 0xfe;
+
+--:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+
+// P2/P3: track 0 / track 1 address is in bounds (low word != -1)
+// P4: k < K
+// P5/P6: another q / p position remains (nextQ < Q / nextP < P)
+--:-:-:-:1      ISETP.NE.AND P2, PT, track0I0, -1, PT;
+--:-:-:-:0      ISETP.NE.AND P3, PT, track1I0, -1, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, k, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, nextP, param_P, PT;
+
+// loopN accumulates param_N on every pass through this block.
+--:-:-:-:1      IADD loopN, loopN, param_N;
+</SCHEDULE_BLOCK>
+
+// P0 is the first-load flag: when set, branch to FIRST_LOAD instead of
+// falling through into the loop preamble.
+--:-:-:Y:5  @P0 BRA.U FIRST_LOAD;
+
+// INIT_LOOP: preload the j0 operand registers (row 0 of E and I) from shared
+// memory before entering the unrolled loop.
+INIT_LOOP:
+
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*64  + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*64  + 32>];
+--:-:1:-:2      LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>];
+
+NEXT_16N:
+
+[+
+
+    # %insert maps a slot key "j<J>c<C>" to the instruction text emitted right
+    # after FFMA number C of unrolled step J (J in 0..15, C in 0..63). The
+    # generator further down in this block adds the per-step LDS entries
+    # (slots c0/c2/c4/c6) and interleaves everything with the FFMAs.
+    our $convert;
+    my %insert =
+    (
+        j0c8   => "--:-:-:-:1      IADD loopN, loopN, -16;\n",
+
+        # P0 = (loopN & 16) != 0   (LOP.AND.NZ)
+        # P1 = loopN >= 32 && P0
+        j0c14   => "--:-:-:-:1      LOP.AND.NZ P0, RZ, loopN, 16;\n",
+        j0c28   => "--:-:-:-:1      ISETP.GE.AND P1, PT, loopN, 32, P0;\n",
+
+        # Track 0 of I: @P0 converts the upper registers load0I4-7,
+        # @!P0 converts the lower registers load0I0-3.
+        j0c34   => "--:-:-:-:1  \@P0 $convert storeX7, load0I7.H1;\n",
+        j0c38   => "--:-:-:-:1  \@P0 $convert storeX6, load0I7.H0;\n",
+        j0c42   => "--:-:-:-:1  \@P0 $convert storeX5, load0I6.H1;\n",
+        j0c46   => "--:-:-:-:1  \@P0 $convert storeX4, load0I6.H0;\n",
+        j0c50   => "--:-:-:-:1  \@P0 $convert storeX3, load0I5.H1;\n",
+        j0c54   => "--:-:-:-:1  \@P0 $convert storeX2, load0I5.H0;\n",
+        j0c58   => "--:-:-:-:1  \@P0 $convert storeX1, load0I4.H1;\n",
+        j0c62   => "--:-:-:-:1  \@P0 $convert storeX0, load0I4.H0;\n",
+
+        j1c34   => "02:-:-:-:1 \@!P0 $convert storeX7, load0I3.H1;\n",
+        j1c38   => "--:-:-:-:1 \@!P0 $convert storeX6, load0I3.H0;\n",
+        j1c42   => "--:-:-:-:1 \@!P0 $convert storeX5, load0I2.H1;\n",
+        j1c46   => "--:-:5:-:1 \@!P0 $convert storeX4, load0I2.H0;\n",
+        j1c50   => "--:-:-:-:1 \@!P0 $convert storeX3, load0I1.H1;\n",
+        j1c54   => "--:-:-:-:1 \@!P0 $convert storeX2, load0I1.H0;\n",
+        j1c58   => "--:-:-:-:1 \@!P0 $convert storeX1, load0I0.H1;\n",
+        j1c62   => "--:-:2:-:1 \@!P0 $convert storeX0, load0I0.H0;\n",
+
+        # Write the eight converted track-0 values to shared memory (column offset 0).
+        j2c8    => "10:-:-:-:1      STS [writeIs + 4x<7*128 +  0>], storeX7;\n",
+        j2c10   => "--:-:-:-:1      STS [writeIs + 4x<6*128 +  0>], storeX6;\n",
+        j2c12   => "--:-:-:-:1      STS [writeIs + 4x<5*128 +  0>], storeX5;\n",
+        j2c14   => "--:-:-:-:1      STS [writeIs + 4x<4*128 +  0>], storeX4;\n",
+        j2c16   => "02:-:-:-:1      STS [writeIs + 4x<3*128 +  0>], storeX3;\n",
+        j2c18   => "--:-:-:-:1      STS [writeIs + 4x<2*128 +  0>], storeX2;\n",
+        j2c20   => "--:-:-:-:1      STS [writeIs + 4x<1*128 +  0>], storeX1;\n",
+        j2c22   => "--:2:-:-:1      STS [writeIs + 4x<0*128 +  0>], storeX0;\n",
+
+        # With P1 set: P2 = track 0 in bounds (global load),
+        # P3 = out of bounds (zero fill from addr_zero).
+        j2c24   => "--:-:-:-:1      ISETP.NE.AND P2, PT, track0I0, -1, P1;\n",
+        j2c26   => "--:-:-:-:1      ISETP.EQ.AND P3, PT, track0I0, -1, P1;\n",
+
+        j3c8    => "02:-:-:-:1  \@P2 LDG.E.CI.128 load0I0, [track0I + 2x< 0>];\n",
+        j3c10   => "--:5:2:-:1  \@P2 LDG.E.CI.128 load0I4, [track0I + 2x<16>];\n",
+
+        j4c8    => "--:-:-:-:1  \@P3 LDS.U.128 load0I0, [addr_zero];\n",
+        j5c8    => "--:-:-:-:1  \@P3 LDS.U.128 load0I4, [addr_zero];\n",
+
+        # Advance the 64-bit track-0 pointer by 2x<32> after a global load.
+        j5c57   => "10:-:-:-:1  \@P2 IADD   track0I0.CC, track0I0, 2x<32>;\n",
+        j5c63   => "--:-:-:-:1  \@P2 IADD.X track0I1,    track0I1, RZ;\n",
+
+
+        # Track 1 of I: same convert / store / reload sequence, written at
+        # column offset 64.
+        j5c34   => "--:-:-:-:1  \@P0 $convert storeX7, load1I7.H1;\n",
+        j5c38   => "--:-:-:-:1  \@P0 $convert storeX6, load1I7.H0;\n",
+        j5c42   => "--:-:-:-:1  \@P0 $convert storeX5, load1I6.H1;\n",
+        j5c46   => "--:-:-:-:1  \@P0 $convert storeX4, load1I6.H0;\n",
+        j5c50   => "--:-:-:-:1  \@P0 $convert storeX3, load1I5.H1;\n",
+        j5c54   => "--:-:-:-:1  \@P0 $convert storeX2, load1I5.H0;\n",
+        j5c58   => "--:-:-:-:1  \@P0 $convert storeX1, load1I4.H1;\n",
+        j5c62   => "--:-:-:-:1  \@P0 $convert storeX0, load1I4.H0;\n",
+
+        j6c34   => "04:-:-:-:1 \@!P0 $convert storeX7, load1I3.H1;\n",
+        j6c38   => "--:-:-:-:1 \@!P0 $convert storeX6, load1I3.H0;\n",
+        j6c42   => "--:-:-:-:1 \@!P0 $convert storeX5, load1I2.H1;\n",
+        j6c46   => "--:-:5:-:1 \@!P0 $convert storeX4, load1I2.H0;\n",
+        j6c50   => "--:-:-:-:1 \@!P0 $convert storeX3, load1I1.H1;\n",
+        j6c54   => "--:-:-:-:1 \@!P0 $convert storeX2, load1I1.H0;\n",
+        j6c58   => "--:-:-:-:1 \@!P0 $convert storeX1, load1I0.H1;\n",
+        j6c62   => "--:-:3:-:1 \@!P0 $convert storeX0, load1I0.H0;\n",
+
+        j7c8    => "10:-:-:-:1      STS [writeIs + 4x<7*128 + 64>], storeX7;\n",
+        j7c10   => "--:-:-:-:1      STS [writeIs + 4x<6*128 + 64>], storeX6;\n",
+        j7c12   => "--:-:-:-:1      STS [writeIs + 4x<5*128 + 64>], storeX5;\n",
+        j7c14   => "--:-:-:-:1      STS [writeIs + 4x<4*128 + 64>], storeX4;\n",
+        j7c16   => "04:-:-:-:1      STS [writeIs + 4x<3*128 + 64>], storeX3;\n",
+        j7c18   => "--:-:-:-:1      STS [writeIs + 4x<2*128 + 64>], storeX2;\n",
+        j7c20   => "--:-:-:-:1      STS [writeIs + 4x<1*128 + 64>], storeX1;\n",
+        j7c22   => "--:3:-:-:1      STS [writeIs + 4x<0*128 + 64>], storeX0;\n",
+
+        j7c24   => "--:-:-:-:1      ISETP.NE.AND P2, PT, track1I0, -1, P1;\n",
+        j7c26   => "--:-:-:-:1      ISETP.EQ.AND P3, PT, track1I0, -1, P1;\n",
+
+        j8c8    => "04:-:-:-:1  \@P2 LDG.E.CI.128 load1I0, [track1I + 2x< 0>];\n",
+        j8c10   => "--:5:3:-:1  \@P2 LDG.E.CI.128 load1I4, [track1I + 2x<16>];\n",
+
+        j9c8    => "--:-:-:-:1  \@P3 LDS.U.128 load1I0, [addr_zero];\n",
+        j10c8   => "--:-:-:-:1  \@P3 LDS.U.128 load1I4, [addr_zero];\n",
+
+        j10c57  => "10:-:-:-:1  \@P2 IADD   track1I0.CC, track1I0, 2x<32>;\n",
+        j10c63  => "--:-:-:-:1  \@P2 IADD.X track1I1,    track1I1, RZ;\n",
+
+
+        # E operand: convert, store via writeEs, and reload when k < K.
+        # No zero-fill entries are present for E here.
+        j10c34  => "--:-:-:-:1  \@P0 $convert storeX7, loadE7.H1;\n",
+        j10c38  => "--:-:-:-:1  \@P0 $convert storeX6, loadE7.H0;\n",
+        j10c42  => "--:-:-:-:1  \@P0 $convert storeX5, loadE6.H1;\n",
+        j10c46  => "--:-:-:-:1  \@P0 $convert storeX4, loadE6.H0;\n",
+        j10c50  => "--:-:-:-:1  \@P0 $convert storeX3, loadE5.H1;\n",
+        j10c54  => "--:-:-:-:1  \@P0 $convert storeX2, loadE5.H0;\n",
+        j10c58  => "--:-:-:-:1  \@P0 $convert storeX1, loadE4.H1;\n",
+        j10c62  => "--:-:-:-:1  \@P0 $convert storeX0, loadE4.H0;\n",
+
+        j11c34  => "08:-:-:-:1 \@!P0 $convert storeX7, loadE3.H1;\n",
+        j11c38  => "--:-:-:-:1 \@!P0 $convert storeX6, loadE3.H0;\n",
+        j11c42  => "--:-:-:-:1 \@!P0 $convert storeX5, loadE2.H1;\n",
+        j11c46  => "--:-:5:-:1 \@!P0 $convert storeX4, loadE2.H0;\n",
+        j11c50  => "--:-:-:-:1 \@!P0 $convert storeX3, loadE1.H1;\n",
+        j11c54  => "--:-:-:-:1 \@!P0 $convert storeX2, loadE1.H0;\n",
+        j11c58  => "--:-:-:-:1 \@!P0 $convert storeX1, loadE0.H1;\n",
+        j11c62  => "--:-:4:-:1 \@!P0 $convert storeX0, loadE0.H0;\n",
+
+        j12c8   => "10:-:-:-:1      STS [writeEs + 4x<7*64>], storeX7;\n",
+        j12c10  => "--:-:-:-:1      STS [writeEs + 4x<6*64>], storeX6;\n",
+        j12c12  => "--:-:-:-:1      STS [writeEs + 4x<5*64>], storeX5;\n",
+        j12c14  => "--:-:-:-:1      STS [writeEs + 4x<4*64>], storeX4;\n",
+        j12c16  => "08:-:-:-:1      STS [writeEs + 4x<3*64>], storeX3;\n",
+        j12c18  => "--:-:-:-:1      STS [writeEs + 4x<2*64>], storeX2;\n",
+        j12c20  => "--:-:-:-:1      STS [writeEs + 4x<1*64>], storeX1;\n",
+        j12c22  => "--:4:-:-:1      STS [writeEs + 4x<0*64>], storeX0;\n",
+
+        j12c24  => "--:-:-:-:1      ISETP.LT.AND P2, PT, k, param_K,  P1;\n",
+
+        j13c8   => "08:-:-:-:1  \@P2 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];\n",
+        j13c10  => "--:5:4:-:1  \@P2 LDG.E.CI.128 loadE4, [trackE + 2x<16>];\n",
+
+        # Slot c62 is used for the carry add here because j15c63 holds the
+        # loop-exit branch sequence.
+        j15c57  => "10:-:-:-:1  \@P2 IADD   trackE0.CC, trackE0, 2x<32>;\n",
+        j15c62  => "--:-:-:-:1  \@P2 IADD.X trackE1,    trackE1, RZ;\n",
+
+        # p0 = N >= 16 and not (N == 32 and (p or q))
+        j14c8   => "--:-:-:-:1      ISETP.EQ.AND  P0, PT, loopN, 32, PT;\n",
+        j14c10  => "--:-:-:-:1      ISETP.GE.AND  P1, PT, loopN, 16, PT;\n",
+        j14c22  => "--:-:-:-:1      PSETP.OR.AND  P0, PT, P5, P6, P0;\n",
+        j14c35  => "--:-:-:-:1      PSETP.AND.AND P0, PT, !P0, P1, PT;\n",
+
+        # Barrier, then swap the shared-memory buffers: read pointers move by
+        # -swapBuf, write pointers by +swapBuf, and swapBuf is negated.
+        j14c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                   "20:-:-:-:1      IADD readEs,  readEs, -swapBuf;\n" .
+                   "--:-:-:-:1      IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1      IADD writeEs, writeEs, swapBuf;\n" .
+                   "--:-:-:-:1      IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;\n",
+
+        # Loop tail: P0 repeats NEXT_16N; else P5 steps q, else P6 restores q
+        # from qq and steps p (both re-enter NEXT_PQ); otherwise go to FINISH.
+        j15c63  => "--:-:-:Y:5  \@P0 BRA.U NEXT_16N;\n" .
+                   "--:-:-:-:0  \@P5 IADD q, q, param_grid_Q;\n" .
+                   "01:-:-:Y:5  \@P5 BRA.U NEXT_PQ;\n" .
+                   "--:-:-:-:1  \@P6 MOV  q, qq;\n" .
+                   "--:-:-:-:0  \@P6 IADD p, p, param_grid_P;\n" .
+                   "--:-:-:Y:5  \@P6 BRA.U NEXT_PQ;\n" .
+                   "--:-:-:Y:5      BRA.U FINISH;\n",
+    );
+
+    # Build the order in which the 64 accumulators cx<x>y<y> are visited within
+    # one unrolled step: for each even x, walk the y bases (reversing their
+    # order on every other x) and expand each (x, y) base through the four
+    # @swirl offsets.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    # Emit 16 unrolled steps of 64 FFMAs each. Step j reads operand set
+    # j<odd> and, through slots c0/c2/c4/c6, loads the other set j<nOdd> for
+    # the next step.
+    my $out;
+    foreach my $j (0 .. 15)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = 1 - $odd;
+        # Shared-memory row for the next step; wraps to row 0 after step 15.
+        my $rsOffset = ($j + 1) & 15;
+        # Only the wrap-around loads of step 15 are predicated, on P0.
+        my $rsPred   = $j == 15 ? '@P0' : '   ';
+        # Rows 8..15 get an extra offset of 16 words.
+        my $shift    = $rsOffset < 8 ? 0 : 1;
+        # Step 14's last load sets barrier 6, which the buffer-swap code in
+        # slot j14c63 waits on (its "20" wait mask).
+        my $barrier  = $j == 14 ? '6' : '-';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64  + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64  + 32 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c6"} = sprintf "--:%s:1:-:1  %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            # Instruction text (possibly several lines) to place after this FFMA.
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            # The FFMA's stall field is 0 when the inserted text contains one
+            # of these opcodes, 1 otherwise.
+            my $stall  = $ins =~ /LDS|I2I|F2F|I2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            # Yield flag only on FFMA 32 of a step, and only if it has stall 1.
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            # The first FFMA of every step waits on barrier 1 (set by the c6 load).
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
++]
+
+FIRST_LOAD:
+
+--:-:-:-:8      PSETP.AND.AND P0, PT, PT, PT, !PT;
+
+--:-:-:-:1  @P2 LDG.E.CI.128 load0I0, [track0I + 2x< 0>];
+--:-:1:-:1  @P2 LDG.E.CI.128 load0I4, [track0I + 2x<16>];
+--:-:-:-:1 @!P2 LDS.U.128    load0I0, [addr_zero];
+--:-:4:-:1 @!P2 LDS.U.128    load0I4, [addr_zero];
+
+// p1 = N == 32 and (p or q)
+--:-:-:-:0      ISETP.EQ.AND  P1, PT, loopN, 32, PT;
+
+--:-:-:-:1  @P3 LDG.E.CI.128 load1I0, [track1I + 2x< 0>];
+--:-:2:-:1  @P3 LDG.E.CI.128 load1I4, [track1I + 2x<16>];
+--:-:-:-:1 @!P3 LDS.U.128    load1I0, [addr_zero];
+--:-:5:-:1 @!P3 LDS.U.128    load1I4, [addr_zero];
+
+--:-:-:-:1  @P4 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];
+--:-:3:-:1  @P4 LDG.E.CI.128 loadE4, [trackE + 2x<16>];
+--:-:-:-:1 @!P4 LDS.U.128    loadE0, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128    loadE4, [addr_zero];
+
+
+09:-:-:-:1      [+ convert() +] storeX7, load0I3.H1;
+--:-:-:-:1      [+ convert() +] storeX6, load0I3.H0;
+--:-:-:-:1      [+ convert() +] storeX5, load0I2.H1;
+--:-:1:-:1      [+ convert() +] storeX4, load0I2.H0;
+--:-:-:-:1      [+ convert() +] storeX3, load0I1.H1;
+--:-:-:-:1      [+ convert() +] storeX2, load0I1.H0;
+--:-:-:-:1      [+ convert() +] storeX1, load0I0.H1;
+--:-:4:-:1      [+ convert() +] storeX0, load0I0.H0;
+
+--:-:-:-:1      PSETP.OR.AND  P1, PT, P5, P6, P1;
+--:-:-:-:0  @P2 IADD   track0I0.CC, track0I0, 2x<32>;
+
+01:-:-:-:1      STS [writeIs + 4x<7*128 +  0>], storeX7;
+--:-:-:-:1      STS [writeIs + 4x<6*128 +  0>], storeX6;
+--:-:-:-:1      STS [writeIs + 4x<5*128 +  0>], storeX5;
+--:-:-:-:1      STS [writeIs + 4x<4*128 +  0>], storeX4;
+08:-:-:-:1      STS [writeIs + 4x<3*128 +  0>], storeX3;
+--:-:-:-:1      STS [writeIs + 4x<2*128 +  0>], storeX2;
+--:-:-:-:1      STS [writeIs + 4x<1*128 +  0>], storeX1;
+--:1:-:-:2      STS [writeIs + 4x<0*128 +  0>], storeX0;
+
+--:-:-:-:0  @P2 IADD.X track0I1,    track0I1, RZ;
+
+13:-:-:-:1      [+ convert() +] storeX7, load1I3.H1;
+--:-:-:-:1      [+ convert() +] storeX6, load1I3.H0;
+--:-:-:-:1      [+ convert() +] storeX5, load1I2.H1;
+--:-:2:-:1      [+ convert() +] storeX4, load1I2.H0;
+--:-:-:-:1      [+ convert() +] storeX3, load1I1.H1;
+--:-:-:-:1      [+ convert() +] storeX2, load1I1.H0;
+--:-:-:-:1      [+ convert() +] storeX1, load1I0.H1;
+--:-:5:-:1      [+ convert() +] storeX0, load1I0.H0;
+
+--:-:-:-:1      PSETP.AND.AND P5, PT, P5, P1, PT;
+--:-:-:-:0  @P3 IADD   track1I0.CC, track1I0, 2x<32>;
+
+02:-:-:-:1      STS [writeIs + 4x<7*128 + 64>], storeX7;
+--:-:-:-:1      STS [writeIs + 4x<6*128 + 64>], storeX6;
+--:-:-:-:1      STS [writeIs + 4x<5*128 + 64>], storeX5;
+--:-:-:-:1      STS [writeIs + 4x<4*128 + 64>], storeX4;
+10:-:-:-:1      STS [writeIs + 4x<3*128 + 64>], storeX3;
+--:-:-:-:1      STS [writeIs + 4x<2*128 + 64>], storeX2;
+--:-:-:-:1      STS [writeIs + 4x<1*128 + 64>], storeX1;
+--:1:-:-:1      STS [writeIs + 4x<0*128 + 64>], storeX0;
+
+--:-:-:-:1      PSETP.AND.AND P6, PT, P6, P1, PT;
+--:-:-:-:0  @P3 IADD.X track1I1,    track1I1, RZ;
+
+25:-:-:-:1      [+ convert() +] storeX7, loadE3.H1;
+--:-:-:-:1      [+ convert() +] storeX6, loadE3.H0;
+--:-:-:-:1      [+ convert() +] storeX5, loadE2.H1;
+--:-:3:-:1      [+ convert() +] storeX4, loadE2.H0;
+--:-:-:-:1      [+ convert() +] storeX3, loadE1.H1;
+--:-:-:-:1      [+ convert() +] storeX2, loadE1.H0;
+--:-:-:-:1      [+ convert() +] storeX1, loadE0.H1;
+--:-:6:-:1      [+ convert() +] storeX0, loadE0.H0;
+
+--:-:-:-:0  @P4 IADD   trackE0.CC, trackE0, 2x<32>;
+
+04:-:-:-:1      STS [writeEs + 4x<7*64>], storeX7;
+--:-:-:-:1      STS [writeEs + 4x<6*64>], storeX6;
+--:-:-:-:1      STS [writeEs + 4x<5*64>], storeX5;
+--:-:-:-:1      STS [writeEs + 4x<4*64>], storeX4;
+20:-:-:-:1      STS [writeEs + 4x<3*64>], storeX3;
+--:-:-:-:1      STS [writeEs + 4x<2*64>], storeX2;
+--:-:-:-:1      STS [writeEs + 4x<1*64>], storeX1;
+--:1:-:-:1      STS [writeEs + 4x<0*64>], storeX0;
+
+--:-:-:-:1  @P4 IADD.X trackE1,    trackE1, RZ;
+
+--:-:-:-:1      IADD readEs,  readEs, -swapBuf;
+--:-:-:-:0      IADD readIs,  readIs, -swapBuf;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeEs, writeEs, swapBuf;
+--:-:-:-:1      IADD writeIs, writeIs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;
+
+--:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+
+--:-:-:-:0  @P5 IADD q, q, param_grid_Q;
+--:-:-:Y:5  @P5 BRA.U NEXT_PQ;
+--:-:-:-:0  @P6 IADD p, p, param_grid_P;
+--:-:-:Y:5  @P6 BRA.U NEXT_PQ;
+
+--:-:-:-:2      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
+--:-:-:-:0      ISETP.LT.AND P6, PT, nextP, param_P, PT;
+
+--:-:-:Y:5      BRA.U INIT_LOOP;
+
+
+FINISH:
+
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
+--:-:-:-:5  @P0 BRA.U CTAID2;
+--:-:2:-:1      S2R blkI,    SR_CTAID.Y;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Z;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.X;
+--:-:-:-:5      BRA.U END_CTAID2;
+CTAID2:
+--:-:2:-:1      S2R blkI,    SR_CTAID.X;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Y;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.Z;
+END_CTAID2:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readEs,  readEs, -4x<szShareI>;
+--:-:-:-:1  @P0 IADD readIs,  readIs, -swapBuf;
+--:-:-:-:1  @P0 IADD readEs,  readEs, -swapBuf;
+
+// writeCs = (readIs / 4) * 64 + readEs;
+--:-:-:-:1      ISCADD  writeCs, readIs, readEs, 4;
+
+
+// readCs = (((tid & 96) << 3) | (tid & 31)) << 2
+01:-:-:-:1      LOP.AND tid31, tid, 31;
+01:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 3;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+
+// kk = blkE*64 + tid31;
+04:-:-:-:1      ISCADD kk, blkE, tid31, 6;
+--:-:-:-:1      IADD   kk, kk, param_offset_K;
+
+// crst = blkI*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 crst00, tid96,  1;
+02:-:-:-:1      ISCADD  crst00, blkI,   crst00, 7;
+--:-:-:-:1      IADD    crst04, crst00, 4;
+--:-:-:-:1      IADD    crst08, crst00, 8;
+--:-:-:-:1      IADD    crst12, crst00, 12;
+
+--:-:-:-:1      MOV K, param_K;
+--:-:-:-:1      SHL K1, K, 2;
+--:-:-:-:1      SHL K4, K, 4;
+--:-:-:-:1      ISCADD K60, K, -K4, 8;
+
+// trackF += crst*K + k;
+--:-:-:-:1      VMAD.U16.U16 tf, crst00, K, kk;
+[+
+    our $determ;   # package-level flag; it is only read here, never assigned in this block
+    if ($determ)   # flag set: emit a MOV plus an XMAD.LO that folds blk_MPQ and CRSTK into tf
+    {
+        return q{
+--:-:-:-:1      MOV CRSTK, param_CRSTK;
+08:-:-:-:1      XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ;
+        };
+    }
+    return '';     # flag clear: emit nothing, so tf stays as the VMAD just before this block left it
++]
+--:-:-:-:1      LEA      track00F0.CC, tf, param_F[0],     2;
+--:-:-:-:1      LEA.HI.X track00F1,    tf, param_F[1], RZ, 2;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+// kk < K
+--:-:-:-:1      ISETP.LT.AND P5, PT, kk, param_K, PT;
+--:-:-:-:1      IADD kk, kk, 32;
+--:-:-:-:1      ISETP.LT.AND P6, PT, kk, param_K, PT;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:6      IADD   track04F0.CC, track00F0, K4;
+--:-:-:-:1      IADD.X track04F1,    track00F1, RZ;
+--:-:-:-:6      IADD   track08F0.CC, track04F0, K4;
+--:-:-:-:1      IADD.X track08F1,    track04F1, RZ;
+--:-:-:-:6      IADD   track12F0.CC, track08F0, K4;
+--:-:-:-:1      IADD.X track12F1,    track08F1, RZ;
+
+<CODE>
+    # Build the output sequence: for each row y of the cx tile, one group of FMULs followed by a CAL STORE_C.
+    my $out;
+    foreach my $y (0..7)
+    {
+        $out .=    # appended only when $y == 4 (trailing "if"): STORE_C has stepped crst/track 4 times, so +60 (K60 = 60 * K1) totals +64
+            "--:-:-:-:5      IADD   track00F0.CC, track00F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst00,       crst00,     60;\n" .
+            "--:-:-:-:1      IADD.X track00F1,    track00F1,  RZ;\n" .
+            "--:-:-:-:5      IADD   track04F0.CC, track04F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst04,       crst04,     60;\n" .
+            "--:-:-:-:1      IADD.X track04F1,    track04F1,  RZ;\n" .
+            "--:-:-:-:5      IADD   track08F0.CC, track08F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst08,       crst08,     60;\n" .
+            "--:-:-:-:1      IADD.X track08F1,    track08F1,  RZ;\n" .
+            "--:-:-:-:5      IADD   track12F0.CC, track12F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst12,       crst12,     60;\n" .
+            "--:-:-:-:1      IADD.X track12F1,    track12F1,  RZ;\n\n"  if $y == 4;
+
+        $out .= sprintf(    # every row: f0..f7 = cx0..cx7 of row $y, each multiplied by alpha
+            "--:-:-:-:1      FMUL f0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f2, cx2y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f3, cx3y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f4, cx4y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f5, cx5y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f6, cx6y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL f7, cx7y%d, alpha;\n",
+            ($y) x 8);    # the same row index fills all eight %d slots
+
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;    # the returned text replaces the whole CODE section
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Subroutine reached via CAL: writes f0-f7 for the rows tracked by crst00/04/08/12, then steps crst and track.
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K
+--:-:-:-:1      IADD         crst00, crst00, 1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K
+--:-:-:-:1      IADD         crst04, crst04, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K
+--:-:-:-:1      IADD         crst08, crst08, 1;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K
+--:-:-:-:0      IADD         crst12, crst12, 1;
+
+// Exchange f0-f7 through shared memory (STS.128 then LDS) to drop the awkward readIs/readEs mapping
+--:-:-:-:1      STS.128 [writeCs+4x<00>], f0;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], f4;
+
+--:-:1:-:1      LDS f0, [readCs + 4x<0*64 + 00>];
+--:-:2:-:1      LDS f2, [readCs + 4x<1*64 + 00>];
+--:-:3:-:1      LDS f4, [readCs + 4x<2*64 + 00>];
+--:-:4:-:1      LDS f6, [readCs + 4x<3*64 + 00>];
+
+[+
+    our $determ;    # package-level flag, only read here
+    if ($determ)    # flag set: plain STG.E.CG stores
+    {
+        return q{
+01:-:-:-:1  @P0 STG.E.CG [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 STG.E.CG [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 STG.E.CG [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 STG.E.CG [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
+    else            # flag clear: RED.E.ADD.F32 reductions into the same addresses
+    {
+        return q{
+01:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
++]
+// P0-P3 were ANDed with P6 above, so the second store group (offset 32 floats) also requires P6.
+--:-:1:-:1      LDS f1, [readCs + 4x<0*64 + 32>];
+--:-:2:-:1      LDS f3, [readCs + 4x<1*64 + 32>];
+--:-:3:-:1      LDS f5, [readCs + 4x<2*64 + 32>];
+--:-:4:-:1      LDS f7, [readCs + 4x<3*64 + 32>];
+
+[+
+    our $determ;    # same flag as in the first store group
+    if ($determ)    # flag set: plain STG.E.CG stores
+    {
+        return q{
+01:1:-:-:1  @P0 STG.E.CG [track00F + 4x<32>], f1;
+02:2:-:-:1  @P1 STG.E.CG [track04F + 4x<32>], f3;
+04:3:-:-:1  @P2 STG.E.CG [track08F + 4x<32>], f5;
+08:4:-:-:1  @P3 STG.E.CG [track12F + 4x<32>], f7;
+        };
+    }
+    else            # flag clear: RED.E.ADD.F32 reductions
+    {
+        return q{
+01:1:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<32>], f1;
+02:2:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<32>], f3;
+04:3:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<32>], f5;
+08:4:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<32>], f7;
+        };
+    }
++]
+// Step every 64-bit track pointer by K1 (K << 2) so the next call writes the following crst row.
+01:-:-:-:6      IADD   track00F0.CC, track00F0, K1;
+--:-:-:-:1      IADD.X track00F1,    track00F1, RZ;
+02:-:-:-:6      IADD   track04F0.CC, track04F0, K1;
+--:-:-:-:1      IADD.X track04F1,    track04F1, RZ;
+04:-:-:-:6      IADD   track08F0.CC, track08F0, K1;
+--:-:-:-:1      IADD.X track08F1,    track08F1, RZ;
+08:-:-:-:6      IADD   track12F0.CC, track12F0, K1;
+--:-:-:-:0      IADD.X track12F1,    track12F1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/Convolution/Maxwell/hconv_xprop_X128_N128.sass b/Kernel/Convolution/Maxwell/hconv_xprop_X128_N128.sass
new file mode 100644
index 0000000..71bae4b
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/hconv_xprop_X128_N128.sass
@@ -0,0 +1,261 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $int16;             # flag declared here but assigned elsewhere; picks the conversion opcode below
+    our $prefix = 'h';      # NOTE(review): this and the share/step values are presumably read by the included common file; confirm
+    our $shareI = 128;      # same 128 as in szShareI : (128*8) in this file's constant mapping
+    our $shareF = 128;      # same 128 as in szShareF : (128*8) in this file's constant mapping
+    our $stepI  = 32;
+    our $stepF  = 64;
+    our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';   # 16-bit integer input vs half-float input
+    sub convert {return $convert;}   # accessor used by the inline convert() templates in the load/store code
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<128*8*2 + 128*8*2 + 0>
+    szShareF  : (128*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<128*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<128*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<128*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<128*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<128*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<128*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<128*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<128*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne
+     72-111 ~ tid1, tid128, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+
+    100-103 : loadI<0-1>, loadF<0-1>
+    104-107 : storeI<0-3>
+    104-107 : storeF<0-3>
+
+    108-111 ~ offsetF, offsetI, offsetFc, offsetIc
+
+    112-113 : sliceI, sliceF
+    112-113 : sliceIF<0-1>
+
+    114-122 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset
+    123-127 ~ readFs, readIs, tid, idx_N
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-122  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidX = (tid & 31) << 2
+// tidY = tid >> 5
+--:-:-:-:1      LOP.AND tidX, tid,  31;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  5;
+
+// trackF += blkF*128 + tidX
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 7;
+
+// trackI += blkI*128 + tidX
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 7;
+
+// writeS = (128*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+
+// readFs = ((tid & 112) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    112;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = ((tid & 128) >> 3) | ((tid >> 1) & 7)
+--:-:-:-:1      LOP.AND tid128, tid,    128;
+--:-:-:-:1      SHR.U32 tid128, tid128, 3;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid128;
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.64 loadF, [trackF];
+--:-:5:-:1 @!P1 LDS.U.64    loadF, [addr_zero];
+
+--:-:3:-:1  @P1 LDG.E.64    loadI, [trackI];
+--:-:6:-:1 @!P1 LDS.U.64    loadI, [addr_zero];
+
+12:-:-:-:1      [+ convert() +] storeF3, loadF1.H1;
+--:-:-:-:1      [+ convert() +] storeF2, loadF1.H0;
+--:-:-:-:1      [+ convert() +] storeF1, loadF0.H1;
+--:-:2:-:2      [+ convert() +] storeF0, loadF0.H0;
+
+02:1:-:-:2      STS.128 [writeS], storeF;
+
+25:-:-:-:1      [+ convert() +] storeI3, loadI1.H1;
+--:-:-:-:1      [+ convert() +] storeI2, loadI1.H0;
+--:-:-:-:1      [+ convert() +] storeI1, loadI0.H1;
+--:-:2:-:2      [+ convert() +] storeI0, loadI0.H0;
+
+02:1:-:-:1      STS.128 [writeS + 4x<szShareF>], storeI;
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.64 loadF, [trackF];
+--:-:3:-:1  @P1 LDG.E.64    loadI, [trackI];
+
+[-
+    our $convert;   # opcode string chosen in this file's header block
+    our %insert =   # keys are "j<J>c<C>"; NOTE(review): presumably main_loop() from the included file splices each entry after FFMA C of step J; confirm
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+        # P1 is posCRST >= 0 and P0 is posCRST >= -8; the entries below are predicated on them.
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+        # channel = trunc(posCRSTf * lutSizeRcp * (1 + 2^-24)); the FFMA constant is 2^-24.
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+        # lutOffset = (posCRST - channel*lutSize) << 3, then load the sliceI/sliceF pair from addr_lut.
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+        # While P0: convert the four F halves held in loadF and store them to shared at writeS.
+        j1c33 => "02:-:-:-:1  \@P0 $convert storeF3, loadF1.H1;\n",
+        j1c37 => "--:-:-:-:1  \@P0 $convert storeF2, loadF1.H0;\n",
+        j1c41 => "--:-:-:-:1  \@P0 $convert storeF1, loadF0.H1;\n",
+        j1c45 => "--:-:2:-:1  \@P0 $convert storeF0, loadF0.H0;\n",
+
+        j1c60 => "02:2:-:-:1  \@P0 STS.128 [writeS], storeF;\n",
+        # Next F address: offsetF = offsetFk + channel*param_KRST + sliceF, trackF = param_F + (offsetF << 1).
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j2c29 => "20:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     1;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;\n",
+
+        j2c40 => "02:-:2:-:1  \@P1 LDG.E.CI.64 loadF, [trackF];\n",
+
+        # While P0: convert the four I halves held in loadI; the STS for them is at j6c8.
+        j5c45 => "04:-:-:-:1  \@P0 $convert storeI3, loadI1.H1;\n",
+        j5c49 => "--:-:-:-:1  \@P0 $convert storeI2, loadI1.H0;\n",
+        j5c53 => "--:-:-:-:1  \@P0 $convert storeI1, loadI0.H1;\n",
+        j5c57 => "--:-:3:-:1  \@P0 $convert storeI0, loadI0.H0;\n",
+
+        j6c8  => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<szShareF>], storeI;\n",
+
+        j6c54 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     1;\n",
+        j6c59 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;\n",
+
+        j6c61 => "04:-:3:-:1  \@P1 LDG.E.64 loadI, [trackI];\n",
+        # While P0: barrier, then XOR readIs/readFs/writeS with 4*(szShareF + szShareI) to toggle between the two offsets.
+        j6c62 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<szShareF + szShareI>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<szShareF + szShareI>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<szShareF + szShareI>;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+
+<SCHEDULE_BLOCK>
+
+// tidOX = ((tid & 7) << 3) | ((tid & 128) >> 1)
+// tidOY = (tid & 127) >> 3
+--:-:-:-:1      LOP.AND tidOX,  tid,    7;
+--:-:-:-:1      SHL     tidOX,  tidOX,  3;
+--:-:-:-:1      LOP.AND tidOX2, tid,    128;
+--:-:-:-:1      SHR.U32 tidOX2, tidOX2, 1;
+--:-:-:-:1      LOP.OR  tidOX,  tidOX,  tidOX2;
+--:-:-:-:1      LOP.AND tidOY,  tid,    127;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY,  3;
+
+--:-:-:-:1      LOP.AND readIs, readIs, 0x1ff;
+--:-:-:-:1      LOP.AND readFs, readFs, 0x0ff;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*128 + tidOX;
+--:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*128 + tidOY * 4
+--:-:-:-:1      SHL tidOY, tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 7;
+
+[+ output_setup(63, 1, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
diff --git a/Kernel/Convolution/Maxwell/hconv_xprop_X128_N64.sass b/Kernel/Convolution/Maxwell/hconv_xprop_X128_N64.sass
new file mode 100644
index 0000000..ce64717
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/hconv_xprop_X128_N64.sass
@@ -0,0 +1,284 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $int16;             # flag declared here but assigned elsewhere; picks the conversion opcode below
+    our $prefix = 'h';      # NOTE(review): this and the share/step/remap values are presumably read by the included common file; confirm
+    our $shareI = 64;       # same 64 as in szShareI : (64*8) in this file's constant mapping
+    our $shareF = 128;      # same 128 as in szShareF : (128*8) in this file's constant mapping
+    our $stepI  = 32;
+    our $stepF  = 64;
+    our $remapF = 1;
+    our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';   # 16-bit integer input vs half-float input
+    sub convert {return $convert;}   # accessor used by the inline convert() templates in the load/store code
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<64*8*2 + 128*8*2 + 0>
+    szShareF  : (128*8)
+    szShareI  : (64*8)
+
+    addr_zero  : 4x<64*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-109 ~ tid1, tid15, tidFX, tidIX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-109 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+
+    100-103 : loadF<0-3>
+    100-103 : storeF<0-3>
+    104-107 : storeF<4-7>
+
+    108-109 : loadI<0-1>
+    104-107 : storeI<0-3>
+
+    104-107 ~ offsetF
+
+    110-111 : sliceI, sliceF
+    110-111 : sliceIF<0-1>
+
+    112-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetIc, offsetFc
+    125-127 ~ readFs, readIs, swapBuf
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-124  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidFX = (tid & 15) << 3
+// tidIX = (tid & 15) << 2
+// tidY = tid >> 4
+--:-:-:-:1      LOP.AND tid15, tid,  15;
+--:-:-:-:1      SHL     tidFX, tid15, 3;
+--:-:-:-:1      SHL     tidIX, tid15, 2;
+--:-:-:-:1      SHR.U32 tidY,  tid,   4;
+
+// trackF += blkF*128 + tidFX  (offset_K is not added by the ISCADD below)
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidFX, 7;
+
+// trackI += blkI*64 + tidIX
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidIX, 6;
+
+// Remap the FX dim to avoid bank conflicts when storing to shared
+
+// writeFs = (128*tidY + tidIX) * 4
+--:-:-:-:1      ISCADD  writeFs, tidY, tidIX, 7;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// writeIs = (64*tidY + tidIX) * 4 + 4*szShareF
+--:-:-:-:1      ISCADD  writeIs, tidY, tidIX, 6;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+// readFs = ((tid & -16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = (tid >> 1) & 7
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF, [trackF];
+--:-:5:-:1 @!P1 LDS.U.128    loadF, [addr_zero];
+
+--:-:3:-:1  @P1 LDG.E.64 loadI, [trackI];
+--:-:6:-:1 @!P1 LDS.U.64 loadI, [addr_zero];
+
+12:-:-:-:1      [+ convert() +] storeF7, loadF3.H1;
+--:-:-:-:1      [+ convert() +] storeF6, loadF3.H0;
+--:-:-:-:1      [+ convert() +] storeF5, loadF2.H1;
+--:-:1:-:1      [+ convert() +] storeF4, loadF2.H0;
+--:-:-:-:1      [+ convert() +] storeF3, loadF1.H1;
+--:-:-:-:1      [+ convert() +] storeF2, loadF1.H0;
+--:-:-:-:1      [+ convert() +] storeF1, loadF0.H1;
+--:-:2:-:1      [+ convert() +] storeF0, loadF0.H0;
+
+01:-:-:-:1      STS.128 [writeFs + 4x<64>], storeF4;
+02:1:-:-:2      STS.128 [writeFs + 4x<00>], storeF0;
+
+25:-:-:-:1      [+ convert() +] storeI3, loadI1.H1;
+--:-:-:-:1      [+ convert() +] storeI2, loadI1.H0;
+--:-:-:-:1      [+ convert() +] storeI1, loadI0.H1;
+--:-:3:-:2      [+ convert() +] storeI0, loadI0.H0;
+
+04:1:-:-:1      STS.128 [writeIs], storeI0;
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.128 loadF, [trackF];
+--:-:3:-:1  @P1 LDG.E.64     loadI, [trackI];
+
+[-
+    our $convert;   # opcode string chosen in this file's header block
+    our %insert =   # keys are "j<J>c<C>"; NOTE(review): presumably main_loop() from the included file splices each entry after FFMA C of step J; confirm
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+        # P1 is posCRST >= 0 and P0 is posCRST >= -8; the entries below are predicated on them.
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+        # channel = trunc(posCRSTf * lutSizeRcp * (1 + 2^-24)); the FFMA constant is 2^-24.
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+        # lutOffset = (posCRST - channel*lutSize) << 3, then load the sliceI/sliceF pair from addr_lut.
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+        # While P0: convert all eight F halves from loadF0..loadF3; they are stored by two STS.128 (storeF4, then storeF0).
+        j1c29 => "02:-:-:-:1  \@P0 $convert storeF7, loadF3.H1;\n",
+        j1c33 => "--:-:-:-:1  \@P0 $convert storeF6, loadF3.H0;\n",
+        j1c37 => "--:-:-:-:1  \@P0 $convert storeF5, loadF2.H1;\n",
+        j1c41 => "--:-:5:-:1  \@P0 $convert storeF4, loadF2.H0;\n",
+        j1c45 => "--:-:-:-:1  \@P0 $convert storeF3, loadF1.H1;\n",
+        j1c49 => "--:-:-:-:1  \@P0 $convert storeF2, loadF1.H0;\n",
+        j1c53 => "--:-:-:-:1  \@P0 $convert storeF1, loadF0.H1;\n",
+        j1c57 => "--:-:2:-:1  \@P0 $convert storeF0, loadF0.H0;\n",
+
+        j1c59 => "10:5:-:-:1  \@P0 STS.128 [writeFs + 4x<64>], storeF4;\n",
+        j2c8  => "02:2:-:-:1  \@P0 STS.128 [writeFs + 4x<00>], storeF0;\n",
+        # Next F address: offsetF = offsetFk + channel*param_KRST + sliceF, trackF = param_F + (offsetF << 1).
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j2c29 => "30:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     1;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;\n",
+
+        j2c40 => "02:-:2:-:1  \@P1 LDG.E.CI.128 loadF, [trackF];\n",
+        # While P0: convert the four I halves held in loadI; the STS for them is at j6c8.
+        j5c45 => "04:-:-:-:1  \@P0 $convert storeI3, loadI1.H1;\n",
+        j5c49 => "--:-:-:-:1  \@P0 $convert storeI2, loadI1.H0;\n",
+        j5c53 => "--:-:-:-:1  \@P0 $convert storeI1, loadI0.H1;\n",
+        j5c57 => "--:-:3:-:1  \@P0 $convert storeI0, loadI0.H0;\n",
+
+        j6c8  => "04:3:-:-:1  \@P0 STS.128 [writeIs], storeI0;\n",
+
+        j6c55 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     1;\n",
+        j6c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;\n",
+
+        j6c62 => "04:-:3:-:1  \@P1 LDG.E.64 loadI, [trackI];\n",
+        # While P0: barrier, then read pointers -= swapBuf, write pointers += swapBuf, and swapBuf is negated.
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                   "--:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+--:-:2:-:1      S2R tid,   SR_TID.X;
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = (tid & 7) << 3
+// tidOY = tid >> 3
+02:-:-:-:1      LOP.AND tidOX,  tid,   7;
+--:-:-:-:1      SHL     tidOX,  tidOX, 3;
+--:-:-:-:1      SHR.U32 tidOY,  tid,   3;
+
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readIs,  readIs, -4x<szShareF>;
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf;
+--:-:-:-:1  @P2 IADD readIs,  readIs, -swapBuf;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 64 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 4;
+
+// readCs  = 4 * (tidOX + (tidOY * 64))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*64 + tidOX;
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 6;
+
+// Mul by 4 here expands k stride back out
+// Mul by 2 again to undo the bank conflict avoiding stride
+// k = blkF*128 + tidOY * 8
+--:-:-:-:1      SHL    tidOY,   tidOY, 3;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 7;
+
+[+ output_setup(63, 0, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
diff --git a/Kernel/Convolution/Maxwell/hconv_xprop_X32_N128.sass b/Kernel/Convolution/Maxwell/hconv_xprop_X32_N128.sass
new file mode 100644
index 0000000..e85f7d4
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/hconv_xprop_X32_N128.sass
@@ -0,0 +1,323 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $int16;  # no value assigned here; selects the int16 conversion below when true
+    our $prefix  = 'h';  # NOTE(review): these package variables are presumably read by xconv_xprop_common.sass - confirm
+    our $shareI  = 128;  # same 128 as in szShareI (128*8)
+    our $shareF  = 32;  # same 32 as in szShareF (32*8)
+    our $stepI   = 32;
+    our $stepF   = 16;
+    our $remapI  = 1;
+    our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';  # opcode that converts a 16-bit source to F32
+    sub convert {return $convert;}  # lets template lines emit the opcode by calling convert()
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<32*8*2 + 128*8*2 + 0>
+    szShareF  : (32*8)
+    szShareI  : (128*8)
+    // The addr_ entries below all start at 32*8*2 + 128*8*2, i.e. 2*szShareF + 2*szShareI; addr_zero repeats the entry above.
+    addr_zero  : 4x<32*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<32*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<32*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<32*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<32*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<32*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<32*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<32*8*2 + 128*8*2 + 10>
+    // NOTE(review): params() is not defined in this file; presumably it comes from the included common file.
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // cx<0-7>y<0-7> name registers 0-63 in the permuted order given on each row (presumably the 8x8 accumulator tile).
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+    // czero<00-63> names the same registers 0-63 in ascending order.
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-69 : m, p, q
+      64-69 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne
+     70-113 ~ tid1, tidIX, tidFX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     70-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+    // j0*/j1* take 64-95, which overlaps the names mapped onto 64-113 above.
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+
+    100-103 : load0I<0-3>
+    100-103 : store0I<0-3>
+    104-107 : store0I<4-7>
+
+    108-111 : load1I<0-3>
+    108-111 : store1I<0-3>
+    104-107 : store1I<4-7>
+
+    112-113 : loadF<0-1>
+    104-107 : storeF<0-3>
+
+    114-115 : sliceI, sliceF
+    114-115 : sliceIF<0-1>
+    // 141-155 appears on no other row; the output code reads tid, idx_N, tid7, tid1_7, tid32 and tid32_1 without recomputing them.
+    116-140 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetF, offsetIc, offsetFc
+    141-155 ~ readFs, readIs, swapBuf, tid, idx_N, tid7, tid1_7, tid32, tid32_1
+    // The rows below reuse 72-140 under different names; several of them (writeCs, readCs, k, n, tidOX, tidOY) are used by the output code after the loop.
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-140  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+// The block below turns tid and the block indices into the global-load, shared-store and shared-read offsets.
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+
+[+ get_mpqk() +]
+
+// tidIX = (tid & 7) << 3
+// tidFX = (tid & 7) << 2
+// tid7  = tid & 7
+// tidY = tid >> 3
+--:-:-:-:1      LOP.AND tid7,  tid,  7;
+--:-:-:-:1      SHL     tidIX, tid7, 3;
+--:-:-:-:1      SHL     tidFX, tid7, 2;
+--:-:-:-:1      SHR.U32 tidY,  tid,  3;
+
+// trackF += blkF*32 + tidFX
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidFX, 5;
+
+// trackI += blkI*128 + tidIX
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidIX, 7;
+
+// writeFs = (32*tidY + tidFX) * 4
+--:-:-:-:1      ISCADD  writeFs, tidY, tidFX, 5;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// Remap the IX dim to avoid bank conflicts when storing to shared
+// (writeIs is built from tidFX rather than tidIX)
+// writeIs = (128*tidY + tidFX) * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD  writeIs, tidY, tidFX, 7;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+// swapBuf starts at 4x<szShareF + szShareI>; the loop code at j6c63 negates it on every buffer swap.
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+// readFs  = (((tid & 16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = ((((tid & 32) >> 1) | ((tid >> 1) & 7)) << 4) + 4x<szShareF>
+--:-:-:-:1      LOP.AND tid32,   tid,   32;
+--:-:-:-:1      SHR.U32 tid32_1, tid32, 1;
+--:-:-:-:1      BFE.U32 tid1_7,  tid,   0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, tid1_7, tid32_1;
+--:-:-:-:1      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+// F fetch: from [trackF] when P1 is set, otherwise from addr_zero in shared memory.
+--:-:2:-:1  @P1 LDG.E.CI.64 loadF, [trackF];
+--:-:5:-:1 @!P1 LDS.U.64    loadF, [addr_zero];
+// I fetch in two 128-bit pieces: from [trackI] when P1 is set, otherwise from addr_zero.
+--:-:3:-:1  @P1 LDG.E.128 load0I, [trackI + 2x<00>];
+--:-:4:-:1  @P1 LDG.E.128 load1I, [trackI + 2x<64>];
+--:-:-:-:1 @!P1 LDS.U.128 load0I, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 load1I, [addr_zero];
+// Convert the four 16-bit halves of loadF to F32 (storeF0-3), then store them with one 128-bit STS.
+12:-:-:-:1      [+ convert() +] storeF3, loadF1.H1;
+--:-:-:-:1      [+ convert() +] storeF2, loadF1.H0;
+--:-:-:-:1      [+ convert() +] storeF1, loadF0.H1;
+--:-:2:-:2      [+ convert() +] storeF0, loadF0.H0;
+
+02:1:-:-:2      STS.128 [writeFs], storeF0;
+// Convert the eight 16-bit halves of load0I and store them as two 128-bit writes.
+25:-:-:-:1      [+ convert() +] store0I7, load0I3.H1;
+--:-:-:-:1      [+ convert() +] store0I6, load0I3.H0;
+--:-:-:-:1      [+ convert() +] store0I5, load0I2.H1;
+--:-:2:-:1      [+ convert() +] store0I4, load0I2.H0;
+--:-:-:-:1      [+ convert() +] store0I3, load0I1.H1;
+--:-:-:-:1      [+ convert() +] store0I2, load0I1.H0;
+--:-:-:-:1      [+ convert() +] store0I1, load0I0.H1;
+--:-:3:-:1      [+ convert() +] store0I0, load0I0.H0;
+
+02:-:-:-:1      STS.128 [writeIs + 4x<32>], store0I4;
+04:1:-:-:2      STS.128 [writeIs + 4x<00>], store0I0;
+// Same conversion and two stores for load1I.
+09:-:-:-:1      [+ convert() +] store1I7, load1I3.H1;
+--:-:-:-:1      [+ convert() +] store1I6, load1I3.H0;
+--:-:-:-:1      [+ convert() +] store1I5, load1I2.H1;
+--:-:2:-:1      [+ convert() +] store1I4, load1I2.H0;
+--:-:-:-:1      [+ convert() +] store1I3, load1I1.H1;
+--:-:-:-:1      [+ convert() +] store1I2, load1I1.H0;
+--:-:-:-:1      [+ convert() +] store1I1, load1I0.H1;
+--:-:3:-:1      [+ convert() +] store1I0, load1I0.H0;
+
+02:-:-:-:1      STS.128 [writeIs + 4x<96>], store1I4;
+04:1:-:-:1      STS.128 [writeIs + 4x<64>], store1I0;
+
+[+ loop_setup() +]
+// Global loads issued ahead of LOOP, again predicated on P1.
+--:-:2:-:2  @P1 LDG.E.CI.64 loadF,  [trackF];
+--:-:3:-:1  @P1 LDG.E.128   load0I, [trackI + 2x<00>];
+--:5:4:-:1  @P1 LDG.E.128   load1I, [trackI + 2x<64>];
+
+[-
+    our $convert;
+    our %insert =  # keys have the form j<J>c<N>; NOTE(review): presumably insertion slots consumed by main_loop(), defined elsewhere - confirm
+    (  # P1 = (posCRST >= 0), P0 = (posCRST >= -8)
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+        # channel = trunc(posCRST * lutSizeRcp * (1 + 2**-24)); 5.9604644775390625e-08 is 2**-24.
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+        # lutOffset = (posCRST - channel * lutSize) << 3, with 16-bit multiply operands (VMAD.U16.U16).
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+        # When P0 is set: convert the four 16-bit halves of loadF and store them.
+        j1c33 => "02:-:-:-:1  \@P0 $convert storeF3, loadF1.H1;\n",
+        j1c37 => "--:-:-:-:1  \@P0 $convert storeF2, loadF1.H0;\n",
+        j1c41 => "--:-:-:-:1  \@P0 $convert storeF1, loadF0.H1;\n",
+        j1c45 => "--:-:2:-:1  \@P0 $convert storeF0, loadF0.H0;\n",
+
+        j1c60 => "02:-:-:-:1  \@P0 STS.128 [writeFs], storeF0;\n",
+        # sliceI and sliceF are fetched together as the 64-bit pair sliceIF.
+        j1c62 => "--:-:2:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+        # 16-bit XMAD products of channel with param_KRST (offsetFc) and with both halves of param_DHWN (offsetIc); posCRST steps by -8.
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+        # offsetF and offsetI each sum three terms; trackF = param_F + (offsetF << 1), formed by the LEA / LEA.HI.X pair.
+        j2c29 => "02:-:-:-:1  \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     1;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;\n",
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.64 loadF0, [trackF];\n",
+
+        # When P0 is set: convert the eight 16-bit halves of load0I and store them as two 128-bit writes.
+        j3c29 => "04:-:-:-:1  \@P0 $convert store0I7, load0I3.H1;\n",
+        j3c33 => "--:-:-:-:1  \@P0 $convert store0I6, load0I3.H0;\n",
+        j3c37 => "--:-:-:-:1  \@P0 $convert store0I5, load0I2.H1;\n",
+        j3c41 => "--:-:6:-:1  \@P0 $convert store0I4, load0I2.H0;\n",
+        j3c45 => "--:-:-:-:1  \@P0 $convert store0I3, load0I1.H1;\n",
+        j3c49 => "--:-:-:-:1  \@P0 $convert store0I2, load0I1.H0;\n",
+        j3c53 => "--:-:-:-:1  \@P0 $convert store0I1, load0I0.H1;\n",
+        j3c57 => "--:-:3:-:1  \@P0 $convert store0I0, load0I0.H0;\n",
+
+        j3c59 => "20:-:-:-:1  \@P0 STS.128 [writeIs + 4x<32>], store0I4;\n",
+        j4c8  => "04:3:-:-:1  \@P0 STS.128 [writeIs + 4x<00>], store0I0;\n",
+        # trackI = param_I + (offsetI << 1), formed the same way as trackF.
+        j4c50 => "10:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     1;\n",
+        j4c55 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;\n",
+
+        j4c61 => "04:-:3:-:1  \@P1 LDG.E.128 load0I0, [trackI + 2x<00>];\n",
+
+        # When P0 is set: the same conversion and stores for load1I.
+        j5c29 => "08:-:-:-:1  \@P0 $convert store1I7, load1I3.H1;\n",
+        j5c33 => "--:-:-:-:1  \@P0 $convert store1I6, load1I3.H0;\n",
+        j5c37 => "--:-:-:-:1  \@P0 $convert store1I5, load1I2.H1;\n",
+        j5c41 => "--:-:6:-:1  \@P0 $convert store1I4, load1I2.H0;\n",
+        j5c45 => "--:-:-:-:1  \@P0 $convert store1I3, load1I1.H1;\n",
+        j5c49 => "--:-:-:-:1  \@P0 $convert store1I2, load1I1.H0;\n",
+        j5c53 => "--:-:-:-:1  \@P0 $convert store1I1, load1I0.H1;\n",
+        j5c57 => "--:-:4:-:1  \@P0 $convert store1I0, load1I0.H0;\n",
+
+        j5c59 => "20:-:-:-:1  \@P0 STS.128 [writeIs + 4x<96>], store1I4;\n",
+        j6c8  => "08:4:-:-:1  \@P0 STS.128 [writeIs + 4x<64>], store1I0;\n",
+
+        j6c61 => "08:5:4:-:1  \@P1 LDG.E.128 load1I0, [trackI + 2x<64>];\n",
+        # When P0 is set: barrier, shift read offsets by -swapBuf and write offsets by +swapBuf, then negate swapBuf.
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                   "--:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Branch back to LOOP while P0 is set.
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+// After the loop: load mpqk from shared memory. tid, tid7, tid32, tid32_1 and tid1_7 are reused below without being recomputed.
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+
+<SCHEDULE_BLOCK>
+
+// tidOX = ((tid & 7) << 3) + ((tid & 32) << 1)
+// tidOY = (tid & 31) >> 3
+--:-:-:-:1      SHL     tid32,  tid32, 1;
+--:-:-:-:1      ISCADD  tidOX,  tid7,  tid32, 3;
+--:-:-:-:1      LOP.AND tidOY,  tid,   31;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY, 3;
+// P2 = (swapBuf > 0); readFs drops swapBuf when P2 is set.
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT;
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf;
+
+// readIs = ((((tid >> 1) & 7) << 1) + ((tid & 32) >> 1)) << 4
+--:-:-:-:1      ISCADD readIs, tid1_7, tid32_1, 1;
+--:-:-:-:1      SHL    readIs, readIs, 4;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*128 + tidOX;
+--:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*32 + tidOY * 4
+--:-:-:-:1      SHL    tidOY,   tidOY, 2;
+--:-:-:-:1      ISCADD k, idx_K, tidOY, 5;
+
+
+[+ output_setup(63, 1, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
diff --git a/Kernel/Convolution/Maxwell/hconv_xprop_X64_N128.sass b/Kernel/Convolution/Maxwell/hconv_xprop_X64_N128.sass
new file mode 100644
index 0000000..38f8183
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/hconv_xprop_X64_N128.sass
@@ -0,0 +1,293 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $int16;  # no value assigned here; selects the int16 conversion below when true
+    our $prefix = 'h';  # NOTE(review): these package variables are presumably read by xconv_xprop_common.sass - confirm
+    our $shareI = 128;  # same 128 as in szShareI (128*8)
+    our $shareF = 64;  # same 64 as in szShareF (64*8)
+    our $stepI  = 64;
+    our $stepF  = 32;
+    our $remapI = 1;
+    our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';  # opcode that converts a 16-bit source to F32
+    sub convert {return $convert;}  # lets template lines emit the opcode by calling convert()
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<64*8*2 + 128*8*2 + 0>
+    szShareF  : (64*8)
+    szShareI  : (128*8)
+    // The addr_ entries below all start at 64*8*2 + 128*8*2, i.e. 2*szShareF + 2*szShareI; addr_zero repeats the entry above.
+    addr_zero  : 4x<64*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 128*8*2 + 10>
+    // NOTE(review): params() is not defined in this file; presumably it comes from the included common file.
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // cx<0-7>y<0-7> name registers 0-63 in the permuted order given on each row (presumably the 8x8 accumulator tile).
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+    // czero<00-63> names the same registers 0-63 in ascending order.
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-111 ~ tid1, tid15, tid64, tidIX, tidFX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+    // j0*/j1* take 64-95; that includes 64-71, where idx_N and tid are mapped, and the output code re-reads both with S2R.
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+
+    100-103 : loadI<0-3>
+    100-103 : storeI<0-3>
+    104-107 : storeI<4-7>
+
+    108-109 : loadF<0-1>
+    104-107 : storeF<0-3>
+
+    110-111 : sliceI, sliceF
+    110-111 : sliceIF<0-1>
+
+    108-109 ~ offsetF
+    // 125-127 (readFs, readIs, swapBuf) appears on no other row.
+    112-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetIc, offsetFc
+    125-127 ~ readFs, readIs, swapBuf
+    // The rows below reuse 72-124 under different names; several of them (writeCs, readCs, k, n, tidOX, tidOY, tidOX2) are used by the output code after the loop.
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-124  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+// The block below turns tid and the block indices into the global-load, shared-store and shared-read offsets.
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidIX = (tid & 15) << 3
+// tidFX = (tid & 15) << 2
+// tidY = tid >> 4
+--:-:-:-:1      LOP.AND tid15, tid,  15;
+--:-:-:-:1      SHL     tidIX, tid15, 3;
+--:-:-:-:1      SHL     tidFX, tid15, 2;
+--:-:-:-:1      SHR.U32 tidY,  tid,   4;
+
+// trackF += blkF*64 + tidFX
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidFX, 6;
+
+// trackI += blkI*128 + tidIX
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidIX, 7;
+
+// writeFs = (64*tidY + tidFX) * 4
+--:-:-:-:1      ISCADD  writeFs, tidY, tidFX, 6;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// Remap the IX dim to avoid bank conflicts when storing to shared
+// (writeIs is built from tidFX rather than tidIX)
+// writeIs = (128*tidY + tidFX) * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD  writeIs, tidY, tidFX, 7;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+// swapBuf starts at 4x<szShareF + szShareI>; the loop code at j6c63 negates it on every buffer swap.
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+// readFs = (((tid & 48) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    48;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = ((((tid & 64) >> 3) | ((tid >> 1) & 7)) << 4) + 4x<szShareF>
+--:-:-:-:1      LOP.AND tid64,  tid,    64;
+--:-:-:-:1      SHR.U32 tid64,  tid64,  3;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid64;
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+// F fetch: from [trackF] when P1 is set, otherwise from addr_zero in shared memory.
+--:-:2:-:1  @P1 LDG.E.CI.64 loadF, [trackF];
+--:-:5:-:1 @!P1 LDS.U.64    loadF, [addr_zero];
+// I fetch (one 128-bit load): from [trackI] when P1 is set, otherwise from addr_zero.
+--:-:3:-:1  @P1 LDG.E.128 loadI, [trackI];
+--:-:6:-:1 @!P1 LDS.U.128 loadI, [addr_zero];
+// Convert the four 16-bit halves of loadF to F32 (storeF0-3), then store them with one 128-bit STS.
+12:-:-:-:1      [+ convert() +] storeF3, loadF1.H1;
+--:-:-:-:1      [+ convert() +] storeF2, loadF1.H0;
+--:-:-:-:1      [+ convert() +] storeF1, loadF0.H1;
+--:-:2:-:2      [+ convert() +] storeF0, loadF0.H0;
+
+02:1:-:-:2      STS.128 [writeFs], storeF0;
+// Convert the eight 16-bit halves of loadI and store them as two 128-bit writes.
+25:-:-:-:1      [+ convert() +] storeI7, loadI3.H1;
+--:-:-:-:1      [+ convert() +] storeI6, loadI3.H0;
+--:-:-:-:1      [+ convert() +] storeI5, loadI2.H1;
+--:-:2:-:1      [+ convert() +] storeI4, loadI2.H0;
+--:-:-:-:1      [+ convert() +] storeI3, loadI1.H1;
+--:-:-:-:1      [+ convert() +] storeI2, loadI1.H0;
+--:-:-:-:1      [+ convert() +] storeI1, loadI0.H1;
+--:-:3:-:1      [+ convert() +] storeI0, loadI0.H0;
+
+02:-:-:-:1      STS.128 [writeIs + 4x<64>], storeI4;
+04:1:-:-:1      STS.128 [writeIs + 4x<00>], storeI0;
+
+[+ loop_setup() +]
+// Global loads issued ahead of LOOP, again predicated on P1.
+--:-:2:-:2  @P1 LDG.E.CI.64 loadF, [trackF];
+--:-:3:-:1  @P1 LDG.E.128   loadI, [trackI];
+
+[-
+    our $convert;
+    our %insert =  # keys have the form j<J>c<N>; NOTE(review): presumably insertion slots consumed by main_loop(), defined elsewhere - confirm
+    (  # P1 = (posCRST >= 0), P0 = (posCRST >= -8)
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+        # channel = trunc(posCRST * lutSizeRcp * (1 + 2**-24)); 5.9604644775390625e-08 is 2**-24.
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+        # lutOffset = (posCRST - channel * lutSize) << 3, with 16-bit multiply operands (VMAD.U16.U16).
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+        # sliceI and sliceF are fetched together as the 64-bit pair sliceIF.
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+        # When P0 is set: convert the four 16-bit halves of loadF and store them.
+        j1c33 => "02:-:-:-:1  \@P0 $convert storeF3, loadF1.H1;\n",
+        j1c37 => "--:-:-:-:1  \@P0 $convert storeF2, loadF1.H0;\n",
+        j1c41 => "--:-:-:-:1  \@P0 $convert storeF1, loadF0.H1;\n",
+        j1c45 => "--:-:2:-:1  \@P0 $convert storeF0, loadF0.H0;\n",
+
+        j1c60 => "02:2:-:-:1  \@P0 STS.128 [writeFs], storeF0;\n",
+        # 16-bit XMAD products of channel with param_KRST (offsetFc) and with both halves of param_DHWN (offsetIc); posCRST steps by -8.
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+        # offsetF and offsetI each sum three terms; trackF = param_F + (offsetF << 1), formed by the LEA / LEA.HI.X pair.
+        j2c29 => "22:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     1;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;\n",
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.64 loadF, [trackF];\n",
+
+        # When P0 is set: convert the eight 16-bit halves of loadI and store them as two 128-bit writes.
+        j5c29 => "04:-:-:-:1  \@P0 $convert storeI7, loadI3.H1;\n",
+        j5c33 => "--:-:-:-:1  \@P0 $convert storeI6, loadI3.H0;\n",
+        j5c37 => "--:-:-:-:1  \@P0 $convert storeI5, loadI2.H1;\n",
+        j5c41 => "--:-:6:-:1  \@P0 $convert storeI4, loadI2.H0;\n",
+        j5c45 => "--:-:-:-:1  \@P0 $convert storeI3, loadI1.H1;\n",
+        j5c49 => "--:-:-:-:1  \@P0 $convert storeI2, loadI1.H0;\n",
+        j5c53 => "--:-:-:-:1  \@P0 $convert storeI1, loadI0.H1;\n",
+        j5c57 => "--:-:3:-:1  \@P0 $convert storeI0, loadI0.H0;\n",
+
+        j5c59 => "20:-:-:-:1  \@P0 STS.128 [writeIs + 4x<64>], storeI4;\n",
+        j6c8  => "04:3:-:-:1  \@P0 STS.128 [writeIs + 4x<00>], storeI0;\n",
+        # trackI = param_I + (offsetI << 1), formed the same way as trackF.
+        j6c55 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     1;\n",
+        j6c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;\n",
+
+        j6c62 => "04:-:3:-:1  \@P1 LDG.E.128 loadI, [trackI];\n",
+        # When P0 is set: barrier, shift read offsets by -swapBuf and write offsets by +swapBuf, then negate swapBuf.
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                   "--:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Branch back to LOOP while P0 is set.
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+// After the loop: load mpqk and re-read tid and idx_N (the register map places both in 64-71, a range also given to j0Ix).
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+--:-:2:-:1      S2R tid,   SR_TID.X;
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = ((tid & 7) << 3) | (tid & 64)
+// tidOY = (tid & 63) >> 3
+02:-:-:-:1      LOP.AND tidOX,  tid,    7;
+--:-:-:-:1      SHL     tidOX,  tidOX,  3;
+--:-:-:-:1      LOP.AND tidOX2, tid,    64;
+--:-:-:-:1      LOP.OR  tidOX,  tidOX,  tidOX2;
+--:-:-:-:1      LOP.AND tidOY,  tid,    63;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY,  3;
+// P2 = (swapBuf > 0). readIs drops the 4x<szShareF> added during setup; readFs and readIs also drop swapBuf when P2 is set.
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readIs,  readIs, -4x<szShareF>;
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf;
+--:-:-:-:1  @P2 IADD readIs,  readIs, -swapBuf;
+
+// Expand back out to undo our bank conflict avoiding stride
+--:-:-:-:1      SHL readIs, readIs, 1;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*128 + tidOX;
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*64 + tidOY * 4
+--:-:-:-:1      SHL    tidOY,   tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 6;
+
+[+ output_setup(63, 1, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
diff --git a/Kernel/Convolution/Maxwell/hconv_xprop_X64_N64.sass b/Kernel/Convolution/Maxwell/hconv_xprop_X64_N64.sass
new file mode 100644
index 0000000..16b92c5
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/hconv_xprop_X64_N64.sass
@@ -0,0 +1,290 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $int16;  # no value assigned here; selects the int16 conversion below when true
+    our $prefix = 'h';  # NOTE(review): these package variables are presumably read by xconv_xprop_common.sass - confirm
+    our $shareI = 64;  # same 64 as in szShareI (64*8)
+    our $shareF = 64;  # same 64 as in szShareF (64*8)
+    our $stepI  = 32;
+    our $stepF  = 32;
+    our $remapF = 1;
+    our $remapI = 1;
+    our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';  # opcode that converts a 16-bit source to F32
+    sub convert {return $convert;}  # lets template lines emit the opcode by calling convert()
+
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<64*8*2 + 64*8*2 + 0>
+    szShareF  : (64*8)
+    szShareI  : (64*8)
+    // The addr_ entries below all start at 64*8*2 + 64*8*2, i.e. 2*szShareF + 2*szShareI; addr_zero repeats the entry above.
+    addr_zero  : 4x<64*8*2 + 64*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 64*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 64*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 64*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 64*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 64*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 64*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 64*8*2 + 10>
+    // NOTE(review): params() is not defined in this file; presumably it comes from the included common file.
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // cx<0-7>y<0-7> name registers 0-63 in the permuted order given on each row (presumably the 8x8 accumulator tile).
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+    // czero<00-63> names the same registers 0-63 in ascending order.
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-111 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+    // j0*/j1* take 64-95; that includes 64-71, where idx_N and tid are mapped, and the output code re-reads both with S2R.
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-103 : loadI<0-3>
+    100-103 : storeI<0-3>
+    104-107 : storeI<4-7>
+
+    108-111 : loadF<0-3>
+    108-111 : storeF<0-3>
+    104-107 : storeF<4-7>
+
+    104-107 ~ offsetF
+
+    112-113 : sliceI, sliceF
+    112-113 : sliceIF<0-1>
+    // 126-127 (readFs, readIs) appears on no other row; this kernel maps no swapBuf register.
+    114-125 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetIc, offsetFc
+    126-127 ~ readFs, readIs
+    // The rows below reuse 72-125 under different names; several of them (writeCs, readCs, k, n, tidOX, tidOY) are used by the output code after the loop.
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-125  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+// The block below turns tid and the block indices into the global-load, shared-store and shared-read offsets.
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+
+[+ get_mpqk() +]
+
+// tidX = (tid & 7) << 3
+// tidY = tid >> 3
+--:-:-:-:1      LOP.AND tidX, tid,  7;
+--:-:-:-:1      SHL     tidX, tidX, 3;
+--:-:-:-:1      SHR.U32 tidY, tid,  3;
+
+// trackF += blkF*64 + tidX
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 6;
+
+// trackI += blkI*64 + tidX
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 6;
+
+// Remap the X dim to avoid bank conflicts when storing to shared
+// We can unmap this in the output
+--:-:-:-:1      SHR.U32 tidX, tidX, 1;
+
+// writeS = (64*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 6;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+// The -16 mask keeps every bit of tid from bit 4 upward.
+// readFs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:0      SHL     readFs, readFs, 4;
+
+// readIs = (((tid >> 1) & 7) << 4) + 4x<8*64>;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+// F fetch (128-bit): from [trackF] when P1 is set, otherwise from addr_zero in shared memory.
+--:-:1:-:1  @P1 LDG.E.CI.128 loadF0, [trackF];
+--:-:5:-:1 @!P1 LDS.U.128    loadF0, [addr_zero];
+// I fetch (128-bit): from [trackI] when P1 is set, otherwise from addr_zero.
+--:-:2:-:1  @P1 LDG.E.128 loadI0, [trackI];
+--:-:6:-:1 @!P1 LDS.U.128 loadI0, [addr_zero];
+// Convert the eight 16-bit halves of loadF to F32 (storeF0-7), then store them as two 128-bit writes.
+11:-:-:-:1      [+ convert() +] storeF7, loadF3.H1;
+--:-:-:-:1      [+ convert() +] storeF6, loadF3.H0;
+--:-:-:-:1      [+ convert() +] storeF5, loadF2.H1;
+--:-:1:-:1      [+ convert() +] storeF4, loadF2.H0;
+--:-:-:-:1      [+ convert() +] storeF3, loadF1.H1;
+--:-:-:-:1      [+ convert() +] storeF2, loadF1.H0;
+--:-:-:-:1      [+ convert() +] storeF1, loadF0.H1;
+--:-:5:-:1      [+ convert() +] storeF0, loadF0.H0;
+
+01:1:-:-:1      STS.128 [writeS + 4x<0*64 + 32>], storeF4;
+10:-:-:-:1      STS.128 [writeS + 4x<0*64 +  0>], storeF0;
+// Same for loadI; its stores go through the same writeS base at an extra 4x<8*64> offset.
+23:-:-:-:1      [+ convert() +] storeI7, loadI3.H1;
+--:-:-:-:1      [+ convert() +] storeI6, loadI3.H0;
+--:-:-:-:1      [+ convert() +] storeI5, loadI2.H1;
+--:-:1:-:1      [+ convert() +] storeI4, loadI2.H0;
+--:-:-:-:1      [+ convert() +] storeI3, loadI1.H1;
+--:-:-:-:1      [+ convert() +] storeI2, loadI1.H0;
+--:-:-:-:1      [+ convert() +] storeI1, loadI0.H1;
+--:-:5:-:1      [+ convert() +] storeI0, loadI0.H0;
+
+01:-:-:-:1      STS.128 [writeS + 4x<8*64 + 32>], storeI4;
+10:1:-:-:1      STS.128 [writeS + 4x<8*64 +  0>], storeI0;
+
+[+ loop_setup() +]
+// Global loads issued ahead of LOOP, again predicated on P1.
+--:-:2:-:2  @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>];
+--:-:3:-:1  @P1 LDG.E.128    loadI0, [trackI + 4x< 0>];
+
+[-
+    our $convert;
+    our %insert =  # keys have the form j<J>c<N>; NOTE(review): presumably insertion slots consumed by main_loop(), defined elsewhere - confirm
+    (  # P1 = (posCRST >= 0), P0 = (posCRST >= -8)
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+        # channel = trunc(posCRST * lutSizeRcp * (1 + 2**-24)); 5.9604644775390625e-08 is 2**-24.
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+        # lutOffset = (posCRST - channel * lutSize) << 3, with 16-bit multiply operands (VMAD.U16.U16).
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+        # sliceI and sliceF are fetched together as the 64-bit pair sliceIF.
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+        # 16-bit XMAD products of channel with param_KRST (offsetFc) and with both halves of param_DHWN (offsetIc); posCRST steps by -8.
+        j1c20 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j1c25 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j1c31 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j1c32 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+        # When P0 is set: convert the eight 16-bit halves of loadF and store them as two 128-bit writes.
+        j1c18 => "02:-:-:-:1  \@P0 $convert storeF7, loadF3.H1;\n",
+        j1c22 => "--:-:-:-:1  \@P0 $convert storeF6, loadF3.H0;\n",
+        j1c26 => "--:-:-:-:1  \@P0 $convert storeF5, loadF2.H1;\n",
+        j1c30 => "--:-:5:-:1  \@P0 $convert storeF4, loadF2.H0;\n",
+        j1c33 => "--:-:-:-:1  \@P0 $convert storeF3, loadF1.H1;\n",
+        j1c37 => "--:-:-:-:1  \@P0 $convert storeF2, loadF1.H0;\n",
+        j1c41 => "--:-:-:-:1  \@P0 $convert storeF1, loadF0.H1;\n",
+        j1c45 => "--:-:2:-:1  \@P0 $convert storeF0, loadF0.H0;\n",
+
+        j1c47 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<0*64 + 32>], storeF4;\n",
+        j1c62 => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*64 +  0>], storeF0;\n",
+        # offsetF and offsetI each sum three terms; trackF = param_F + (offsetF << 1), formed by the LEA / LEA.HI.X pair.
+        j2c19 => "30:-:-:-:1  \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c24 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     1;\n",
+        j2c26 => "--:-:-:-:1  \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c28 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;\n",
+
+        j2c30 => "02:-:2:-:1  \@P1 LDG.E.CI.128 loadF0, [trackF];\n",
+        # When P0 is set: the same conversion and stores for loadI.
+        j5c29 => "04:-:-:-:1  \@P0 $convert storeI7, loadI3.H1;\n",
+        j5c33 => "--:-:-:-:1  \@P0 $convert storeI6, loadI3.H0;\n",
+        j5c37 => "--:-:-:-:1  \@P0 $convert storeI5, loadI2.H1;\n",
+        j5c41 => "--:-:5:-:1  \@P0 $convert storeI4, loadI2.H0;\n",
+        j5c45 => "--:-:-:-:1  \@P0 $convert storeI3, loadI1.H1;\n",
+        j5c49 => "--:-:-:-:1  \@P0 $convert storeI2, loadI1.H0;\n",
+        j5c53 => "--:-:-:-:1  \@P0 $convert storeI1, loadI0.H1;\n",
+        j5c57 => "--:-:3:-:1  \@P0 $convert storeI0, loadI0.H0;\n",
+
+        j5c59 => "10:-:-:-:1  \@P0 STS.128 [writeS + 4x<8*64 + 32>], storeI4;\n",
+        j6c8  => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<8*64 +  0>], storeI0;\n",
+        # trackI = param_I + (offsetI << 1), formed the same way as trackF.
+        j6c50 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     1;\n",
+        j6c55 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;\n",
+
+        j6c61 => "04:-:3:-:1  \@P1 LDG.E.128 loadI0, [trackI];\n",
+        # When P0 is set: barrier, then XOR the three shared offsets with 4x<64*8*2> (this kernel has no swapBuf register).
+        j6c62 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n",
+        # Branch back to LOOP while P0 is set.
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+// After the loop: load mpqk and re-read tid and idx_N (the register map places both in 64-71, a range also given to j0Ix).
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+--:-:2:-:1      S2R tid,   SR_TID.X;
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = (tid & 7) << 3
+// tidOY = tid >> 3
+02:-:-:-:1      LOP.AND tidOX, tid,   7;
+--:-:-:-:1      SHL     tidOX, tidOX, 3;
+--:-:-:-:1      SHR.U32 tidOY, tid,   3;
+// 0x7ff clears bit 11 (the 4x<szShareF> = 2048 added to readIs during setup) and bit 12 (the 4096 toggled by the loop's XOR swap).
+--:-:-:-:1      LOP.AND readIs, readIs, 0x7ff;
+--:-:-:-:1      LOP.AND readFs, readFs, 0x7ff;
+
+// Expand back out to undo our bank conflict avoiding stride
+--:-:-:-:1      SHL readIs, readIs, 1;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 64 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 4;
+
+// readCs  = 4 * (tidOX + (tidOY * 64))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*64 + tidOX;
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 6;
+
+// Mul by 4 here expands k stride back out
+// Mul by 2 again to undo the bank conflict avoiding stride
+// k = blkF*64 + tidOY * 8
+--:-:-:-:1      SHL    tidOY,   tidOY, 3;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 6;
+
+[+ output_setup(63, 0, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
diff --git a/Kernel/Convolution/Maxwell/persistent_rnn_bprop.sass b/Kernel/Convolution/Maxwell/persistent_rnn_bprop.sass
new file mode 100644
index 0000000..ddddb22
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/persistent_rnn_bprop.sass
@@ -0,0 +1,638 @@
+# Kernel: persistent_birnn
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(64*48)>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_d[0]        : c[0x0][0x140]
+    param_d[1]        : c[0x0][0x144]
+    param_dnext[0]    : c[0x0][0x148]
+    param_dnext[1]    : c[0x0][0x14c]
+    param_h[0]        : c[0x0][0x150]
+    param_h[1]        : c[0x0][0x154]
+    param_w[0]        : c[0x0][0x158]
+    param_w[1]        : c[0x0][0x15c]
+    param_lockAddr[0] : c[0x0][0x160]
+    param_lockAddr[1] : c[0x0][0x164]
+    param_ldd         : c[0x0][0x168]
+    param_ldh         : c[0x0][0x16c]
+    param_ldw         : c[0x0][0x170]
+    param_bsz         : c[0x0][0x174]
+    param_seqLength   : c[0x0][0x178]
+    param_numBlks     : c[0x0][0x17c]
+    param_rowSize     : c[0x0][0x180]
+    param_reverse     : c[0x0][0x184]
+    param_reluclip    : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+      0-215 : weight<000-215>
+    216-227 : accum<00-11>
+    228-231 : timeStep, warpTid, rowOffset, tid
+
+    232-235 : wAddr<0-1>, biasAddr<0-1>
+    236-254 ~ bid, ldw, wRow, loadRow, tidLsbs, tidMsbs, warpIndex, storeWeights, loadWeights, outRow, rowSize
+
+    232-249 : loadBuffer<0-3>, delta0r<0-3>, delta1r<0-3>, delta2r<0-3>, dnextAddr<0-1>
+    250-254 ~ loadDeltas, storeDeltas, loadIndex, dOffset, ldd
+
+    236-247 : peerR0V<0-3>, peerR1V<0-3>, peerR2V<0-3>
+    244     : hOffset
+    248-253 : h<0-3>, hAddr<0-1>
+
+    232-241 : output<0-3>, dAddr<0-1>, lockAddr<0-1>, expectVal, setVal
+    241-245 ~ storeIndex, hRow, predSave, lockVal, reluclip
+
+</REGISTER_MAPPING>
+
+//Get tid/block id
+--:-:1:-:1      S2R tid, SR_TID.X;
+--:-:2:-:1      S2R bid, SR_CTAID.X;
+
+//Store zeros at addr_zero
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV     ldw,       param_ldw;
+--:-:-:-:1      MOV     rowSize,   param_rowSize;
+
+//timeStep = (param_reverse == 0) ? 0 : (param_seqLength - 1)
+--:-:-:-:1      ISETP.EQ.AND P2, PT, RZ, param_reverse, PT;
+--:-:-:-:1      SEL timeStep, RZ, param_seqLength, P2;
+--:-:-:-:1 @!P2 IADD timeStep, timeStep, -1;
+
+//warpIndex = threadIdx.x >> 5
+01:-:-:-:1      SHR.U32 warpIndex, tid, 5;
+
+//warpTid = threadIdx.x & 0x1f
+01:-:-:-:1      LOP.AND warpTid,   tid, 0x1f;
+
+//rowOffset = ((blockIdx.x << 3) + warp_index) * 6
+02:-:-:-:1      SHL     rowOffset, bid,       3;
+--:-:-:-:1      IADD    rowOffset, rowOffset, warpIndex;
+--:-:-:-:1      XMAD    rowOffset, rowOffset, 6, RZ;
+
+//if(warp_tid > 15) rowOffset += 3   (P1 = warp_tid > 15 is reused by the loadWeights computation below)
+--:-:-:-:1      ISETP.GT.AND P1, PT, warpTid, 15, PT;
+--:-:-:-:1  @P1 IADD     rowOffset, rowOffset, 3;
+
+//warpTid = warpTid & 0x0f; P0 = (warpTid < 3) && (rowOffset + warpTid < param_rowSize)
+--:-:-:-:1      LOP.AND  warpTid, warpTid, 0x0f;
+--:-:-:-:1      ISETP.LT.AND P0, PT, warpTid, 3, PT;
+--:-:-:-:1      IADD     outRow, rowOffset, warpTid;
+--:-:-:-:1      ISETP.LT.AND P0, PT, outRow, param_rowSize, P0;
+
+//storeWeights = (((tid >> 2) * 48) + ((tid & 3) << 2)) << 2   (tid & 3 is taken from the masked warpTid)
+//wRow = ((tid >> 2) * ldw) + ((tid & 3) << 2) + (bid * 48)
+--:-:-:-:1      LOP.AND tidLsbs, warpTid, 0x03;
+--:-:-:-:1      SHR     tidMsbs, tid, 2;
+--:-:-:-:1      SHL     tidLsbs, tidLsbs, 2;
+
+--:-:-:-:1      XMAD    loadRow, bid, 48, tidLsbs;
+--:-:-:-:1      XMAD    wRow, tidMsbs, ldw, loadRow;
+
+--:-:-:-:1      XMAD    storeWeights, tidMsbs, 48, tidLsbs;
+--:-:-:-:1      SHL     storeWeights, storeWeights, 2;
+
+//loadWeights = ((((warpTid * 8) + warpIndex) * 6) + (P1 ? 3 : 0)) << 2
+--:-:-:-:1      XMAD    loadWeights, warpTid, 8, warpIndex;
+--:-:-:-:1      XMAD    loadWeights, loadWeights, 6, RZ;
+--:-:-:-:1  @P1 IADD    loadWeights, loadWeights, 3;
+--:-:-:-:1      SHL     loadWeights, loadWeights, 2;
+
+//wAddr = &w[wRow]
+--:-:-:-:1      LEA      wAddr0.CC, wRow, param_w[0],     2;
+--:-:-:-:1      LEA.HI.X wAddr1,    wRow, param_w[1], RZ, 2;
+
+//ldw = ldw << 8   (byte step added to wAddr per weight-load iteration: 64 rows of 4-byte elements)
+--:-:-:-:1      SHL      ldw,  ldw,       8;
+
+//Compute row loading predicates: P1 = tidMsbs < rowSize; P3/P4/P5 = P1 && loadRow < rowSize, rowSize-16, rowSize-32
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidMsbs, rowSize, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, loadRow, rowSize, P1;
+--:-:-:-:1      IADD     rowSize, rowSize, -16;
+--:-:-:-:1      ISETP.LT.AND P4, PT, loadRow, rowSize, P1;
+--:-:-:-:1      IADD     rowSize, rowSize, -16;
+--:-:-:-:1      ISETP.LT.AND P5, PT, loadRow, rowSize, P1;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:c NOP;
+
+//Load weights to registers
+<CODE>
+    my $out;
+    my $regId = 0;
+    my $rowsize = 1152;  # hard-coded loop extent: 18 iterations of 64
+    # Each iteration stages one 64-wide step through shared memory and reads it back into weight registers.
+    for (my $col=0; $col < $rowsize; $col += 64)
+    {
+        $out .= "--:-:-:-:1      IADD tidMsbs, tidMsbs, 64;\n";
+
+        #Use vector loads from weight matrix; when the predicate is false, zeros are read from addr_zero instead
+        $regId = $col / 16;
+        $out .= sprintf "--:-:1:-:1  \@P3 LDG.E.128 weight%03d, [wAddr];\n", $regId;
+        $out .= sprintf "--:-:1:-:1 \@!P3 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+        $regId = $col / 16 + 72;
+        $out .= sprintf "--:-:2:-:1  \@P4 LDG.E.128 weight%03d, [wAddr + 4x<16>];\n", $regId;
+        $out .= sprintf "--:-:2:-:1 \@!P4 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+        $regId = $col / 16 + 144;
+        $out .= sprintf "--:-:3:-:1  \@P5 LDG.E.128 weight%03d, [wAddr + 4x<32>];\n", $regId;
+        $out .= sprintf "--:-:3:-:1 \@!P5 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+        # Fold the bounds check for the already-advanced tidMsbs into P3/P4/P5 for the next iteration.
+        $out .= "--:-:-:-:1      ISETP.LT.AND P3, PT, tidMsbs, param_rowSize, P3;\n";
+        $out .= "--:-:-:-:1      ISETP.LT.AND P4, PT, tidMsbs, param_rowSize, P4;\n";
+        $out .= "--:-:-:-:1      ISETP.LT.AND P5, PT, tidMsbs, param_rowSize, P5;\n";
+
+        #Store weights into shared memory (after the first iteration, sync first so earlier reads are finished)
+        if ($col > 0)
+        {
+            $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+        }
+
+        $regId = $col / 16;
+        $out .= sprintf "01:-:-:-:1      STS.U.128 [storeWeights], weight%03d;\n", $regId;
+        $regId = $col / 16 + 72;
+        $out .= sprintf "02:-:-:-:1      STS.U.128 [storeWeights + 4x<16>], weight%03d;\n", $regId;
+        $regId = $col / 16 + 144;
+        $out .= sprintf "04:-:-:-:1      STS.U.128 [storeWeights + 4x<32>], weight%03d;\n", $regId;
+        # Advance the 64-bit global weight pointer by ldw (carry propagated through IADD.X).
+        $out .= "--:-:-:-:6      IADD   wAddr0.CC, wAddr0, ldw;\n";
+        $out .= "--:-:-:-:1      IADD.X wAddr1,    wAddr1, RZ;\n\n";
+
+        #Load each weight from shared mem: 3 rows x 4 scalars per iteration, overwriting the staging registers
+        $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+
+        foreach my $row (0 .. 2)
+        {
+            foreach my $shared_col (0 .. 3)
+            {
+                my $control;
+                # Only the very last LDS of the whole sequence gets the special control code (sets barriers 1 and 6).
+                if (($col + 64) >= $rowsize && $row == 2 && $shared_col == 3)
+                {
+                    $control = "--:1:6:-:2";
+                }
+                else
+                {
+                    $control = "--:-:-:-:1";
+                }
+
+                $regId = ($row * 72) + ($col / 16) + $shared_col;
+                my $shared_offset = $row + ($shared_col * 16 * 48);
+                $out .= sprintf "%s      LDS.U weight%03d, [loadWeights + 4x<%d>];\n", $control, $regId, $shared_offset;
+            }
+        }
+    }
+
+    $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+
+    return $out;
+
+</CODE>
+
+//Predicates for store code
+--:-:-:-:1      ISETP.EQ.AND P2, PT, warpTid, 0, PT;
+--:-:-:-:1      ISETP.EQ.AND P3, PT, warpTid, 1, PT;
+--:-:-:-:1      ISETP.EQ.AND P4, PT, warpTid, 2, PT;
+
+UNROLLING_LOOP:
+<SCHEDULE_BLOCK>
+//Prime inner product loop by loading first rows of dnext
+--:-:-:-:1      MOV loadIndex,    tid;
+
+//storeDeltas = tid << 4, loadDeltas = warpTid << 4   (16-byte slots in shared memory)
+--:-:-:-:1      SHL storeDeltas, tid, 4;
+--:-:-:-:1      SHL loadDeltas, warpTid, 4;
+
+//dnextAddr = &d_next[loadIndex * ldd + timeStep]   (XMAD computes loadIndex * param_ldd + timeStep; 16-byte elements)
+--:-:-:-:1      XMAD     dOffset,        loadIndex, param_ldd,      timeStep;
+--:-:-:-:1      LEA      dnextAddr0.CC,  dOffset,   param_dnext[0],     4;
+01:-:-:-:2      LEA.HI.X dnextAddr1,     dOffset,   param_dnext[1], RZ, 4;
+
+//loadBuffer = (loadIndex < param_rowSize) ? *dnextAddr : zeros from shared memory (no cache modifier on the LDS)
+--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;
+--:5:1:-:2  @P1 LDG.E.CI.128 loadBuffer, [dnextAddr];
+--:5:1:-:2 @!P1 LDS.U.128    loadBuffer, [addr_zero];
+
+//ldd = param_ldd << 12
+--:-:-:-:1      MOV ldd, param_ldd;
+--:-:-:-:1      SHL ldd, ldd, 12;
+</SCHEDULE_BLOCK>
+
+//Zero accum00..accum11 with three 128-bit shared loads from the zero-filled addr_zero slot
+<CODE>
+    return join '', map { sprintf "--:-:-:-:1      LDS.U.128 accum%02d, [addr_zero];\n", $_ } (0, 4, 8);
+</CODE>
+
+//Update load index and load address
+--:-:-:-:6      IADD loadIndex, loadIndex, 256;
+--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;
+10:-:-:-:6      IADD   dnextAddr0.CC, dnextAddr0, ldd;
+--:-:-:-:6      IADD.X dnextAddr1,    dnextAddr1, RZ;
+
+01:-:-:-:1      STS.U.128 [storeDeltas], loadBuffer;
+
+//Unrolled GEMM loop
+<CODE>
+    our @top;
+    # Prepend any lines queued in the package variable @top (this kernel file never assigns it).
+    my $out = join '', @top;
+
+    my $rowsize = 1152;       # same constant as the weight-load loop
+    my $weight_index = 0;     # selects weight register row*72 + weight_index; stops at 72
+    # Rotating state: barrier numbers cycle through 2,3,4 and delta register sets through 0,1,2.
+    my $wait_flag = 2;
+    my $set_flag = 4;
+    my $read_buffer = 0;
+    my $write_buffer = 2;
+    # One outer iteration per 256-wide chunk; each chunk is consumed as 16 shared rows.
+    for (my $k=0; $k < $rowsize; $k+=256)
+    {
+        if ($k == 0)  # prologue: start the load of the following chunk, sync, read the first two delta vectors
+        {
+            $out .= "--:6:1:-:1  \@P1 LDG.E.CI.128 loadBuffer, [dnextAddr];\n";
+            $out .= "--:-:1:-:1 \@!P1 LDS.U.128    loadBuffer, [addr_zero];\n\n";
+            $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+            $out .= "--:-:2:-:1      LDS.U.128 delta0r, [loadDeltas];\n";
+            $out .= "--:-:3:-:1      LDS.U.128 delta1r, [loadDeltas + 4x<4*16>];\n\n";
+        }
+        $out .= "--:-:-:-:1      LOP.XOR storeDeltas, storeDeltas, 4096;\n";  # toggle the shared store address by 4096 bytes
+
+        foreach my $shared_row (0 .. 15)
+        {
+            if($weight_index < 72)
+            {
+                if ($shared_row < 14 && ($k + (16 * ($shared_row + 2))) < $rowsize)  # prefetch the delta vector two rows ahead
+                {
+                    my $read_bar = "-";
+                    if ($shared_row == 13 && ($k + 256) < $rowsize)
+                    {
+                        $read_bar = "5";
+                    }
+                    $out .= sprintf "--:%s:%d:-:1      LDS.U.128 delta%dr, [loadDeltas + 4x<4*%d>];\n", $read_bar, $set_flag, $write_buffer, (16 * ($shared_row + 2));
+                }
+                # Rows 11-12: advance loadIndex/dnextAddr and recompute P1 for the chunk starting at k+512.
+                if ($shared_row == 11 && ($k + 512) < $rowsize)
+                {
+                    $out .= "--:-:-:-:1      IADD loadIndex, loadIndex, 256;\n";
+                    $out .= "20:-:-:-:1      IADD dnextAddr0.CC, dnextAddr0, ldd;\n";
+                }
+
+                if ($shared_row == 12 && ($k + 512) < $rowsize)
+                {
+                    $out .= "--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;\n";
+                    $out .= "--:-:-:-:1      IADD.X dnextAddr1,    dnextAddr1, RZ;\n";
+                }
+                # Row 13: store the buffered chunk to shared memory, then start the next global load, or load from param_d under P0 once no chunk remains.
+                if ($shared_row == 13)
+                {
+                    $out .= "01:-:-:-:1      STS.U.128 [storeDeltas], loadBuffer;\n";
+
+                    if(($k + 512) < $rowsize)
+                    {
+                        $out .= "--:6:1:-:1  \@P1 LDG.E.CI.128 loadBuffer, [dnextAddr];\n";
+                        $out .= "--:-:1:-:1 \@!P1 LDS.U.128    loadBuffer, [addr_zero];\n\n";
+                    }
+                    else
+                    {
+                        $out .= "--:-:-:-:6      IADD     dOffset,        rowOffset, warpTid;\n";
+                        $out .= "--:-:-:-:6      XMAD     dOffset,        dOffset,   param_ldd,  timeStep;\n";
+                        $out .= "--:-:-:-:6      LEA      dnextAddr0.CC,  dOffset,   param_d[0],      4;\n";
+                        $out .= "--:-:-:-:2      LEA.HI.X dnextAddr1,     dOffset,   param_d[1], RZ, 4;\n";
+                        $out .= "--:-:6:-:1 \@P0 LDG.E.CI.128 loadBuffer, [dnextAddr];\n\n";
+                    }
+                }
+                # Rows 14-15: flip loadDeltas to the other shared buffer (XOR 4096), sync, and read the next chunk's first two delta vectors.
+                if ($shared_row == 14 && ($k + 256) < $rowsize)
+                {
+                    $out .= "10:-:-:-:1      LOP.XOR loadDeltas, loadDeltas, 4096;\n";
+                    $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+                    $out .= sprintf "--:-:%d:-:1      LDS.U.128 delta%dr, [loadDeltas];\n", $set_flag, $write_buffer;
+                }
+
+                if ($shared_row == 15 && ($k + 256) < $rowsize)
+                {
+                    $out .= sprintf "--:-:%d:-:1      LDS.U.128 delta%dr, [loadDeltas + 4x<4*16>];\n\n", $set_flag, $write_buffer;
+                }
+                # Emit the 3x4 FFMA tile: accum[row*4+col] += weight[row*72+weight_index] * delta<read_buffer>r<col>.
+                foreach my $row (0 .. 2)
+                {
+                    my $weight = ($row * 72) + $weight_index;
+
+                    foreach my $col (0 .. 3)
+                    {
+                        my $accum = ($row * 4) + $col;
+                        my $wait = "--";
+                        my $stall = 1;
+                        if ($accum == 0)  # the first FFMA of a tile waits on the barrier guarding the delta set being read
+                        {
+                            if ($weight_index == 0)  # the first tile additionally waits on barrier 6 (mask 0x20)
+                            {
+                                $wait = sprintf "%02x", (0x20 | (1 << ($wait_flag - 1)));
+                            }
+                            else
+                            {
+                                $wait = sprintf "%02x", (1 << ($wait_flag - 1));
+                            }
+                        }
+                        # The tile's last FFMA gets stall count 0 exactly when the next shared row starts by emitting a delta LDS.U.128.
+                        if ($row == 2 && $col == 3)
+                        {
+                            if ($shared_row < 13 && ($k + (16 * ($shared_row + 3))) < $rowsize)
+                            {
+                                $stall = 0;
+                            }
+                            elsif ($shared_row == 14 && ($k + 256) < $rowsize)
+                            {
+                                $stall = 0;
+                            }
+                        }
+
+                        $out .= sprintf "%s:-:-:-:%d      FFMA accum%02d, weight%03d, delta%dr%d, accum%02d;\n", $wait, $stall, $accum, $weight, $read_buffer, $col, $accum;
+                    }
+                }
+
+                $weight_index++;
+            }
+            # Advance the rotating barrier and buffer indices, wrapping 5 -> 2 and 3 -> 0.
+            $wait_flag += 1;
+            $set_flag += 1;
+            $read_buffer += 1;
+            $write_buffer += 1;
+            if($wait_flag == 5)
+            {
+                $wait_flag = 2;
+            }
+            if($set_flag == 5)
+            {
+                $set_flag = 2;
+            }
+            if($read_buffer == 3)
+            {
+                $read_buffer = 0;
+            }
+            if($write_buffer == 3)
+            {
+                $write_buffer = 0;
+            }
+        }
+    }
+
+    return $out;
+</CODE>
+
+//Load hidden states
+--:-:-:-:6      IADD     hOffset,    rowOffset, warpTid;
+--:-:-:-:6      XMAD     hOffset,    hOffset,   param_ldh,  timeStep;
+--:-:-:-:6      LEA      hAddr0.CC,  hOffset,   param_h[0],      4;
+--:-:-:-:2      LEA.HI.X hAddr1,     hOffset,   param_h[1], RZ, 4;
+--:-:5:-:1 @P0  LDG.E.CI.128 h, [hAddr];
+
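+//Reduction between threads: XOR-butterfly shuffles (offsets 1, 2, 4, 8) sum each accumulator across the 16 lanes of a half-warp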
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 1, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 1, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 1, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 1, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 1, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 2, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 2, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 2, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 2, 0x1f;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 2, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 2, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 2, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 2, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 4, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 4, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 4, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 4, 0x1f;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 4, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 4, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 4, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 4, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 8, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 8, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 8, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 8, 0x1f;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 8, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 8, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 8, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 8, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+--:-:-:-:5      MOV reluclip, param_reluclip;
+
+//Compute store pointer (dAddr) and the lock word address for this time step (lockAddr); each 64-bit LEA pair carries through CC
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD     hRow,       rowOffset,  warpTid;
+--:-:-:-:1      XMAD     storeIndex, hRow,       param_ldd,  timeStep;
+--:-:-:-:1      LEA      dAddr0.CC,  storeIndex, param_d[0],      4;
+--:-:-:-:1      LEA.HI.X dAddr1,     storeIndex, param_d[1], RZ, 4;
+--:-:-:-:1      LEA      lockAddr0.CC, timeStep, param_lockAddr[0], 2;
+--:-:-:-:1      LEA.HI.X lockAddr1,  timeStep,   param_lockAddr[1], RZ, 2;
+
+//Conditional select for output: lanes with warpTid 0/1/2 (P2/P3/P4) add accumulator row 0/1/2 to the value loaded into output
+//TODO: make sure scheduler orders these such that first one waits on barrier
+20:-:-:-:1  @P2 FADD output0, output0, accum00;
+20:-:-:-:1  @P3 FADD output0, output0, accum04;
+20:-:-:-:1  @P4 FADD output0, output0, accum08;
+
+20:-:-:-:1  @P2 FADD output1, output1, accum01;
+20:-:-:-:1  @P3 FADD output1, output1, accum05;
+20:-:-:-:1  @P4 FADD output1, output1, accum09;
+
+20:-:-:-:1  @P2 FADD output2, output2, accum02;
+20:-:-:-:1  @P3 FADD output2, output2, accum06;
+20:-:-:-:1  @P4 FADD output2, output2, accum10;
+
+20:-:-:-:1  @P2 FADD output3, output3, accum03;
+20:-:-:-:1  @P3 FADD output3, output3, accum07;
+20:-:-:-:3  @P4 FADD output3, output3, accum11;
+</SCHEDULE_BLOCK>
+
+//Save select predicates P1-P4 (mask 0x1e) into predSave; they are restored with R2P after the inter-block barrier
+//TODO: how many stall cycles needed here?
+--:-:-:-:6      P2R predSave, PR, RZ, 0x1e;
+
+//Multiply by bprop for the rectlinclip (clipped ReLU) activation: output is zeroed unless 0 < h < reluclip
+//TODO: others
+<SCHEDULE_BLOCK>
+10:-:-:-:1      FSETP.LT.AND P2, PT, RZ, h0, PT;
+10:-:-:-:1      FSETP.LT.AND P3, PT, RZ, h1, PT;
+10:-:-:-:1      FSETP.LT.AND P4, PT, RZ, h2, PT;
+10:-:-:-:1      FSETP.LT.AND P5, PT, RZ, h3, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, h0, reluclip, P2;
+--:-:-:-:1      FSETP.LT.AND P3, PT, h1, reluclip, P3;
+--:-:-:-:1      FSETP.LT.AND P4, PT, h2, reluclip, P4;
+--:-:-:-:1      FSETP.LT.AND P5, PT, h3, reluclip, P5;
+--:-:-:-:1 @!P2 FMUL output0, output0, RZ;
+--:-:-:-:1 @!P3 FMUL output1, output1, RZ;
+--:-:-:-:1 @!P4 FMUL output2, output2, RZ;
+--:-:-:-:1 @!P5 FMUL output3, output3, RZ;
+
+//Update timestep: step +1 (forward) or -1 (reverse); expectVal = terminal timeStep (param_seqLength forward, -1 reverse)
+--:-:-:-:1      ISETP.EQ.AND P1, PT, RZ, param_reverse, PT;
+--:-:-:-:1  @P1 MOV setVal, 1;
+--:-:-:-:1 @!P1 MOV setVal, -1;
+--:-:-:-:1  @P1 MOV expectVal, param_seqLength;
+--:-:-:-:1 @!P1 MOV expectVal, -1;
+--:-:-:-:1      IADD timeStep, timeStep, setVal;
+</SCHEDULE_BLOCK>
+
+//Conditional store: lanes with P0 set write their 128-bit output to dAddr
+--:-:-:-:5  @P0 STG.E.CI.128 [dAddr], output;
+
+//Compute predicate for time unrolling loop: P5 = (timeStep != expectVal), i.e. more time steps remain
+--:-:-:Y:d      ISETP.NE.AND P5, PT, timeStep, expectVal, PT;
+
+//P2 = (tid != 0)
+//setVal = 1, expectVal = param_numBlks
+--:-:-:-:1      ISETP.NE.AND P2, PT, tid, RZ, PT;
+--:-:-:-:1      MOV expectVal, param_numBlks;
+--:-:-:Y:b      MOV setVal, 1;
+
+//Barrier for all blocks: thread 0 of each block adds 1 to the lock word, then spins until it equals param_numBlks
+--:-:-:-:f      MEMBAR.GL;
+--:-:-:-:5      BAR.SYNC 0;
+//Threads with P2 set (tid != 0) skip the atomic and reconverge at SSY_TARGET1
+--:-:-:-:2      SSY SSY_TARGET1;
+--:-:-:-:d  @P2 SYNC;
+
+--:-:-:Y:2      ATOM.E.ADD RZ, [lockAddr], setVal;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET1:
+--:-:-:-:1      SSY SSY_TARGET2;
+--:-:-:-:d  @P2 SYNC;
+//Only thread 0 reaches the spin loop; P2 is reused here as the "lock not yet reached" flag
+SPINLOCK:
+--:-:1:Y:2      LDG.E lockVal, [lockAddr];
+01:-:-:Y:d      ISETP.NE.AND P2, PT, lockVal, expectVal, PT;
+--:-:-:-:5  @P2 BRA.U SPINLOCK;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET2:
+--:-:-:-:5      BAR.SYNC 0;
+
+//Restore select predicates P1-P4 (mask 0x1e); this undoes the reuse of P2 above, while P0 and P5 lie outside the mask
+--:-:-:-:1      R2P PR, predSave, 0x1e;
+
+//Conditional branch back to beginning of loop
+--:-:-:Y:5  @P5 BRA.U UNROLLING_LOOP;
+
+--:-:-:-:5      EXIT;
diff --git a/Kernel/Convolution/Maxwell/persistent_rnn_fprop.sass b/Kernel/Convolution/Maxwell/persistent_rnn_fprop.sass
new file mode 100644
index 0000000..6a11539
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/persistent_rnn_fprop.sass
@@ -0,0 +1,653 @@
+# Kernel: persistent_birnn
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(64*48)>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_h[0]        : c[0x0][0x140]
+    param_h[1]        : c[0x0][0x144]
+    param_hprev[0]    : c[0x0][0x148]
+    param_hprev[1]    : c[0x0][0x14c]
+    param_bias[0]     : c[0x0][0x150]
+    param_bias[1]     : c[0x0][0x154]
+    param_w[0]        : c[0x0][0x158]
+    param_w[1]        : c[0x0][0x15c]
+    param_lockAddr[0] : c[0x0][0x160]
+    param_lockAddr[1] : c[0x0][0x164]
+    param_ldh         : c[0x0][0x168]
+    param_ldw         : c[0x0][0x16c]
+    param_bsz         : c[0x0][0x170]
+    param_seqLength   : c[0x0][0x174]
+    param_numBlks     : c[0x0][0x178]
+    param_rowSize     : c[0x0][0x17c]
+    param_reverse     : c[0x0][0x180]
+    param_reluclip    : c[0x0][0x184]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+      0-215 : weight<000-215>
+    216-227 : accum<00-11>
+    228-229 : timeStep, biasValue
+    230-232 : warpTid, rowOffset, tid
+
+    233     : bid
+
+    236-243 : wAddr0r<0-1>, wAddr1r<0-1>, wAddr2r<0-1>, biasAddr<0-1>
+    244-254 ~ ldw, wRow, warpTid4, loadRow, warpIndex, storeWeights, loadWeights, rowSize
+
+    233     : hOffset
+    233     : ldh
+    234-239 : hprevAddr<0-1>, loadBuffer<0-3>
+    240-251 : hidden0r<0-3>, hidden1r<0-3>, hidden2r<0-3>
+    252-254 ~ loadHiddens, storeHiddens, loadIndex
+
+    240-251 : peerR0V<0-3>, peerR1V<0-3>, peerR2V<0-3>
+
+    240-249 : output<0-3>, hAddr<0-1>, lockAddr<0-1>, expectVal, setVal
+    250-254 ~ storeIndex, hRow, predSave, lockVal, reluclip
+
+</REGISTER_MAPPING>
+
+//Get tid/block id
+--:-:1:-:1      S2R tid, SR_TID.X;
+--:-:2:-:1      S2R bid, SR_CTAID.X;
+
+//Store zeros at addr_zero
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV     ldw,       param_ldw;
+--:-:-:-:1      MOV     rowSize,   param_rowSize;
+
+//timeStep = (param_reverse == 0) ? 0 : (param_seqLength - 1)
+--:-:-:-:1      ISETP.EQ.AND P2, PT, RZ, param_reverse, PT;
+--:-:-:-:1      SEL timeStep, RZ, param_seqLength, P2;
+--:-:-:-:1 @!P2 IADD timeStep, timeStep, -1;
+
+//warpIndex = threadIdx.x >> 5
+01:-:-:-:1      SHR.U32 warpIndex, tid, 5;
+
+//warpTid = threadIdx.x & 0x1f
+01:-:-:-:1      LOP.AND warpTid,   tid, 0x1f;
+
+//rowOffset = ((blockIdx.x << 3) + warp_index) * 6
+02:-:-:-:1      SHL     rowOffset, bid,       3;
+--:-:-:-:1      IADD    rowOffset, rowOffset, warpIndex;
+--:-:-:-:1      XMAD    rowOffset, rowOffset, 6, RZ;
+
+//if(warp_tid > 15) rowOffset += 3
+--:-:-:-:1      ISETP.GT.AND P1, PT, warpTid, 15, PT;
+--:-:-:-:1  @P1 IADD     rowOffset, rowOffset, 3;
+
+//warpTid = warpTid & 0x0f
+--:-:-:-:1      LOP.AND  warpTid, warpTid, 0x0f;
+--:-:-:-:1      ISETP.LT.AND P0, PT, warpTid, 3, PT;
+
+//warpTid4 = warpTid << 2
+--:-:-:-:1      SHL      warpTid4, warpTid, 2;
+
+//base = (warpIndex * 6 + ((P1) ? 3 : 0)) << 6
+//storeWeights = (base + warpTid4) << 2, loadWeights = (base + warpTid) << 2
+--:-:-:-:1  @P1 MOV      loadWeights, 3;
+--:-:-:-:1 @!P1 MOV      loadWeights, RZ;
+
+--:-:-:-:1      XMAD     loadWeights, warpIndex, 6, loadWeights;
+--:-:-:-:1      SHL      loadWeights, loadWeights, 6;
+
+--:-:-:-:1      IADD     storeWeights, loadWeights, warpTid4;
+--:-:-:-:1      IADD     loadWeights, loadWeights, warpTid;
+--:-:-:-:1      SHL      storeWeights, storeWeights, 2;
+--:-:-:-:1      SHL      loadWeights, loadWeights, 2;
+
+//wRow = rowOffset * ldw + warpTid4
+--:-:-:-:1      XMAD     wRow, rowOffset, ldw, warpTid4;
+
+//wAddr0r = &w[wRow]
+--:-:-:-:1      LEA      wAddr0r0.CC, wRow, param_w[0],     2;
+--:-:-:-:1      LEA.HI.X wAddr0r1,    wRow, param_w[1], RZ, 2;
+
+//ldw = ldw << 2
+--:-:-:-:1      SHL      ldw,  ldw,       2;
+
+//wAddr1r = wAddr0r + ldw
+--:-:-:-:1      IADD     wAddr1r0.CC, wAddr0r0, ldw;
+--:-:-:-:1      IADD.X   wAddr1r1,    wAddr0r1, RZ;
+
+//wAddr2r = wAddr1r + ldw
+--:-:-:-:1      IADD     wAddr2r0.CC, wAddr1r0, ldw;
+--:-:-:-:1      IADD.X   wAddr2r1,    wAddr1r1, RZ;
+
+//Compute row loading predicates
+--:-:-:-:1      ISETP.LT.AND P1, PT, warpTid4, rowSize, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, rowOffset, rowSize, P1;
+--:-:-:-:1      IADD     rowSize, rowSize, -1;
+--:-:-:-:1      ISETP.LT.AND P4, PT, rowOffset, rowSize, P1;
+--:-:-:-:1      IADD     rowSize, rowSize, -1;
+--:-:-:-:1      ISETP.LT.AND P5, PT, rowOffset, rowSize, P1;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:c      NOP;
+
+//Load weights to registers
+<CODE>
+    my $out;
+    my $regId = 0;
+    my $rowsize = 1152;  # hard-coded loop extent: 18 iterations of 64
+    # Each iteration stages one 64-wide step of three weight rows through shared memory and reads it back into registers.
+    for (my $col=0; $col < $rowsize; $col += 64)
+    {
+        $out .= "--:-:-:-:1      IADD warpTid4, warpTid4, 64;\n";
+
+        #Use vector loads from weight matrix; when the predicate is false, zeros are read from addr_zero instead
+        $regId = $col / 16;
+        $out .= sprintf "--:-:1:-:1  \@P3 LDG.E.128 weight%03d, [wAddr0r + 4x<%d>];\n", $regId, $col;
+        $out .= sprintf "--:-:1:-:1 \@!P3 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+        $regId = $col / 16 + 72;
+        $out .= sprintf "--:-:2:-:1  \@P4 LDG.E.128 weight%03d, [wAddr1r + 4x<%d>];\n", $regId, $col;
+        $out .= sprintf "--:-:2:-:1 \@!P4 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+        $regId = $col / 16 + 144;
+        $out .= sprintf "--:-:3:-:1  \@P5 LDG.E.128 weight%03d, [wAddr2r + 4x<%d>];\n", $regId, $col;
+        $out .= sprintf "--:-:3:-:1 \@!P5 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+        # NOTE(review): these compare against the rowSize register, which the setup block has already decremented twice; the bprop kernel compares against param_rowSize here - confirm this is intended.
+        $out .= "--:-:-:-:1      ISETP.LT.AND P3, PT, warpTid4, rowSize, P3;\n";
+        $out .= "--:-:-:-:1      ISETP.LT.AND P4, PT, warpTid4, rowSize, P4;\n";
+        $out .= "--:-:-:-:1      ISETP.LT.AND P5, PT, warpTid4, rowSize, P5;\n";
+
+        #Store weights into shared memory (after the first iteration, sync first so earlier reads are finished)
+        if ($col > 0)
+        {
+            $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+        }
+
+        $regId = $col / 16;
+        $out .= sprintf "01:-:-:-:1      STS.U.128 [storeWeights], weight%03d;\n", $regId;
+        $regId = $col / 16 + 72;
+        $out .= sprintf "02:-:-:-:1      STS.U.128 [storeWeights + 4x<64>], weight%03d;\n", $regId;
+        $regId = $col / 16 + 144;
+        $out .= sprintf "04:-:-:-:1      STS.U.128 [storeWeights + 4x<128>], weight%03d;\n", $regId;
+
+        #Load each weight from shared mem: 4 scalars x 3 rows per iteration, overwriting the staging registers
+        $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+
+        foreach my $shared_col (0 .. 3)
+        {
+            foreach my $row (0 .. 2)
+            {
+                my $control;
+                # Only the very last LDS of the whole sequence gets the special control code (sets barriers 1 and 6).
+                if (($col + 64) >= $rowsize && $row == 2 && $shared_col == 3)
+                {
+                    $control = "--:1:6:-:2";
+                }
+                else
+                {
+                    $control = "--:-:-:-:1";
+                }
+
+                $regId = ($row * 72) + ($col / 16) + $shared_col;
+                my $shared_offset = ($row * 64) + ($shared_col * 16);
+                $out .= sprintf "%s      LDS.U weight%03d, [loadWeights + 4x<%d>];\n", $control, $regId, $shared_offset;
+            }
+        }
+    }
+
+    $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+
+    return $out;
+
+</CODE>
+
+//Conditional load of bias: biasValue = P0 ? *(param_bias + (loadRow << 2)) : 0
+<SCHEDULE_BLOCK>
+01:-:-:-:1      IADD     loadRow,      rowOffset, warpTid; // loadRow = rowOffset + warpTid
+--:-:-:-:1      ISETP.LT.AND P0, PT, loadRow, param_rowSize, P0; // P0 = (loadRow < param_rowSize) AND previous P0
+--:-:-:-:1      LEA      biasAddr0.CC, loadRow,   param_bias[0],     2; // low address word, carry kept in CC
+--:-:-:-:1      LEA.HI.X biasAddr1,    loadRow,   param_bias[1], RZ, 2; // high address word, consumes CC
+--:-:-:-:1  @P0 LDG.E    biasValue,    [biasAddr];
+--:-:-:-:1 @!P0 MOV      biasValue,    RZ; // lanes failing the P0 test use a zero bias
+</SCHEDULE_BLOCK>
+
+//Predicates for store code: P2/P3/P4 = (warpTid == 0/1/2), used by the conditional output MOVs after the reduction
+--:-:-:-:1      ISETP.EQ.AND P2, PT, warpTid, 0, PT;
+--:-:-:-:1      ISETP.EQ.AND P3, PT, warpTid, 1, PT;
+--:-:-:-:1      ISETP.EQ.AND P4, PT, warpTid, 2, PT;
+
+UNROLLING_LOOP:
+<SCHEDULE_BLOCK>
+//Prime inner product loop by loading first rows of hprev
+--:-:-:-:1      MOV loadIndex,    tid;
+
+//storeHiddens = tid << 4, loadHiddens = warpTid << 4
+--:-:-:-:1      SHL storeHiddens, tid, 4;
+--:-:-:-:1      SHL loadHiddens, warpTid, 4;
+
+//hOffset = loadIndex * param_ldh + timeStep, hprevAddr = param_hprev + (hOffset << 4)
+--:-:-:-:1      XMAD     hOffset,        loadIndex, param_ldh,      timeStep;
+--:-:-:-:1      LEA      hprevAddr0.CC,  hOffset,   param_hprev[0],     4;
+--:-:-:-:2      LEA.HI.X hprevAddr1,     hOffset,   param_hprev[1], RZ, 4;
+
+//loadBuffer = (loadIndex < param_rowSize) ? *hprevAddr : *addr_zero
+--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;
+--:5:1:-:2  @P1 LDG.E.CI.128 loadBuffer, [hprevAddr];
+--:5:1:-:2 @!P1 LDS.U.128    loadBuffer, [addr_zero];
+
+//ldh = param_ldh << 12 (added to hprevAddr each time loadIndex advances by 256)
+--:-:-:-:1      MOV ldh, param_ldh;
+--:-:-:-:1      SHL ldh, ldh, 12;
+</SCHEDULE_BLOCK>
+
+//Initialize all accumulation registers to 0 (three 128-bit loads from addr_zero into accum00, accum04 and accum08)
+<CODE>
+    return join '', map { sprintf "--:-:-:-:1      LDS.U.128 accum%02d, [addr_zero];\n", $_ } (0, 4, 8);
+</CODE>
+
+//Update load index and load address: loadIndex += 256, P1 = (loadIndex < param_rowSize), hprevAddr += ldh
+--:-:-:-:6      IADD loadIndex, loadIndex, 256;
+--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;
+10:-:-:-:6      IADD   hprevAddr0.CC, hprevAddr0, ldh; // wait mask 10 = barrier 5, the read barrier set by the priming loads
+--:-:-:-:6      IADD.X hprevAddr1,    hprevAddr1, RZ;
+
+01:-:-:-:1      STS.U.128 [storeHiddens], loadBuffer; // wait mask 01 = barrier 1, signalled when loadBuffer has been written
+
+//Unrolled GEMM loop (Perl generator: emits the FFMA stream with the loads for later steps interleaved)
+<CODE>
+    our @top; # package variable filled outside this block and emitted first - TODO confirm its contents
+
+    my $out = join '', @top;
+
+    my $rowsize = 1152; # rows covered by the unrolled loop: 72 steps of 16 rows
+    my $weight_index = 0; # number of 16-row steps emitted so far (capped at 72 below)
+
+    my $wait_flag = 2; # barrier the first FFMA of a step waits on, cycles 2, 3, 4
+    my $set_flag = 4; # barrier signalled by the LDS that prefetches hidden values, cycles 2, 3, 4
+    my $read_buffer = 0; # hiddenNr register set read by the FFMAs, cycles 0, 1, 2
+    my $write_buffer = 2; # hiddenNr register set filled by the prefetching LDS, cycles 0, 1, 2
+
+    for (my $k=0; $k < $rowsize; $k+=256) # one pass per 256-row chunk = 16 shared_row steps
+    {
+        if ($k == 0) # prologue: start the next global load, sync, preload hidden0r and hidden1r
+        {
+            $out .= "--:6:1:-:1  \@P1 LDG.E.CI.128 loadBuffer, [hprevAddr];\n";
+            $out .= "--:-:1:-:1 \@!P1 LDS.U.128    loadBuffer, [addr_zero];\n\n";
+            $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+            $out .= "--:-:2:-:1      LDS.U.128 hidden0r, [loadHiddens];\n";
+            $out .= "--:-:3:-:1      LDS.U.128 hidden1r, [loadHiddens + 4x<4*16>];\n\n";
+        }
+        $out .= "--:-:-:-:1      LOP.XOR storeHiddens, storeHiddens, 4096;\n"; # toggle bit 12 of the store address
+
+        foreach my $shared_row (0 .. 15)
+        {
+            if($weight_index < 72) # steps past the 72nd emit nothing but still rotate the counters below
+            {
+                # Prefetch the hidden values needed two steps ahead, while they are still inside this chunk.
+                if ($shared_row < 14 && ($k + (16 * ($shared_row + 2))) < $rowsize)
+                {
+                    my $read_bar = "-";
+                    if ($shared_row == 13 && ($k + 256) < $rowsize)
+                    {
+                        $read_bar = "5"; # read barrier 5 is waited on (mask 10) by the LOP.XOR of loadHiddens at step 14
+                    }
+                    $out .= sprintf "--:%s:%d:-:1      LDS.U.128 hidden%dr, [loadHiddens + 4x<4*%d>];\n", $read_bar, $set_flag, $write_buffer, (16 * ($shared_row + 2));
+                }
+
+                if ($shared_row == 11) # advance the global load cursor (low address word, mask 20 = barrier 6)
+                {
+                    $out .= "--:-:-:-:1      IADD loadIndex, loadIndex, 256;\n";
+                    $out .= "20:-:-:-:1      IADD hprevAddr0.CC, hprevAddr0, ldh;\n";
+                }
+
+                if ($shared_row == 12) # refresh the bounds predicate and finish the 64-bit address add
+                {
+                    $out .= "--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;\n";
+                    $out .= "--:-:-:-:1      IADD.X hprevAddr1,    hprevAddr1, RZ;\n";
+                }
+
+                if ($shared_row == 13) # stage the chunk loaded earlier, then start the next global load
+                {
+                    $out .= "01:-:-:-:1      STS.U.128 [storeHiddens], loadBuffer;\n";
+
+                    if (($k + 512) < $rowsize) # another hprev chunk is still needed
+                    {
+                        $out .= "--:6:1:-:1  \@P1 LDG.E.CI.128 loadBuffer, [hprevAddr];\n";
+                        $out .= "--:-:1:-:1 \@!P1 LDS.U.128    loadBuffer, [addr_zero];\n\n";
+                    }
+                    else # no more hprev chunks: load the current contents of param_h for this row instead
+                    {
+                        $out .= "--:-:-:-:6      IADD     hOffset,        rowOffset, warpTid;\n";
+                        $out .= "--:-:-:-:6      XMAD     hOffset,        hOffset,   param_ldh,  timeStep;\n";
+                        $out .= "--:-:-:-:6      LEA      hprevAddr0.CC,  hOffset,   param_h[0],      4;\n";
+                        $out .= "--:-:-:-:2      LEA.HI.X hprevAddr1,     hOffset,   param_h[1], RZ, 4;\n";
+                        $out .= "--:-:6:-:1 \@P0 LDG.E.CI.128 loadBuffer, [hprevAddr];\n\n";
+                    }
+                }
+
+                if ($shared_row == 14) # switch the load address to the other buffer, sync, preload its first 16 rows
+                {
+                    $out .= "10:-:-:-:1      LOP.XOR loadHiddens, loadHiddens, 4096;\n";
+                    $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+                    $out .= sprintf "--:-:%d:-:1      LDS.U.128 hidden%dr, [loadHiddens];\n", $set_flag, $write_buffer;
+                }
+
+                if ($shared_row == 15) # preload the second 16 rows of the new buffer
+                {
+                    $out .= sprintf "--:-:%d:-:1      LDS.U.128 hidden%dr, [loadHiddens + 4x<4*16>];\n\n", $set_flag, $write_buffer;
+                }
+
+                # 3 rows x 4 columns of FFMA: accum[row*4+col] += weight[row*72+weight_index] * hidden[read_buffer][col]
+                foreach my $row (0 .. 2)
+                {
+                    my $weight = ($row * 72) + $weight_index;
+
+                    foreach my $col (0 .. 3)
+                    {
+                        my $accum = ($row * 4) + $col;
+                        my $wait = "--";
+                        my $stall = 1;
+                        if ($accum == 0) # only the first FFMA of a step carries a wait mask
+                        {
+                            if ($weight_index == 0) # very first FFMA additionally waits on barrier 6 (0x20)
+                            {
+                                $wait = sprintf "%02x", (0x20 | (1 << ($wait_flag - 1)));
+                            }
+                            else
+                            {
+                                $wait = sprintf "%02x", (1 << ($wait_flag - 1));
+                            }
+                        }
+
+                        # Last FFMA of a step gets stall 0 in the cases where the next step opens with an LDS.
+                        if ($row == 2 && $col == 3)
+                        {
+                            if ($shared_row < 13 && ($k + (16 * ($shared_row + 3))) < $rowsize)
+                            {
+                                $stall = 0;
+                            }
+                            elsif ($shared_row == 14 && ($k + 256) < $rowsize)
+                            {
+                                $stall = 0;
+                            }
+                        }
+
+                        $out .= sprintf "%s:-:-:-:%d      FFMA accum%02d, weight%03d, hidden%dr%d, accum%02d;\n", $wait, $stall, $accum, $weight, $read_buffer, $col, $accum;
+                    }
+                }
+
+                $weight_index++;
+            }
+
+            # Rotate barrier numbers (2..4) and hidden register sets (0..2) for the next step.
+            $wait_flag += 1;
+            $set_flag += 1;
+            $read_buffer += 1;
+            $write_buffer += 1;
+            if($wait_flag == 5)
+            {
+                $wait_flag = 2;
+            }
+            if($set_flag == 5)
+            {
+                $set_flag = 2;
+            }
+            if($read_buffer == 3)
+            {
+                $read_buffer = 0;
+            }
+            if($write_buffer == 3)
+            {
+                $write_buffer = 0;
+            }
+        }
+    }
+
+    return $out;
+</CODE>
+
+//Reduction between threads: four SHFL.BFLY + FADD rounds (lane masks 1, 2, 4, 8), interleaved per V0-V3 column group
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 1, 0x1f; // round 1 (lane mask 1)
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 1, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 1, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 1, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 1, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 1, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0; // wait masks 01/02/04/08 = barriers 1-4 set by the last shuffle of each group
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 2, 0x1f; // round 2 (lane mask 2)
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 2, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 2, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 2, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 2, 0x1f;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 2, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 2, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 2, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 2, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 4, 0x1f; // round 3 (lane mask 4)
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 4, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 4, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 4, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 4, 0x1f;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 4, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 4, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 4, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 4, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 8, 0x1f; // round 4 (lane mask 8)
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 8, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 8, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 8, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 8, 0x1f;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 8, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 8, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 8, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 8, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0; // final adds of round 4
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+//Compute store pointer: hAddr = param_h + ((hRow * param_ldh + timeStep) << 4), lockAddr = param_lockAddr + (timeStep << 2)
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD     hRow,       rowOffset,  warpTid;
+--:-:-:-:1      XMAD     storeIndex, hRow,       param_ldh, timeStep;
+--:-:-:-:1      LEA      hAddr0.CC,  storeIndex, param_h[0],      4;
+--:-:-:-:1      LEA.HI.X hAddr1,     storeIndex, param_h[1], RZ, 4;
+--:-:-:-:1      LEA      lockAddr0.CC, timeStep, param_lockAddr[0], 2; // .CC added so the LEA.HI.X below uses this carry, not the one left by hAddr0
+--:-:-:-:1      LEA.HI.X lockAddr1,  timeStep,   param_lockAddr[1], RZ, 2;
+
+//Conditional select for output: the lane with warpTid 0/1/2 (P2/P3/P4) takes accumulator row 0/1/2
+--:-:-:-:1  @P2 MOV output0, accum00;
+--:-:-:-:1  @P3 MOV output0, accum04;
+--:-:-:-:1  @P4 MOV output0, accum08;
+
+--:-:-:-:1  @P2 MOV output1, accum01;
+--:-:-:-:1  @P3 MOV output1, accum05;
+--:-:-:-:1  @P4 MOV output1, accum09;
+
+--:-:-:-:1  @P2 MOV output2, accum02;
+--:-:-:-:1  @P3 MOV output2, accum06;
+--:-:-:-:1  @P4 MOV output2, accum10;
+
+--:-:-:-:1  @P2 MOV output3, accum03;
+--:-:-:-:1  @P3 MOV output3, accum07;
+--:-:-:-:3  @P4 MOV output3, accum11;
+
+//Update timestep: P5 = (param_reverse == 0) selects step +1 with end value param_seqLength, else step -1 with end value -1
+--:-:-:-:1      ISETP.EQ.AND P5, PT, RZ, param_reverse, PT;
+--:-:-:-:1  @P5 MOV setVal, 1;
+--:-:-:-:1 @!P5 MOV setVal, -1;
+--:-:-:-:1  @P5 MOV expectVal, param_seqLength;
+--:-:-:-:1 @!P5 MOV expectVal, -1;
+--:-:-:-:1      IADD timeStep, timeStep, setVal;
+</SCHEDULE_BLOCK>
+
+//Save select predicates (restored by the R2P with the same 0x0c mask after the inter-block barrier)
+--:-:-:-:1      P2R predSave, PR, RZ, 0x0c;
+
+--:-:-:-:1      MOV reluclip, param_reluclip;
+
+//Add bias for output
+--:-:-:-:1      FADD output0, output0, biasValue;
+--:-:-:-:1      FADD output1, output1, biasValue;
+--:-:-:-:1      FADD output2, output2, biasValue;
+--:-:-:-:3      FADD output3, output3, biasValue;
+
+//Accumulate on top of current data (wait mask 20 = barrier 6, set by the final LDG into loadBuffer in the GEMM loop)
+20:-:-:-:1      FADD output0, output0, loadBuffer0;
+--:-:-:-:1      FADD output1, output1, loadBuffer1;
+--:-:-:-:1      FADD output2, output2, loadBuffer2;
+--:-:-:-:3      FADD output3, output3, loadBuffer3;
+
+//Activation function: clipped ReLU - presumably max(output, 0) via !PT, then min(output, reluclip) via PT
+//TODO: add others
+--:-:-:-:2  FMNMX output0, output0, RZ, !PT;
+--:-:-:-:2  FMNMX output1, output1, RZ, !PT;
+--:-:-:-:2  FMNMX output2, output2, RZ, !PT;
+--:-:-:-:2  FMNMX output3, output3, RZ, !PT;
+
+--:-:-:-:2  FMNMX output0, output0, reluclip, PT;
+--:-:-:-:2  FMNMX output1, output1, reluclip, PT;
+--:-:-:-:2  FMNMX output2, output2, reluclip, PT;
+--:-:-:-:2  FMNMX output3, output3, reluclip, PT;
+
+//Conditional store (only lanes with P0 set write their 128-bit result to hAddr)
+--:-:-:-:1  @P0 STG.E.CI.128 [hAddr], output;
+
+//Compute predicate for time unrolling loop: P5 = (timeStep != expectVal), taken before expectVal is reused below
+--:-:-:Y:d      ISETP.NE.AND P5, PT, timeStep, expectVal, PT;
+
+//P2 = (tid != 0)
+//setVal = 1, expectVal = param_numBlks (the lock value the spinlock waits for)
+--:-:-:-:1      ISETP.NE.AND P2, PT, tid, RZ, PT;
+--:-:-:-:1      MOV expectVal, param_numBlks;
+--:-:-:Y:b      MOV setVal, 1;
+
+//Barrier for all blocks: presumably thread 0 of each block bumps the lock word, then spins until it equals param_numBlks
+--:-:-:-:f      MEMBAR.GL;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:2      SSY SSY_TARGET1;
+--:-:-:-:d  @P2 SYNC; // threads with tid != 0 skip the atomic
+
+--:-:-:Y:2      ATOM.E.ADD RZ, [lockAddr], setVal; // lock word += 1, result discarded (RZ)
+--:-:-:-:d      SYNC;
+
+SSY_TARGET1:
+--:-:-:-:1      SSY SSY_TARGET2;
+--:-:-:-:d  @P2 SYNC; // threads with tid != 0 skip the spin loop
+
+SPINLOCK:
+--:-:1:Y:2      LDG.E lockVal, [lockAddr];
+01:-:-:Y:d      ISETP.NE.AND P2, PT, lockVal, expectVal, PT; // P2 is reused here as the keep-spinning flag
+--:-:-:-:5  @P2 BRA.U SPINLOCK;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET2:
+--:-:-:-:5      BAR.SYNC 0;
+
+//Restore select predicates (same 0x0c mask as the earlier P2R save)
+--:-:-:-:1      R2P PR, predSave, 0x0c;
+
+//Conditional branch back to beginning of loop
+--:-:-:Y:5  @P5 BRA.U UNROLLING_LOOP;
+
+--:-:-:-:5      EXIT;
diff --git a/Kernel/Convolution/Maxwell/sconv_bprop_C1_N64.sass b/Kernel/Convolution/Maxwell/sconv_bprop_C1_N64.sass
new file mode 100644
index 0000000..070db8c
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/sconv_bprop_C1_N64.sass
@@ -0,0 +1,600 @@
+# Kernel: sconv_bprop_C32_N64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_lut : 4x<64*4>
+
+    param_I[0]         : c[0x0][0x140]
+    param_I[1]         : c[0x0][0x144]
+    param_E[0]         : c[0x0][0x148]
+    param_E[1]         : c[0x0][0x14c]
+    param_F[0]         : c[0x0][0x150]
+    param_F[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_N            : c[0x0][0x15c]
+    param_K            : c[0x0][0x160]
+    param_D            : c[0x0][0x164]
+    param_H            : c[0x0][0x168]
+    param_W            : c[0x0][0x16c]
+    param_WN           : c[0x0][0x170]
+    param_HWN          : c[0x0][0x174]
+    param_DHWN         : c[0x0][0x178]
+    param_C            : c[0x0][0x17c]
+    param_CRST         : c[0x0][0x180]
+    param_RST          : c[0x0][0x184]
+    param_magic_RST    : c[0x0][0x188]
+    param_shift_RST    : c[0x0][0x18c]
+    param_RS           : c[0x0][0x190]
+    param_magic_RS     : c[0x0][0x194]
+    param_shift_RS     : c[0x0][0x198]
+    param_S            : c[0x0][0x19c]
+    param_magic_S      : c[0x0][0x1a0]
+    param_shift_S      : c[0x0][0x1a4]
+    param_pad_d        : c[0x0][0x1a8]
+    param_pad_h        : c[0x0][0x1ac]
+    param_pad_w        : c[0x0][0x1b0]
+    param_str_d        : c[0x0][0x1b4]
+    param_str_h        : c[0x0][0x1b8]
+    param_str_w        : c[0x0][0x1bc]
+    param_Q            : c[0x0][0x1c0]
+    param_PQ           : c[0x0][0x1c4]
+    param_QN           : c[0x0][0x1c8]
+    param_PQN          : c[0x0][0x1cc]
+    param_MPQN         : c[0x0][0x1d0]
+    param_magic_Q      : c[0x0][0x1d4]
+    param_shift_Q      : c[0x0][0x1d8]
+    param_magic_PQ     : c[0x0][0x1dc]
+    param_shift_PQ     : c[0x0][0x1e0]
+    param_CRST8        : c[0x0][0x1e4]
+    param_MPQN8        : c[0x0][0x1e8]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+      64-67 ~ tid, blkE, blkF, blkMPQ
+
+     68-119 ~ k<0|4>, tidX, tid1, m, p, q, crst, n, n32, tf<0|4>, te, te<0|4>, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+      64-79 : j0Ex<0-7>, j0Fy<0-7>
+      80-95 : j1Ex<0-7>, j1Fy<0-7>
+
+     96-103 : load0F<0-3>, load4F<0-3>
+    104-119 : load0E<0-7>, load4E<0-7>
+
+    120-123 : track0F<0-1>, track4F<0-1>
+    124-127 : track0E<0-1>, track4E<0-1>
+
+    128-131 ~ writeEs, writeFs, swapBuf, K
+    132-136 ~ readEs, readFs, mt, pr, qs
+
+     68-71  ~ lutStore, sliceI
+     72-132 ~ warp_cnt, rst, rs, t, r, s, x, y, z, x0, xW, y0, yH, z0, zD
+
+     72-89  : c<0-7>, trackI<0-1>, track00I<0-1>, track04I<0-1>, track08I<0-1>, track12I<0-1>
+     90-132 ~ crst<00|04|08|12>, c<00|04|08|12>, lut<00|04|08|12>, chan<00|04|08|12>, img<00|04|08|12>, writeCs, readCs, RST, DHWN1, alpha, nn, tid31
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,    SR_TID.X; // signals barrier 1 (wait mask 01)
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X; // signals barrier 2 (wait mask 02)
+--:-:3:-:1      S2R blkF,   SR_CTAID.Y; // signals barrier 3 (wait mask 04)
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z; // signals barrier 4 (wait mask 08)
+
+<SCHEDULE_BLOCK>
+// tidX  = (tid & 7) << 2
+// k0    = tid >> 3,  k4 = k0 + 4
+01:-:-:-:1      LOP.AND tidX, tid,  7;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 k0,   tid,  3;
+--:-:-:-:1      IADD    k4,   k0,   4;
+
+--:-:-:-:1      MOV K, param_K;
+// Write 16 zero bytes to shared address 0, then load them into all 64 czero registers
+--:-:-:-:1      STS.128 [RZ], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [RZ];\n", $_ * 4), 0..15;
+</CODE>
+
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_PQ;
+
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT;
+// NOTE(review): the first XMAD below waits on mask 08 (barrier 4) although blkMPQ was loaded under barrier 2 - confirm
+// m = blkMPQ / PQ  (magic multiply then shift when magicPQ != 1, plain shift otherwise)
+08:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ % PQ  (computed as blkMPQ - m*PQ)
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = pq / Q  (same magic-number scheme, selected by P2)
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = pq % Q  (computed as pq - p*Q)
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+
+// mt = m * str_d - pad_d
+// pr = p * str_h - pad_h
+// qs = q * str_w - pad_w
+--:-:-:-:1      XMAD mt, m,   param_str_d, RZ;
+--:-:-:-:1      XMAD pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD qs, q,   param_str_w, RZ;
+--:-:-:-:1      IADD mt, mt, -param_pad_d;
+--:-:-:-:1      IADD pr, pr, -param_pad_h;
+--:-:-:-:1      IADD qs, qs, -param_pad_w;
+
+// crst = blkF*32 + tidX
+// n    = blkE*64 + tidX,  n32 = n + 32
+04:-:-:-:1      ISCADD crst, blkF, tidX, 5;
+08:-:-:-:1      ISCADD n,    blkE, tidX, 6;
+--:-:-:-:1      IADD   n32,  n,    32;
+
+// trackF = param_F + (k*CRST + crst) * 4, for k0 and k4
+--:-:-:-:1      XMAD     tf0, k0, param_CRST, crst;
+--:-:-:-:1      XMAD     tf4, k4, param_CRST, crst;
+--:-:-:-:1      LEA      track0F0.CC, tf0, param_F[0],     2;
+--:-:-:-:1      LEA.HI.X track0F1,    tf0, param_F[1], RZ, 2;
+--:-:-:-:1      LEA      track4F0.CC, tf4, param_F[0],     2;
+--:-:-:-:1      LEA.HI.X track4F1,    tf4, param_F[1], RZ, 2;
+
+// trackE = param_E + (k*MPQN + m*PQN + p*QN + q*N + n) * 4, for k0 and k4
+--:-:-:-:1      XMAD      te,  q,  param_N,    n;
+--:-:-:-:1      XMAD.LO2C te,  p,  param_QN,   te;
+--:-:-:-:1      XMAD.LO2C te,  m,  param_PQN,  te;
+--:-:-:-:1      XMAD.LO2C te0, k0, param_MPQN, te;
+--:-:-:-:1      XMAD.LO2C te4, k4, param_MPQN, te;
+--:-:-:-:1      LEA       track0E0.CC, te0, param_E[0],     2;
+--:-:-:-:1      LEA.HI.X  track0E1,    te0, param_E[1], RZ, 2;
+--:-:-:-:1      LEA       track4E0.CC, te4, param_E[0],     2;
+--:-:-:-:1      LEA.HI.X  track4E1,    te4, param_E[1], RZ, 2;
+
+// P1 = crst < CRST
+// P2 = n    < N
+// P3 = n+32 < N
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst, param_CRST, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, n,    param_N,    PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, n32,  param_N,    PT;
+
+// writeFs = (32*k + tidX) * 4
+--:-:-:-:1      ISCADD  writeFs, k0, tidX, 5;
+--:-:-:-:1      SHL     writeFs, writeFs,  2;
+// writeEs = (64*k + tidX) * 4 + 4x<32*8>
+--:-:-:-:1      ISCADD  writeEs, k0, tidX, 6;
+--:-:-:-:1      ISCADD  writeEs, writeEs, 4x<32*8>, 2;
+
+// readFs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readEs = (((tid >> 1) & 7) << 4) + 4x<32*8>
+--:-:-:-:1      BFE.U32 readEs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<32*8>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<32*8 + 64*8>;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:0      IADD K, K, -8;
+// First slice: eight scalar loads of F guarded by P1, four 128-bit loads of E guarded by P2/P3
+--:-:-:-:1  @P1 LDG.E.CI load0F0, [track0F + 4x<0>];
+--:-:-:-:1  @P1 LDG.E.CI load0F1, [track0F + 4x<1>];
+--:-:-:-:1  @P1 LDG.E.CI load0F2, [track0F + 4x<2>];
+--:-:1:-:1  @P1 LDG.E.CI load0F3, [track0F + 4x<3>];
+
+--:-:-:-:1  @P1 LDG.E.CI load4F0, [track4F + 4x<0>];
+--:-:-:-:1  @P1 LDG.E.CI load4F1, [track4F + 4x<1>];
+--:-:-:-:1  @P1 LDG.E.CI load4F2, [track4F + 4x<2>];
+--:-:2:-:1  @P1 LDG.E.CI load4F3, [track4F + 4x<3>];
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, K, RZ, P1; // P1 = (K > 0) AND previous P1
+
+--:-:3:-:1  @P2 LDG.E.128 load0E0, [track0E + 4x< 0>];
+--:-:4:-:1  @P3 LDG.E.128 load0E4, [track0E + 4x<32>];
+--:-:5:-:1  @P2 LDG.E.128 load4E0, [track4E + 4x< 0>];
+--:-:6:-:1  @P3 LDG.E.128 load4E4, [track4E + 4x<32>];
+
+--:-:-:-:2      ISETP.GT.AND P2, PT, K, RZ, P2; // P2 = (K > 0) AND previous P2
+--:-:-:-:0      ISETP.GT.AND P3, PT, K, RZ, P3; // P3 = (K > 0) AND previous P3
+// Store the first slice to shared memory and advance each track pointer by CRST8 or MPQN8
+01:-:-:-:1      STS.128 [writeFs + 4x<0*32>], load0F;
+--:-:-:-:6      IADD   track0F0.CC, track0F0, param_CRST8;
+--:-:-:-:0      IADD.X track0F1,    track0F1, RZ;
+
+02:-:-:-:1      STS.128 [writeFs + 4x<4*32>], load4F;
+--:-:-:-:6      IADD   track4F0.CC, track4F0, param_CRST8;
+--:-:-:-:0      IADD.X track4F1,    track4F1, RZ;
+
+04:-:-:-:1      STS.128 [writeEs + 4x<0*64 +  0>], load0E0;
+08:-:-:-:1      STS.128 [writeEs + 4x<0*64 + 32>], load0E4;
+--:-:-:-:6      IADD   track0E0.CC, track0E0, param_MPQN8;
+--:-:-:-:0      IADD.X track0E1,    track0E1, RZ;
+
+10:-:-:-:1      STS.128 [writeEs + 4x<4*64 +  0>], load4E0;
+20:1:-:-:1      STS.128 [writeEs + 4x<4*64 + 32>], load4E4;
+--:-:-:-:6      IADD   track4E0.CC, track4E0, param_MPQN8;
+--:-:-:-:1      IADD.X track4E1,    track4E1, RZ;
+// Point the write addresses at the other shared buffer and negate swapBuf so the next swap goes back
+01:-:-:-:1      IADD writeEs, writeEs, swapBuf;
+--:-:-:-:1      IADD writeFs, writeFs, swapBuf;
+--:-:-:-:2      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD K, K, -8;
+// Prime the j0 operand registers from shared memory, then issue the global loads for the second slice
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*32 + 16>];
+
+--:-:-:-:1  @P1 LDG.E.CI load0F0, [track0F + 4x<0>];
+--:-:-:-:1  @P1 LDG.E.CI load0F1, [track0F + 4x<1>];
+--:-:-:-:1  @P1 LDG.E.CI load0F2, [track0F + 4x<2>];
+--:-:2:-:1  @P1 LDG.E.CI load0F3, [track0F + 4x<3>];
+
+--:-:-:-:1  @P1 LDG.E.CI load4F0, [track4F + 4x<0>];
+--:-:-:-:1  @P1 LDG.E.CI load4F1, [track4F + 4x<1>];
+--:-:-:-:1  @P1 LDG.E.CI load4F2, [track4F + 4x<2>];
+--:-:3:-:1  @P1 LDG.E.CI load4F3, [track4F + 4x<3>];
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, K, RZ, P1;
+
+--:-:4:-:1  @P2 LDG.E.128 load0E0, [track0E + 4x< 0>];
+--:-:4:-:1  @P3 LDG.E.128 load0E4, [track0E + 4x<32>];
+--:-:5:-:1  @P2 LDG.E.128 load4E0, [track4E + 4x< 0>];
+--:-:5:-:1  @P3 LDG.E.128 load4E4, [track4E + 4x<32>];
+
+--:-:-:-:2      ISETP.GT.AND P2, PT, K, RZ, P2;
+--:-:-:-:1      ISETP.GT.AND P3, PT, K, RZ, P3;
+
+NEXT_8K:
+--:-:-:-:1      ISETP.GT.AND P0, PT, K, -8, PT; // P0 = (K > -8), gates the stores, buffer swap and loop branch emitted below
+<CODE>
+    my %insert = # keys look like j3c12: text emitted right after FFMA 12 of unrolled step 3
+    (
+        j0c8  => "--:-:-:-:1      IADD K, K, -8;\n",
+
+        j0c12 => "02:2:-:-:1  \@P0 STS.128 [writeFs + 4x<0*32>], load0F;\n",
+        j0c14 => "--:-:-:-:1  \@P0 IADD   track0F0.CC, track0F0, param_CRST8;\n",
+        j0c19 => "--:-:-:-:1  \@P0 IADD.X track0F1,    track0F1, RZ;\n",
+
+        j0c56 => "02:-:-:-:1  \@P1 LDG.E.CI load0F0, [track0F + 4x<0>];\n",
+        j0c58 => "--:-:-:-:1  \@P1 LDG.E.CI load0F1, [track0F + 4x<1>];\n",
+        j0c60 => "--:-:-:-:1  \@P1 LDG.E.CI load0F2, [track0F + 4x<2>];\n",
+        j0c62 => "--:-:2:-:1  \@P1 LDG.E.CI load0F3, [track0F + 4x<3>];\n",
+
+        j2c12 => "04:3:-:-:1  \@P0 STS.128 [writeFs + 4x<4*32>], load4F;\n",
+        j2c14 => "--:-:-:-:1  \@P0 IADD   track4F0.CC, track4F0, param_CRST8;\n",
+        j2c19 => "--:-:-:-:1  \@P0 IADD.X track4F1,    track4F1, RZ;\n",
+
+        j2c56 => "04:-:-:-:1  \@P1 LDG.E.CI load4F0, [track4F + 4x<0>];\n",
+        j2c58 => "--:-:-:-:1  \@P1 LDG.E.CI load4F1, [track4F + 4x<1>];\n",
+        j2c60 => "--:-:-:-:1  \@P1 LDG.E.CI load4F2, [track4F + 4x<2>];\n",
+        j2c62 => "--:-:3:-:1  \@P1 LDG.E.CI load4F3, [track4F + 4x<3>];\n",
+
+        j4c12 => "08:-:-:-:1  \@P0 STS.128 [writeEs + 4x<0*64 +  0>], load0E0;\n",
+        j4c14 => "--:4:-:-:1  \@P0 STS.128 [writeEs + 4x<0*64 + 32>], load0E4;\n",
+        j4c16 => "--:-:-:-:1  \@P0 IADD   track0E0.CC, track0E0, param_MPQN8;\n",
+        j4c21 => "--:-:-:-:1  \@P0 IADD.X track0E1,    track0E1, RZ;\n",
+
+        j4c60 => "08:-:-:-:1  \@P2 LDG.E.128 load0E0, [track0E + 4x< 0>];\n",
+        j4c62 => "--:-:4:-:1  \@P3 LDG.E.128 load0E4, [track0E + 4x<32>];\n",
+
+        j6c12 => "10:-:-:-:1  \@P0 STS.128 [writeEs + 4x<4*64 +  0>], load4E0;\n",
+        j6c14 => "--:5:-:-:1  \@P0 STS.128 [writeEs + 4x<4*64 + 32>], load4E4;\n",
+        j6c16 => "--:-:-:-:1  \@P0 IADD   track4E0.CC, track4E0, param_MPQN8;\n",
+        j6c21 => "--:-:-:-:1  \@P0 IADD.X track4E1,    track4E1, RZ;\n",
+
+        j6c60 => "10:-:-:-:1  \@P2 LDG.E.128 load4E0, [track4E + 4x< 0>];\n",
+        j6c62 => "--:-:5:-:1  \@P3 LDG.E.128 load4E4, [track4E + 4x<32>];\n",
+
+        j6c63 => "--:-:-:-:1  \@P0 IADD readEs,  readEs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeEs, writeEs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Load predicates stay chained with their previous value so the crst/n bounds set before the loop survive each pass.
+        j7c8  => "--:-:-:-:1      ISETP.GT.AND P1, PT, K, RZ, P1;\n",
+        j7c10 => "--:-:-:-:1      ISETP.GT.AND P2, PT, K, RZ, P2;\n",
+        j7c12 => "--:-:-:-:1      ISETP.GT.AND P3, PT, K, RZ, P3;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U NEXT_8K;\n",
+    );
+
+    my @cOrder; # 64 [x, y] accumulator coordinates in issue order
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]); # offsets applied around each (x, y) base
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y; # walk the y bases in the opposite direction for the next x pair
+    }
+
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1; # register set (j0 or j1) read by this step
+        my $nOdd     = !$odd + 0; # the other set, filled for the next step
+        my $rsOffset = ($j + 1) % 8; # shared-memory row read for the next step
+        my $rsPred   = $j == 7 ? '@P0' : '   '; # the last step only reloads when the loop continues
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || ''; # text to emit after this FFMA, if any
+            # Stall 0 when a memory, barrier or branch instruction follows this FFMA.
+            my $stall  = $ins =~ /LDS|I2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+            # The first FFMA of each step waits on barrier 1, set by the last of the four LDS above.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:0      MOV warp_cnt, 32; // loop bound counter for LUT_LOOP, advanced by 32 per pass
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkF, SR_CTAID.Y;
+--:-:3:-:1      S2R blkE, SR_CTAID.Z;
+01:-:-:-:6      MOV rst,  tid; // each thread starts at lookup-table entry rst = tid
+
+LUT_LOOP:
+
+<SCHEDULE_BLOCK>
+// warp synchronous loop while warp_cnt < RST (c=0)
+--:-:-:-:1      ISETP.LT.AND P0, PT, warp_cnt, param_RST, PT;
+--:-:-:-:1      IADD warp_cnt, warp_cnt, 32;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// x = qs + s
+// y = pr + r
+// z = mt + t
+--:-:-:-:1      IADD z, mt, t;
+--:-:-:-:1      IADD y, pr, r;
+--:-:-:-:1      IADD x, qs, s;
+// i = (z*HWN + y*WN + x*N) * 4
+20:-:-:-:1      XMAD.LO2C sliceI, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y, param_WN,  sliceI;
+--:-:-:-:1      XMAD      sliceI, x, param_N,   sliceI;
+--:-:-:-:1      SHL       sliceI, sliceI, 2;
+// Bounds check x, y and z, and make i negative if any of them is outside
+--:-:-:-:1      ISET.LT.AND x0, x, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW, x,  param_W, PT;
+--:-:-:-:1      ISET.LT.AND y0, y, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH, y,  param_H, PT;
+--:-:-:-:1      ISET.LT.AND z0, z, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD, z,  param_D, PT;
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, x0, xW, 0xfe;
+<ORDERED>
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, y0, yH, 0xfe;
+--:-:-:-:1      SHL lutStore, rst, 2;
+--:-:-:-:1      IADD rst, rst, 32;
+</ORDERED>
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, z0, zD, 0xfe;
+// Store the image offset i (sliceI) into the shared lookup table
+--:6:-:-:1      STS [lutStore + addr_lut], sliceI;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P0 BRA.U LUT_LOOP;
+
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV RST,       param_RST;
+--:-:-:-:1      MOV DHWN1,     param_DHWN;
+--:-:-:-:1      SHL DHWN1,     DHWN1, 2;
+
+--:-:-:-:1      LOP.AND readEs, readEs, 0x7f;
+--:-:-:-:1      LOP.AND readFs, readFs, 0x3f;
+
+// writeCs = (readFs / 4) * 64 + readEs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readEs, 4;
+
+// readCs = (tid & 31) << 2;
+--:-:-:-:1      LOP.AND tid31,  tid,   31;
+--:-:-:-:1      SHL     readCs, tid31, 2;
+
+// nn = blkE*64 + tid31;
+04:-:-:-:1      ISCADD nn, blkE, tid31, 6;
+
+// crst = blkF*32
+02:-:-:-:1      SHL  crst00, blkF,   5;
+--:-:-:-:1      IADD crst04, crst00, 4;
+--:-:-:-:1      IADD crst08, crst00, 8;
+--:-:-:-:1      IADD crst12, crst00, 12;
+
+--:-:-:-:1      LEA      trackI0.CC, nn, param_I[0],     2;
+--:-:-:-:1      LEA.HI.X trackI1,    nn, param_I[1], RZ, 2;
+
+// n < N
+--:-:-:-:1      ISETP.LT.AND P5, PT, nn, param_N, PT;
+--:-:-:-:1      IADD nn, nn, 32;
+--:-:-:-:1      ISETP.LT.AND P6, PT, nn, param_N, PT;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+
+    # Builds the output epilogue text.  For each accumulator row y (0..7) the
+    # eight cx<0-7>y<y> values are multiplied by alpha into c0..c7 and STORE_C
+    # is called.  Just before row 4 the four crst counters are advanced by 12
+    # (STORE_C itself adds 1 to each counter per call).
+    my $fmulFormat = join '',
+        "01:-:-:-:1      FMUL c0, cx0y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n",
+        "02:-:-:-:1      FMUL c2, cx2y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c3, cx3y%d, alpha;\n",
+        "04:-:-:-:1      FMUL c4, cx4y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c5, cx5y%d, alpha;\n",
+        "08:-:-:-:1      FMUL c6, cx6y%d, alpha;\n",
+        "--:-:-:-:0      FMUL c7, cx7y%d, alpha;\n";
+
+    my @pieces;
+    for my $row (0 .. 7)
+    {
+        if ($row == 4)
+        {
+            # Skip ahead to the second group of rows.
+            push @pieces,
+                "--:-:-:-:1      IADD crst00, crst00, 12;\n",
+                "--:-:-:-:1      IADD crst04, crst04, 12;\n",
+                "--:-:-:-:1      IADD crst08, crst08, 12;\n",
+                "--:-:-:-:1      IADD crst12, crst12, 12;\n";
+        }
+
+        # Every %d in the format receives the current row index.
+        push @pieces, sprintf($fmulFormat, ($row) x 8);
+        push @pieces, "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return join '', @pieces;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+// Shuffle the results through shared memory to drop the awkward readEs/readFs mapping
+--:-:-:-:1      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], c4;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS c1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<1*64 + 00>];
+--:-:-:-:1      LDS c3, [readCs + 4x<1*64 + 32>];
+--:-:-:-:1      LDS c4, [readCs + 4x<2*64 + 00>];
+--:-:-:-:1      LDS c5, [readCs + 4x<2*64 + 32>];
+--:-:-:-:1      LDS c6, [readCs + 4x<3*64 + 00>];
+--:-:-:-:1      LDS c7, [readCs + 4x<3*64 + 32>];
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5;
+
+--:-:-:-:1      XMAD.LO2C c00, crst00, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c04, crst04, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c08, crst08, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c12, crst12, param_magic_RST, RZ;
+
+--:-:-:-:1      SHR.U32 c00, c00, param_shift_RST;
+--:-:-:-:1      SHR.U32 c04, c04, param_shift_RST;
+--:-:-:-:1      SHR.U32 c08, c08, param_shift_RST;
+--:-:-:-:1      SHR.U32 c12, c12, param_shift_RST;
+
+--:-:-:-:1      VMAD.U16.U16 lut00, -c00, RST, crst00;
+--:-:-:-:1      VMAD.U16.U16 lut04, -c04, RST, crst04;
+--:-:-:-:1      VMAD.U16.U16 lut08, -c08, RST, crst08;
+--:-:-:-:1      VMAD.U16.U16 lut12, -c12, RST, crst12;
+
+--:-:-:-:1      SHL lut00, lut00, 2;
+--:-:-:-:1      SHL lut04, lut04, 2;
+--:-:-:-:1      SHL lut08, lut08, 2;
+--:-:-:-:1      SHL lut12, lut12, 2;
+
+--:-:-:-:1      XMAD.LO2 chan00, DHWN1, c00, RZ;
+--:-:-:-:1      XMAD.LO2 chan04, DHWN1, c04, RZ;
+--:-:-:-:1      XMAD.LO2 chan08, DHWN1, c08, RZ;
+--:-:-:-:1      XMAD.LO2 chan12, DHWN1, c12, RZ;
+
+--:-:-:-:1      IADD crst00, crst00, 1;
+--:-:-:-:1      IADD crst04, crst04, 1;
+--:-:-:-:1      IADD crst08, crst08, 1;
+--:-:-:-:1      IADD crst12, crst12, 1;
+
+--:-:1:-:1  @P0 LDS img00, [lut00 + addr_lut];
+--:-:2:-:1  @P1 LDS img04, [lut04 + addr_lut];
+--:-:3:-:1  @P2 LDS img08, [lut08 + addr_lut];
+--:-:4:-:1  @P3 LDS img12, [lut12 + addr_lut];
+
+</SCHEDULE_BLOCK>
+
+01:-:-:-:1      IADD3  track00I0.CC, trackI0, img00, chan00;
+--:-:-:-:5      ISETP.GE.AND P0, PT, img00, RZ, P0;
+--:-:-:-:1      IADD.X track00I1,    trackI1, RZ;
+
+02:-:-:-:1      IADD3  track04I0.CC, trackI0, img04, chan04;
+--:-:-:-:5      ISETP.GE.AND P1, PT, img04, RZ, P1;
+--:-:-:-:1      IADD.X track04I1,    trackI1, RZ;
+
+04:-:-:-:1      IADD3  track08I0.CC, trackI0, img08, chan08;
+--:-:-:-:5      ISETP.GE.AND P2, PT, img08, RZ, P2;
+--:-:-:-:1      IADD.X track08I1,    trackI1, RZ;
+
+08:-:-:-:1      IADD3  track12I0.CC, trackI0, img12, chan12;
+--:-:-:-:5      ISETP.GE.AND P3, PT, img12, RZ, P3;
+--:-:-:-:0      IADD.X track12I1,    trackI1, RZ;
+
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00I], c0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04I], c2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08I], c4;
+--:-:-:-:3      PSETP.AND.AND P2, PT, P2, P6, PT;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12I], c6;
+--:-:-:-:5      PSETP.AND.AND P3, PT, P3, P6, PT;
+
+--:1:-:-:2  @P0 RED.E.ADD.F32.FTZ.RN [track00I + 4x<32>], c1;
+--:2:-:-:2  @P1 RED.E.ADD.F32.FTZ.RN [track04I + 4x<32>], c3;
+--:3:-:-:4  @P2 RED.E.ADD.F32.FTZ.RN [track08I + 4x<32>], c5;
+--:4:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12I + 4x<32>], c7;
+
+--:-:-:-:5      RET;
+
diff --git a/Kernel/Convolution/Maxwell/sconv_updat_C128_K128.sass b/Kernel/Convolution/Maxwell/sconv_updat_C128_K128.sass
new file mode 100644
index 0000000..dfb6bea
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/sconv_updat_C128_K128.sass
@@ -0,0 +1,718 @@
+# Kernel: sconv_updat_C128_K128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*4 + 0>
+    addr_m    : 4x<(128*16 + 32)*4 + 4>
+    addr_q    : 4x<(128*16 + 32)*4 + 5>
+    szBuf     : (128*16 + 32)
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_offset_K     : c[0x0][0x15c]
+    param_N            : c[0x0][0x160]
+    param_K            : c[0x0][0x164]
+    param_D            : c[0x0][0x168]
+    param_H            : c[0x0][0x16c]
+    param_W            : c[0x0][0x170]
+    param_WN           : c[0x0][0x174]
+    param_HWN          : c[0x0][0x178]
+    param_DHWN         : c[0x0][0x17c]
+    param_C            : c[0x0][0x180]
+    param_CRST         : c[0x0][0x184]
+    param_RST          : c[0x0][0x188]
+    param_magic_RST    : c[0x0][0x18c]
+    param_shift_RST    : c[0x0][0x190]
+    param_RS           : c[0x0][0x194]
+    param_magic_RS     : c[0x0][0x198]
+    param_shift_RS     : c[0x0][0x19c]
+    param_S            : c[0x0][0x1a0]
+    param_magic_S      : c[0x0][0x1a4]
+    param_shift_S      : c[0x0][0x1a8]
+    param_pad_d        : c[0x0][0x1ac]
+    param_pad_h        : c[0x0][0x1b0]
+    param_pad_w        : c[0x0][0x1b4]
+    param_str_d        : c[0x0][0x1b8]
+    param_str_h        : c[0x0][0x1bc]
+    param_str_w        : c[0x0][0x1c0]
+    param_dil_d        : c[0x0][0x1c4]
+    param_dil_h        : c[0x0][0x1c8]
+    param_dil_w        : c[0x0][0x1cc]
+    param_P            : c[0x0][0x1d0]
+    param_Q            : c[0x0][0x1d4]
+    param_PQ           : c[0x0][0x1d8]
+    param_QN           : c[0x0][0x1dc]
+    param_PQN          : c[0x0][0x1e0]
+    param_MPQN         : c[0x0][0x1e4]
+    param_magic_Q      : c[0x0][0x1e8]
+    param_shift_Q      : c[0x0][0x1ec]
+    param_magic_PQ     : c[0x0][0x1f0]
+    param_shift_PQ     : c[0x0][0x1f4]
+    param_grid_P       : c[0x0][0x1f8]
+    param_grid_Q       : c[0x0][0x1fc]
+    param_grid_PQ      : c[0x0][0x200]
+    param_CRSTK        : c[0x0][0x204]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-67   ~ tid, blkI, blkE, one
+    68-111  ~ tidX, tidY, tid1, tid7, tid128, shiftX, blkMPQ, m, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+    64-95   ~ tidYY, mm, mt, pr, y, z, y0, yH, z0, zD, bounds_yz, c, r, t, rs, rst
+    64-95   ~ qs, x, x0, xW, bounds_x, ti, te, Q
+
+    64-79   : j0Ex<0-7>, j0Iy<0-7>
+    80-95   : j1Ex<0-7>, j1Iy<0-7>
+
+    96-111  : loadI<0-7>,  loadE<0-7>
+    112-115 : trackI<0-1>, trackE<0-1>
+
+    116-124 ~ writeS, loopN, e, i, p, q, k, crst, s
+    125-127 ~ swapBuf, readIs, readEs
+
+     68-83  : c<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
+    84-124  ~ writeCs, readCs, K1, K60, crst<00|04|08|12>, alpha, K, K4, tid31, tid96, kk, tf, t128, blk_MPQ, CRSTK, xmad_determ
+
+</REGISTER_MAPPING>
+
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
+--:-:-:-:5  @P0 BRA.U CTAID1;
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X;
+--:-:3:-:1      S2R blkI,   SR_CTAID.Y;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
+--:-:-:-:5      BRA.U END_CTAID1;
+CTAID1:
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.Z;
+--:-:3:-:1      S2R blkI,   SR_CTAID.X;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Y;
+END_CTAID1:
+
+<SCHEDULE_BLOCK>
+// tidX   = tid >> 1
+// tidY   = (tid & 1) << 2
+// shiftX = (tid & 1) << 4
+01:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      SHR.U32 tidX,   tid,  1;
+--:-:-:-:1      SHL     tidY,   tid1, 2;
+--:-:-:-:1      SHL     shiftX, tid1, 4;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    # Emit 16 LDS.U.128 loads from addr_zero, one per group of four czero
+    # registers: czero00, czero04, ..., czero60.
+    my $loads = '';
+    for my $quad (0 .. 15)
+    {
+        $loads .= sprintf "--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $quad * 4;
+    }
+    return $loads;
+</CODE>
+
+--:-:-:-:1      PSETP.AND.AND P0, PT, PT, PT, PT;
+
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_grid_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_grid_PQ;
+
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT;
+
+// m = blkMPQ / PQ
+02:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ % PQ
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = blockPQ / Q
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = blockPQ % Q
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+
+// We need to be able to restore m and q at each P iteration
+// Register spill to shared
+--:1:-:-:1      STS [addr_m], m;
+--:-:-:-:1      STS [addr_q], q;
+
+// writeS = (tidY*128 + tidX + shiftX) * 4 + 4x<szBuf * 2>
+--:-:-:-:1      ISCADD writeS, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeS, writeS, shiftX;
+--:-:-:-:1      ISCADD writeS, writeS, 4x<szBuf * 2>, 2;
+
+// readIs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readIs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      LOP.OR  readIs, readIs, tid1;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// readEs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4x<szBuf>;
+--:-:-:-:1      LOP.AND tid128, tid,    128;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readEs, tid128, 4;
+--:-:-:-:1      LOP.OR  readEs, readEs, tid7;
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<szBuf>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szBuf * 2>;
+
+// crst = blkI*128 + tidX
+04:-:-:-:1      ISCADD crst, blkI, tidX, 7;
+
+// k = blkE*128 + tidX + offset_K
+08:-:-:-:1      ISCADD k, blkE, tidX, 7;
+--:-:-:-:1      IADD   k, k, param_offset_K;
+
+--:-:-:-:1      MOV loopN, param_N;
+
+</SCHEDULE_BLOCK>
+
+NEXT_P:
+
+01:-:4:-:1      S2R tidYY, SR_TID.X;
+--:-:5:-:1      LDS mm, [addr_m];
+
+<SCHEDULE_BLOCK>
+--:-:6:-:1      LDS q, [addr_q];
+
+// c   = crst / RST
+// rst = crst % RST
+--:-:-:-:1      XMAD.LO2C c, crst, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32   c, c, param_shift_RST;
+--:-:-:-:1      XMAD rst, c, param_RST, RZ;
+--:-:-:-:1      IADD rst, -rst, crst;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// y = p * u - pad_h + (r * dil_h)
+// z = m * w - pad_d + (t * dil_d)
+--:-:-:-:1      XMAD  pr, p,   param_str_h, RZ;
+10:-:-:-:1      XMAD  mt, mm,  param_str_d, RZ;
+--:-:-:-:1      XMAD  y,  r,   param_dil_h, pr;
+--:-:-:-:1      XMAD  z,  t,   param_dil_d, mt;
+--:-:-:-:1      IADD  y,  y,  -param_pad_h;
+--:-:-:-:1      IADD  z,  z,  -param_pad_d;
+// e = k*MPQN + m*PQN + p*QN + tidYY
+08:-:-:-:1      LOP.AND tidYY, tidYY, 1;
+--:-:-:-:1      SHL     tidYY, tidYY, 2;
+--:-:-:-:1      XMAD.LO2C e, p,  param_QN,   tidYY;
+--:-:-:-:1      XMAD.LO2C e, mm, param_PQN,  e;
+--:-:-:-:1      XMAD.LO2C e, k,  param_MPQN, e;
+// i = c*DHWN + z*HWN + y*WN + tidYY
+--:-:-:-:1      XMAD.LO2C i, y, param_WN,   tidYY;
+--:-:-:-:1      XMAD.LO2C i, z, param_HWN,  i;
+--:-:-:-:1      XMAD.LO2C i, c, param_DHWN, i;
+// bounds_yz = y < 0 || y >= H || z < 0 || z >= D ? -1 : 0
+--:-:-:-:1      ISET.LT.AND y0, y,  RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH, y,  param_H, PT;
+--:-:-:-:1      ISET.LT.AND z0, z,  RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD, z,  param_D, PT;
+--:-:-:-:1      LOP.OR   bounds_yz, y0, yH;
+--:-:-:-:1      LOP3.LUT bounds_yz, bounds_yz, z0, zD, 0xfe;
+// doLoadCRST = crst < CRST && bounds_yz == 0
+--:-:-:-:1      ISETP.LT.AND P4, PT, crst, param_CRST, PT;
+--:-:-:-:1      ISETP.EQ.AND P4, PT, bounds_yz, RZ, P4;
+// p += grid_P
+--:-:-:-:1      IADD p, p, param_grid_P;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, p, param_P, PT;
+</SCHEDULE_BLOCK>
+
+NEXT_Q:
+
+<SCHEDULE_BLOCK>
+// Zigzag q but only if grid_P < P
+--:-:-:-:1      LOP.AND.NZ P1, RZ, p, 1;
+--:-:-:-:1      MOV Q, param_grid_P;
+--:-:-:-:1      ISETP.LT.AND P1, PT, Q, param_P, P1;
+--:-:-:-:1      MOV Q, -1;
+20:-:-:-:1  @P1 IADD3 Q, -q, param_Q, Q;
+--:-:-:-:1 @!P1 MOV Q, q;
+// k < K
+--:-:-:-:1      ISETP.LT.AND P3, PT, k, param_K, PT;
+// qs = Q * str_w
+// x  = qs + (s * dil_w) - pad_w
+--:-:-:-:1      XMAD  qs, Q,  param_str_w, RZ;
+--:-:-:-:1      XMAD  x,  s,  param_dil_w, qs;
+--:-:-:-:1      IADD  x,  x, -param_pad_w;
+// bounds_x = x < 0 || x >= W ? -1 : 0
+--:-:-:-:1      ISET.LT.AND x0, x, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW, x,  param_W, PT;
+--:-:-:-:1      LOP.OR bounds_x, x0, xW;
+// doLoad = crst < CRST && bounds_yz == 0 && bounds_x == 0
+--:-:-:-:1      ISETP.EQ.AND P2, PT, bounds_x, RZ, P4;
+// trackI = I + i + x*N
+--:-:-:-:1      XMAD ti, x, param_N, i;
+--:-:-:-:1      LEA      trackI0.CC, ti, param_I[0],     2;
+--:-:-:-:1      LEA.HI.X trackI1,    ti, param_I[1], RZ, 2;
+// trackE = E + e + Q*N
+--:-:-:-:1      XMAD te, Q, param_N, e;
+--:-:-:-:1      LEA      trackE0.CC, te, param_E[0],     2;
+--:-:-:-:0      LEA.HI.X trackE1,    te, param_E[1], RZ, 2;
+// q += grid_Q
+--:-:-:-:1      IADD q, q, param_grid_Q;
+--:-:-:-:1      ISETP.LT.AND P5, PT, q, param_Q, PT;
+
+--:-:-:-:1 @!P0 IADD loopN, loopN, param_N;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:6 @!P0 BRA.U NEXT_PQ;
+
+--:-:-:-:0      PSETP.AND.AND P0, PT, PT, PT, !PT;
+
+--:-:1:-:1  @P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>];
+--:-:2:-:1  @P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>];
+--:-:-:-:1 @!P2 LDS.U.128 loadI0, [addr_zero];
+--:-:5:-:1 @!P2 LDS.U.128 loadI4, [addr_zero];
+
+--:-:-:-:0      ISETP.LE.AND P1, PT, loopN, 32, PT;
+
+--:-:3:-:1  @P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>];
+--:-:4:-:1  @P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>];
+--:-:-:-:1 @!P3 LDS.U.128 loadE0, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 loadE4, [addr_zero];
+
+11:-:-:-:1      STS [writeS + 4x< 0*128>], loadI0;
+--:-:-:-:1      STS [writeS + 4x< 1*128>], loadI1;
+--:-:-:-:1      STS [writeS + 4x< 2*128>], loadI2;
+--:-:-:-:1      STS [writeS + 4x< 3*128>], loadI3;
+
+02:-:-:-:1      STS [writeS + 4x< 8*128 + 16>], loadI4;
+--:-:-:-:1      STS [writeS + 4x< 9*128 + 16>], loadI5;
+--:-:-:-:1      STS [writeS + 4x<10*128 + 16>], loadI6;
+--:-:-:-:1      STS [writeS + 4x<11*128 + 16>], loadI7;
+
+--:-:-:-:1      IADD   trackI0.CC, trackI0, 4x<16>;
+--:-:-:-:0      PSETP.AND.AND P5, PT, P1, P5, PT;
+
+24:-:-:-:1      STS [writeS + 4x< 0*128 + szBuf>], loadE0;
+--:-:-:-:1      STS [writeS + 4x< 1*128 + szBuf>], loadE1;
+--:-:-:-:1      STS [writeS + 4x< 2*128 + szBuf>], loadE2;
+--:-:-:-:1      STS [writeS + 4x< 3*128 + szBuf>], loadE3;
+
+--:-:-:-:0      PSETP.AND.AND P6, PT, P1, P6, PT;
+
+08:-:-:-:1      STS [writeS + 4x< 8*128 + szBuf + 16>], loadE4;
+--:-:-:-:1      STS [writeS + 4x< 9*128 + szBuf + 16>], loadE5;
+--:-:-:-:1      STS [writeS + 4x<10*128 + szBuf + 16>], loadE6;
+--:1:-:-:1      STS [writeS + 4x<11*128 + szBuf + 16>], loadE7;
+
+--:-:-:-:1      IADD.X trackI1, trackI1, RZ;
+
+--:-:-:-:1      IADD   trackE0.CC, trackE0, 4x<16>;
+
+--:-:-:-:1      IADD readEs,  readEs, -swapBuf;
+--:-:-:-:0      IADD readIs,  readIs, -swapBuf;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeS, writeS, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD.X trackE1, trackE1, RZ;
+
+--:-:2:-:1  @P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>];
+--:5:2:-:1  @P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>];
+--:-:3:-:1  @P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>];
+--:6:3:-:1  @P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>];
+
+10:-:-:-:6  @P2 IADD   trackI0.CC, trackI0, 4x<16>;
+--:-:-:-:1  @P2 IADD.X trackI1, trackI1, RZ;
+20:-:-:-:6  @P3 IADD   trackE0.CC, trackE0, 4x<16>;
+--:-:-:-:0  @P3 IADD.X trackE1, trackE1, RZ;
+
+--:-:-:Y:5  @P5 BRA.U NEXT_Q;
+--:-:-:Y:5  @P6 BRA.U NEXT_P;
+
+--:-:-:-:2      ISETP.LT.AND P5, PT, q, param_Q, PT;
+--:-:-:-:0      ISETP.LT.AND P6, PT, p, param_P, PT;
+
+NEXT_PQ:
+
+--:-:1:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*128 + 64>];
+--:-:1:-:2      LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>];
+
+
+// P0 loop N
+// P2 bounds I
+// P3 bounds E
+// P4 bounds yz
+// P5 loop Q
+// P6 loop P
+
+//loop = N >= 16 && (N >= 32 || (!p5 && !p6))
+
+NEXT_16N:
+
+<CODE>
+    # Generates the NEXT_16N loop body: 16 steps (j) of 64 FFMAs (c), with the %insert text interleaved after selected FFMAs.
+    my %insert =  # key "j<J>c<C>" => instruction text emitted right after FFMA number C of step J
+    (
+        j0c8   => "--:-:-:-:1      IADD loopN, loopN, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, loopN, 16, PT;\n",
+
+        j4c8   => "02:-:-:-:1  \@P0 STS [writeS + 4x< 0*128>], loadI0;\n",
+        j4c10  => "--:-:-:-:1  \@P0 STS [writeS + 4x< 1*128>], loadI1;\n",
+        j4c12  => "--:-:-:-:1  \@P0 STS [writeS + 4x< 2*128>], loadI2;\n",
+        j4c14  => "--:-:-:-:1  \@P0 STS [writeS + 4x< 3*128>], loadI3;\n",
+
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeS + 4x< 8*128 + 16>], loadI4;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeS + 4x< 9*128 + 16>], loadI5;\n",
+        j5c12  => "--:-:-:-:1  \@P0 STS [writeS + 4x<10*128 + 16>], loadI6;\n",
+        j5c14  => "--:2:-:-:1  \@P0 STS [writeS + 4x<11*128 + 16>], loadI7;\n",
+
+        j5c16  => "--:-:-:-:1      ISETP.GE.AND P2, PT, loopN, 32, P2;\n",
+
+        j5c60  => "02:-:2:-:1  \@P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>];\n",
+        j5c62  => "--:4:2:-:1  \@P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>];\n",
+
+        j6c16  => "--:-:-:-:1 \@!P2 LDS.U.128 loadI0, [addr_zero];\n",
+        j7c16  => "--:-:-:-:1 \@!P2 LDS.U.128 loadI4, [addr_zero];\n",
+
+        j10c57 => "08:-:-:-:1  \@P2 IADD   trackI0.CC, trackI0, 4x<16>;\n",
+        j10c62 => "--:-:-:-:1  \@P2 IADD.X trackI1,    trackI1, RZ;\n",
+
+        j12c8  => "04:-:-:-:1  \@P0 STS [writeS + 4x< 0*128 + szBuf>], loadE0;\n",
+        j12c10 => "--:-:-:-:1  \@P0 STS [writeS + 4x< 1*128 + szBuf>], loadE1;\n",
+        j12c12 => "--:-:-:-:1  \@P0 STS [writeS + 4x< 2*128 + szBuf>], loadE2;\n",
+        j12c14 => "--:-:-:-:1  \@P0 STS [writeS + 4x< 3*128 + szBuf>], loadE3;\n",
+
+        j13c8  => "--:-:-:-:1  \@P0 STS [writeS + 4x< 8*128 + szBuf + 16>], loadE4;\n",
+        j13c10 => "--:-:-:-:1  \@P0 STS [writeS + 4x< 9*128 + szBuf + 16>], loadE5;\n",
+        j13c12 => "--:-:-:-:1  \@P0 STS [writeS + 4x<10*128 + szBuf + 16>], loadE6;\n",
+        j13c14 => "--:3:-:-:1  \@P0 STS [writeS + 4x<11*128 + szBuf + 16>], loadE7;\n",
+
+        j13c16 => "--:-:-:-:1      ISETP.GE.AND P3, PT, loopN, 32, P3;\n",
+
+        j13c60 => "04:-:3:-:1  \@P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>];\n",
+        j13c62 => "--:4:3:-:1  \@P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>];\n",
+
+        j14c16 => "--:-:-:-:1 \@!P3 LDS.U.128 loadE0, [addr_zero];\n",
+        j15c16 => "--:-:-:-:1 \@!P3 LDS.U.128 loadE4, [addr_zero];\n",
+
+        j15c57 => "08:-:-:-:1  \@P3 IADD   trackE0.CC, trackE0, 4x<16>;\n",
+        j15c62 => "--:-:-:-:1  \@P3 IADD.X trackE1,    trackE1, RZ;\n",
+
+        j14c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "20:-:-:-:1  \@P0 IADD readEs, readEs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readIs, readIs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeS, writeS,  swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,    -swapBuf;\n",
+
+        j15c24 => "--:-:-:-:1      ISETP.GT.AND P1, PT, loopN, 32, PT;\n",
+        j15c37 => "--:-:-:-:1      PSETP.AND.OR P1, PT, !P5, !P6, P1;\n",
+        j15c50 => "--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, PT;\n",
+
+        j15c63 => "--:-:-:Y:5  \@P0 BRA.U NEXT_16N;\n" .
+                  "01:-:-:Y:5  \@P5 BRA.U NEXT_Q;\n" .
+                  "--:-:-:Y:5  \@P6 BRA.U NEXT_P;\n",
+    );
+
+    my @cOrder;  # the 64 [x, y] accumulator coordinates, in the order the FFMAs are emitted
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);  # [dx, dy] offsets visited around each base (x, y)
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;  # walk the y bases in the opposite direction for the next x pair
+    }
+
+    my $out;
+    foreach my $j (0 .. 15)
+    {
+        my $odd      = $j & 1;  # register set (j0* or j1*) read by this step's FFMAs
+        my $nOdd     = 1 - $odd;  # the other register set, written by this step's LDS loads
+        my $rsOffset = ($j + 1) & 15;  # row index used in the LDS addresses; wraps to 0 at j == 15
+        my $rsPred   = $j == 15 ? '@P0' : '   ';  # only the wrap-around loads of the last step are predicated on P0
+        my $shift    = $rsOffset < 4 ? 0 : $rsOffset < 12 ? 1 : 2;  # multiplier of the "+ N*16" address term: rows 0-3 => 0, 4-11 => 1, 12-15 => 2
+        my $barrier  = $j == 14 ? '6' : '-';  # second control field of the last LDS; '6' only in step 14
+
+        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c6"} = sprintf "--:%s:1:-:1  %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';  # text to emit after this FFMA, or nothing
+
+            my $stall  = $ins =~ /LDS|I2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;  # stall 0 when the inserted text contains one of these opcodes, else 1
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';  # 'Y' only on FFMA 32, and only when stall is 1
+
+            my $wait   = $c == 0 ? '01' : '--';  # the first FFMA of each step carries wait mask 01
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
+--:-:-:-:5  @P0 BRA.U CTAID2;
+--:-:2:-:1      S2R blkI,    SR_CTAID.Y;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Z;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.X;
+--:-:-:-:5      BRA.U END_CTAID2;
+CTAID2:
+--:-:2:-:1      S2R blkI,    SR_CTAID.X;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Y;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.Z;
+END_CTAID2:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readEs,  readEs, -4x<szBuf>;
+--:-:-:-:1  @P0 IADD readIs,  readIs, -swapBuf;
+--:-:-:-:1  @P0 IADD readEs,  readEs, -swapBuf;
+
+// writeCs = (readIs / 4) * 128 + readEs;
+--:-:-:-:1      ISCADD  writeCs, readIs, readEs, 5;
+
+--:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      LOP.AND tid96,  tid,  96;
+01:-:-:-:1      LOP.AND t128,   tid, 128;
+
+// kk = tid31 | (t128 >> 2);
+--:-:-:-:1      SHR.U32  kk, t128, 2;
+--:-:-:-:1      LOP.OR   kk, tid31,  kk;
+
+// readCs = ((tid96 << 4) | kk) << 2;
+--:-:-:-:1      SHL      readCs, tid96,  4;
+--:-:-:-:1      LOP.OR   readCs, readCs, kk;
+--:-:-:-:1      SHL      readCs, readCs, 2;
+
+// kk += blkE*128;
+04:-:-:-:1      ISCADD kk, blkE, kk, 7;
+--:-:-:-:1      IADD   kk, kk, param_offset_K;
+
+// crst = blkI*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 crst00, tid96, 1;
+02:-:-:-:1      ISCADD  crst00, blkI, crst00, 7;
+--:-:-:-:1      IADD    crst04, crst00,  4;
+--:-:-:-:1      IADD    crst08, crst00,  8;
+--:-:-:-:1      IADD    crst12, crst00,  12;
+
+
+--:-:-:-:1      MOV K, param_K;
+--:-:-:-:1      SHL K1, K, 2;
+--:-:-:-:1      SHL K4, K, 4;
+--:-:-:-:1      ISCADD K60, K, -K4, 8;
+
+// trackF += crst*K + k;
+--:-:-:-:1      VMAD.U16.U16 tf, crst00, K, kk;
+[+
+    our $determ;  # package variable set outside this template (not visible here); true selects the extra instructions below
+    if ($determ)
+    {
+        return q{
+--:-:-:-:1      MOV CRSTK, param_CRSTK;
+08:-:-:-:1      XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ;
+        };
+    }
+    return '';  # otherwise nothing is inserted and tf stays crst00*K + kk
++]
+--:-:-:-:1      LEA      track00F0.CC, tf, param_F[0],     0x2;
+--:-:-:-:1      LEA.HI.X track00F1,    tf, param_F[1], RZ, 0x2;
+
+// kk < K
+--:-:-:-:1      ISETP.LT.AND P5, PT, kk, param_K, PT;
+--:-:-:-:1      IADD kk, kk, 64;
+--:-:-:-:1      ISETP.LT.AND P6, PT, kk, param_K, PT;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:6      IADD   track04F0.CC, track00F0, K4;
+--:-:-:-:1      IADD.X track04F1,    track00F1, RZ;
+--:-:-:-:6      IADD   track08F0.CC, track04F0, K4;
+--:-:-:-:1      IADD.X track08F1,    track04F1, RZ;
+--:-:-:-:6      IADD   track12F0.CC, track08F0, K4;
+--:-:-:-:0      IADD.X track12F1,    track08F1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Builds the output epilogue text.  For each accumulator row y (0..7) the
+    # eight cx<0-7>y<y> values are multiplied by alpha into c0..c7 and STORE_C
+    # is called.  Just before row 4 every track<NN>F pointer is advanced by
+    # K60 (with carry into the high word) and every crst counter by 60.
+    my $fmulFormat = join '',
+        "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c3, cx3y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c4, cx4y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c5, cx5y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c6, cx6y%d, alpha;\n",
+        "--:-:-:-:0      FMUL c7, cx7y%d, alpha;\n";
+
+    my @pieces;
+    for my $row (0 .. 7)
+    {
+        if ($row == 4)
+        {
+            # Skip ahead to the second group of rows.
+            push @pieces,
+                "--:-:-:-:5      IADD   track00F0.CC, track00F0, K60;\n",
+                "--:-:-:-:1      IADD   crst00,       crst00,     60;\n",
+                "--:-:-:-:1      IADD.X track00F1,    track00F1,  RZ;\n",
+                "--:-:-:-:5      IADD   track04F0.CC, track04F0, K60;\n",
+                "--:-:-:-:1      IADD   crst04,       crst04,     60;\n",
+                "--:-:-:-:1      IADD.X track04F1,    track04F1,  RZ;\n",
+                "--:-:-:-:5      IADD   track08F0.CC, track08F0, K60;\n",
+                "--:-:-:-:1      IADD   crst08,       crst08,     60;\n",
+                "--:-:-:-:1      IADD.X track08F1,    track08F1,  RZ;\n",
+                "--:-:-:-:5      IADD   track12F0.CC, track12F0, K60;\n",
+                "--:-:-:-:1      IADD   crst12,       crst12,     60;\n",
+                "--:-:-:-:1      IADD.X track12F1,    track12F1,  RZ;\n\n";
+        }
+
+        # Every %d in the format receives the current row index.
+        push @pieces, sprintf($fmulFormat, ($row) x 8);
+        push @pieces, "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return join '', @pieces;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K
+--:-:-:-:1      IADD         crst00, crst00, 1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K
+--:-:-:-:1      IADD         crst04, crst04, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K
+--:-:-:-:1      IADD         crst08, crst08, 1;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K
+--:-:-:-:0      IADD         crst12, crst12, 1;
+
+// Shuffle the results through shared memory to drop the awkward readIs/readEs mapping
+--:-:-:-:1      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], c4;
+
+--:-:1:-:1      LDS c0, [readCs + 4x<0*128 + 00>];
+--:-:2:-:1      LDS c2, [readCs + 4x<1*128 + 00>];
+--:-:3:-:1      LDS c4, [readCs + 4x<2*128 + 00>];
+--:-:4:-:a      LDS c6, [readCs + 4x<3*128 + 00>];
+
+[+
+    our $determ;  # package variable set outside this template (not visible here); picks which store form is emitted for c0/c2/c4/c6
+    if ($determ)
+    {
+        return q{
+01:-:-:-:1  @P0 STG.E.CG [track00F], c0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 STG.E.CG [track04F], c2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 STG.E.CG [track08F], c4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 STG.E.CG [track12F], c6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
+    else  # same sequence, but with RED.E.ADD.F32 in place of the STG stores
+    {
+        return q{
+01:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F], c0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F], c2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F], c4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F], c6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
++]
+
+--:-:1:-:1      LDS c1, [readCs + 4x<0*128 + 64>];
+--:-:2:-:1      LDS c3, [readCs + 4x<1*128 + 64>];
+--:-:3:-:1      LDS c5, [readCs + 4x<2*128 + 64>];
+--:-:4:-:a      LDS c7, [readCs + 4x<3*128 + 64>];
+
+[+
+    our $determ;  # package variable set outside this template (not visible here); picks which store form is emitted for c1/c3/c5/c7
+    if ($determ)
+    {
+        return q{
+01:1:-:-:1  @P0 STG.E.CG [track00F + 4x<64>], c1;
+02:2:-:-:1  @P1 STG.E.CG [track04F + 4x<64>], c3;
+04:3:-:-:1  @P2 STG.E.CG [track08F + 4x<64>], c5;
+08:4:-:-:1  @P3 STG.E.CG [track12F + 4x<64>], c7;
+        };
+    }
+    else  # same sequence, but with RED.E.ADD.F32 in place of the STG stores
+    {
+        return q{
+01:1:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<64>], c1;
+02:2:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<64>], c3;
+04:3:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<64>], c5;
+08:4:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<64>], c7;
+        };
+    }
++]
+
+01:-:-:-:6      IADD   track00F0.CC, track00F0, K1;
+--:-:-:-:1      IADD.X track00F1,    track00F1, RZ;
+02:-:-:-:6      IADD   track04F0.CC, track04F0, K1;
+--:-:-:-:1      IADD.X track04F1,    track04F1, RZ;
+04:-:-:-:6      IADD   track08F0.CC, track08F0, K1;
+--:-:-:-:1      IADD.X track08F1,    track08F1, RZ;
+08:-:-:-:6      IADD   track12F0.CC, track12F0, K1;
+--:-:-:-:0      IADD.X track12F1,    track12F1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/Convolution/Maxwell/sconv_updat_C128_K64.sass b/Kernel/Convolution/Maxwell/sconv_updat_C128_K64.sass
new file mode 100644
index 0000000..26cc64c
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/sconv_updat_C128_K64.sass
@@ -0,0 +1,818 @@
+# Kernel: sconv_updat_C128_K64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*2 + (64*16 + 32)*2>
+    szShareI  : (128*16 + 32)
+    szShareE  : (64*16  + 32)
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_offset_K     : c[0x0][0x15c]
+    param_N            : c[0x0][0x160]
+    param_K            : c[0x0][0x164]
+    param_D            : c[0x0][0x168]
+    param_H            : c[0x0][0x16c]
+    param_W            : c[0x0][0x170]
+    param_WN           : c[0x0][0x174]
+    param_HWN          : c[0x0][0x178]
+    param_DHWN         : c[0x0][0x17c]
+    param_C            : c[0x0][0x180]
+    param_CRST         : c[0x0][0x184]
+    param_RST          : c[0x0][0x188]
+    param_magic_RST    : c[0x0][0x18c]
+    param_shift_RST    : c[0x0][0x190]
+    param_RS           : c[0x0][0x194]
+    param_magic_RS     : c[0x0][0x198]
+    param_shift_RS     : c[0x0][0x19c]
+    param_S            : c[0x0][0x1a0]
+    param_magic_S      : c[0x0][0x1a4]
+    param_shift_S      : c[0x0][0x1a8]
+    param_pad_d        : c[0x0][0x1ac]
+    param_pad_h        : c[0x0][0x1b0]
+    param_pad_w        : c[0x0][0x1b4]
+    param_str_d        : c[0x0][0x1b8]
+    param_str_h        : c[0x0][0x1bc]
+    param_str_w        : c[0x0][0x1c0]
+    param_dil_d        : c[0x0][0x1c4]
+    param_dil_h        : c[0x0][0x1c8]
+    param_dil_w        : c[0x0][0x1cc]
+    param_P            : c[0x0][0x1d0]
+    param_Q            : c[0x0][0x1d4]
+    param_PQ           : c[0x0][0x1d8]
+    param_QN           : c[0x0][0x1dc]
+    param_PQN          : c[0x0][0x1e0]
+    param_MPQN         : c[0x0][0x1e4]
+    param_magic_Q      : c[0x0][0x1e8]
+    param_shift_Q      : c[0x0][0x1ec]
+    param_magic_PQ     : c[0x0][0x1f0]
+    param_shift_PQ     : c[0x0][0x1f4]
+    param_grid_P       : c[0x0][0x1f8]
+    param_grid_Q       : c[0x0][0x1fc]
+    param_grid_PQ      : c[0x0][0x200]
+    param_CRSTK        : c[0x0][0x204]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-67   ~ tid, blkI, blkE, one
+    68-99   ~ blkMPQ, tidX, tid1, shiftX, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+    64-72   ~ c<0-1>, z<0-1>, y<0-1>, x<0-1>, Q
+    73-99   ~ mt, pr, qs, r<0-1>, s<0-1>, t<0-1>, rst<0-1>, rs<0-1>
+    73-99   ~ te, ti<0-1>, xw<0-1>, xW<0-1>, yh<0-1>, yH<0-1>, zd<0-1>, zD<0-1>, cC<0-1>, nextP, nextQ
+
+    64-79   : j0Ex<0-7>, j0Iy<0-7>
+    80-95   : j1Ex<0-7>, j1Iy<0-7>
+
+    100-147 : load0I<00-15>, load1I<00-15>, loadE<00-15>
+    148-153 : track0I<0-1>,  track1I<0-1>,  trackE<0-1>
+
+    154-164 ~ writeIs, writeEs, loopN, m, p, q, qq, k, crst<0-1>, tidY
+    165-167 ~ readIs, readEs, swapBuf
+
+     68-83  : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
+     84-164 ~ K, K4, K1, K60, tid31, tid96, kk, tf, writeCs, readCs, crst<00|04|08|12>, alpha, blk_MPQ, CRSTK, xmad_determ
+
+</REGISTER_MAPPING>
+
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
+--:-:-:-:5  @P0 BRA.U CTAID1;
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X;
+--:-:3:-:1      S2R blkI,   SR_CTAID.Y;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
+--:-:-:-:5      BRA.U END_CTAID1;
+CTAID1:
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.Z;
+--:-:3:-:1      S2R blkI,   SR_CTAID.X;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Y;
+END_CTAID1:
+
+<SCHEDULE_BLOCK>
+// tidX   = tid >> 1
+// tidY   = (tid & 1) << 2
+// shiftX = (tid & 1) << 4
+01:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      SHR.U32 tidX,   tid,  1;
+--:-:-:-:1      SHL     tidY,   tid1, 2;
+--:-:-:-:1      SHL     shiftX, tid1, 4;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;
+</CODE>
+
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_grid_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_grid_PQ;
+
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT;
+
+// m = blkMPQ / PQ
+02:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ % PQ
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = blockPQ / Q
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = blockPQ % Q
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+--:-:-:-:1      MOV qq, q;
+
+// writeIs = (tidY*128 + tidX + shiftX) * 4
+--:-:-:-:1      ISCADD writeIs, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeIs, writeIs, shiftX;
+--:-:-:-:1      ISCADD writeIs, writeIs, 4x<szShareI + szShareE>, 2;
+
+// writeEs = (tidY*64 + tidX + shiftX) * 4
+--:-:-:-:1      ISCADD writeEs, tidY, tidX, 6;
+--:-:-:-:1      IADD   writeEs, writeEs, shiftX;
+--:-:-:-:1      ISCADD writeEs, writeEs, 4x<szShareI*2 + szShareE>, 2;
+
+// readIs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND readIs, tid,   -16;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      LOP.OR  readIs, readIs, tid1;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+// readEs = (((tid >> 1) & 7) << 4) + 4x<szShareI>;
+--:-:-:-:1      BFE.U32 readEs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<szShareI>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareI + szShareE>;
+
+// crst0 = blkI*128 + tidX, crst1 = crst0 + 64
+04:-:-:-:1      ISCADD crst0, blkI, tidX, 7;
+--:-:-:-:1      IADD   crst1, crst0, 64;
+
+// k = blkE*64 + tidX + offset_K
+08:-:-:-:1      ISCADD k, blkE, tidX, 6;
+--:-:-:-:1      IADD   k, k, param_offset_K;
+
+--:-:-:-:1      MOV loopN, RZ;
+
+--:-:-:-:1      PSETP.AND.AND P0, PT, PT, PT, PT;
+</SCHEDULE_BLOCK>
+
+NEXT_PQ:
+
+<SCHEDULE_BLOCK>
+// Zigzag q but only if grid_P < P
+--:-:-:-:1      LOP.AND.NZ P1, RZ, p, 1;
+--:-:-:-:1      MOV Q, param_grid_P;
+--:-:-:-:1      ISETP.LT.AND P1, PT, Q, param_P, P1;
+--:-:-:-:1      MOV Q, -1;
+--:-:-:-:1  @P1 IADD3 Q, -q, param_Q, Q;
+--:-:-:-:1 @!P1 MOV Q, q;
+// c   = crst / RST
+// rst = crst % RST
+--:-:-:-:1      XMAD.LO2C  c0, crst0, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32    c0, c0, param_shift_RST;
+--:-:-:-:1      XMAD rst0, c0, param_RST, RZ;
+--:-:-:-:1      IADD rst0, -rst0, crst0;
+--:-:-:-:1      XMAD.LO2C  c1, crst1, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32    c1, c1, param_shift_RST;
+--:-:-:-:1      XMAD rst1, c1, param_RST, RZ;
+--:-:-:-:1      IADD rst1, -rst1, crst1;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C  t0, rst0, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32    t0, t0, param_shift_RS;
+--:-:-:-:1      XMAD  rs0, t0, param_RS, RZ;
+--:-:-:-:1      IADD  rs0, -rs0, rst0;
+--:-:-:-:1      XMAD.LO2C  t1, rst1, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32    t1, t1, param_shift_RS;
+--:-:-:-:1      XMAD  rs1, t1, param_RS, RZ;
+--:-:-:-:1      IADD  rs1, -rs1, rst1;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C  r0, rs0, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32    r0, r0, param_shift_S;
+--:-:-:-:1      XMAD   s0, r0, param_S, RZ;
+--:-:-:-:1      IADD   s0, -s0, rs0;
+--:-:-:-:1      XMAD.LO2C  r1, rs1, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32    r1, r1, param_shift_S;
+--:-:-:-:1      XMAD   s1, r1, param_S, RZ;
+--:-:-:-:1      IADD   s1, -s1, rs1;
+// z = m * str_d - pad_d + (t * dil_d)
+// y = p * str_h - pad_h + (r * dil_h)
+// x = Q * str_w - pad_w + (s * dil_w)   (Q: zigzag-adjusted q); track 1 then track 0
+--:-:-:-:1      XMAD  mt, m,   param_str_d, RZ;
+--:-:-:-:1      XMAD  pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD  qs, Q,   param_str_w, RZ;
+--:-:-:-:1      XMAD  z1, t1,  param_dil_d, mt;
+--:-:-:-:1      XMAD  y1, r1,  param_dil_h, pr;
+--:-:-:-:1      XMAD  x1, s1,  param_dil_w, qs;
+--:-:-:-:1      XMAD  z0, t0,  param_dil_d, mt;
+--:-:-:-:1      XMAD  y0, r0,  param_dil_h, pr;
+--:-:-:-:1      XMAD  x0, s0,  param_dil_w, qs;
+--:-:-:-:1      IADD  z1, z1, -param_pad_d;
+--:-:-:-:1      IADD  y1, y1, -param_pad_h;
+--:-:-:-:1      IADD  x1, x1, -param_pad_w;
+--:-:-:-:1      IADD  z0, z0, -param_pad_d;
+--:-:-:-:1      IADD  y0, y0, -param_pad_h;
+--:-:-:-:1      IADD  x0, x0, -param_pad_w;
+</SCHEDULE_BLOCK>
+
+// Split blocks to fit inside of 36 registers
+<SCHEDULE_BLOCK>
+// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY
+--:-:-:-:1      XMAD.LO2C ti0, c0, param_DHWN, tidY;
+--:-:-:-:1      XMAD.LO2C ti0, z0, param_HWN,  ti0;
+--:-:-:-:1      XMAD.LO2C ti0, y0, param_WN,   ti0;
+--:-:-:-:1      XMAD      ti0, x0, param_N,    ti0;
+--:-:-:-:1      XMAD.LO2C ti1, c1, param_DHWN, tidY;
+--:-:-:-:1      XMAD.LO2C ti1, z1, param_HWN,  ti1;
+--:-:-:-:1      XMAD.LO2C ti1, y1, param_WN,   ti1;
+--:-:-:-:1      XMAD      ti1, x1, param_N,    ti1;
+--:-:-:-:1      LEA      track0I0.CC, ti0, param_I[0],     2;
+--:-:-:-:1      LEA.HI.X track0I1,    ti0, param_I[1], RZ, 2;
+--:-:-:-:1      LEA      track1I0.CC, ti1, param_I[0],     2;
+--:-:-:-:1      LEA.HI.X track1I1,    ti1, param_I[1], RZ, 2;
+
+// trackE = k*MPQN + m*PQN + p*QN + Q*N + tidY
+--:-:-:-:1      XMAD.LO2C te, k, param_MPQN, tidY;
+--:-:-:-:1      XMAD.LO2C te, m, param_PQN,  te;
+--:-:-:-:1      XMAD.LO2C te, p, param_QN,   te;
+--:-:-:-:1      XMAD      te, Q, param_N,    te;
+--:-:-:-:1      LEA      trackE0.CC, te, param_E[0],     2;
+--:-:-:-:0      LEA.HI.X trackE1,    te, param_E[1], RZ, 2;
+
+// Bounds check x,y,z,c for each I track.
+// If out of bounds, this will set the track address to -1
+--:-:-:-:1      ISET.GE.AND cC0, c0, param_C, PT;
+--:-:-:-:1      ISET.LT.AND zd0, z0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD0, z0, param_D, PT;
+--:-:-:-:1      ISET.LT.AND yh0, y0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH0, y0, param_H, PT;
+--:-:-:-:1      ISET.LT.AND xw0, x0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW0, x0, param_W, PT;
+--:-:-:-:1      LOP.OR   track0I0, track0I0, cC0;
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, zd0, zD0, 0xfe;
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, yh0, yH0, 0xfe;
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, xw0, xW0, 0xfe;
+
+--:-:-:-:1      ISET.GE.AND cC1, c1, param_C, PT;
+--:-:-:-:1      ISET.LT.AND zd1, z1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD1, z1, param_D, PT;
+--:-:-:-:1      ISET.LT.AND yh1, y1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH1, y1, param_H, PT;
+--:-:-:-:1      ISET.LT.AND xw1, x1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW1, x1, param_W, PT;
+--:-:-:-:1      LOP.OR   track1I0, track1I0, cC1;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, zd1, zD1, 0xfe;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, yh1, yH1, 0xfe;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, xw1, xW1, 0xfe;
+
+--:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+
+--:-:-:-:1      ISETP.NE.AND P2, PT, track0I0, -1, PT;
+--:-:-:-:1      ISETP.NE.AND P3, PT, track1I0, -1, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, k, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, nextP, param_P, PT;
+
+--:-:-:-:1      IADD loopN, loopN, param_N;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:5  @P0 BRA.U FIRST_LOAD;
+
+INIT_LOOP:
+
+--:-:1:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*64  + 00>];
+--:-:1:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*64  + 32>];
+--:-:1:-:2      LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>];
+
+NEXT_16N:
+
+<CODE>
+
+    my %insert =
+    (
+        j0c8   => "--:-:-:-:1      IADD loopN, loopN, -16;\n",
+
+        # p0 = (N & 16) == 0
+        # p1 = N >= 32 && p0
+        j0c14  => "--:-:-:-:1      LOP.AND.NZ P0, RZ, loopN, 16;\n",
+        j0c28  => "--:-:-:-:1      ISETP.GE.AND P1, PT, loopN, 32, P0;\n",
+
+
+        j1c8   => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 0*128 +  0 +  0>], load0I08;\n",
+        j1c10  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 1*128 +  0 +  0>], load0I09;\n",
+        j1c12  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 2*128 +  0 +  0>], load0I10;\n",
+        j1c14  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 3*128 +  0 +  0>], load0I11;\n",
+        j1c16  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 8*128 +  0 + 16>], load0I12;\n",
+        j1c18  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 9*128 +  0 + 16>], load0I13;\n",
+        j1c20  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<10*128 +  0 + 16>], load0I14;\n",
+        j1c22  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<11*128 +  0 + 16>], load0I15;\n",
+
+        j2c8   => "02:-:-:-:1 \@!P0 STS [writeIs + 4x< 0*128 +  0 +  0>], load0I00;\n",
+        j2c10  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 1*128 +  0 +  0>], load0I01;\n",
+        j2c12  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 2*128 +  0 +  0>], load0I02;\n",
+        j2c14  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 3*128 +  0 +  0>], load0I03;\n",
+        j2c16  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 8*128 +  0 + 16>], load0I04;\n",
+        j2c18  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 9*128 +  0 + 16>], load0I05;\n",
+        j2c20  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x<10*128 +  0 + 16>], load0I06;\n",
+        j2c22  => "--:2:-:-:1 \@!P0 STS [writeIs + 4x<11*128 +  0 + 16>], load0I07;\n",
+
+        j2c24  => "--:-:-:-:1      ISETP.NE.AND P2, PT, track0I0, -1, P1;\n",
+        j2c26  => "--:-:-:-:1      ISETP.EQ.AND P3, PT, track0I0, -1, P1;\n",
+
+        j3c8   => "02:-:-:-:1  \@P2 LDG.E.CI.128 load0I00, [track0I + 4x< 0>];\n",
+        j3c10  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load0I04, [track0I + 4x< 8>];\n",
+        j3c12  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load0I08, [track0I + 4x<16>];\n",
+        j3c14  => "--:5:2:-:1  \@P2 LDG.E.CI.128 load0I12, [track0I + 4x<24>];\n",
+
+        j4c8   => "--:-:-:-:1  \@P3 LDS.U.128 load0I00, [addr_zero];\n",
+        j4c10  => "--:-:-:-:1  \@P3 LDS.U.128 load0I04, [addr_zero];\n",
+        j5c8   => "--:-:-:-:1  \@P3 LDS.U.128 load0I08, [addr_zero];\n",
+        j5c10  => "--:-:-:-:1  \@P3 LDS.U.128 load0I12, [addr_zero];\n",
+
+        j5c57  => "10:-:-:-:1  \@P2 IADD   track0I0.CC, track0I0, 4x<32>;\n",
+        j5c62  => "--:-:-:-:1  \@P2 IADD.X track0I1,    track0I1, RZ;\n",
+
+        j6c8   => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 0*128 + 64 +  0>], load1I08;\n",
+        j6c10  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 1*128 + 64 +  0>], load1I09;\n",
+        j6c12  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 2*128 + 64 +  0>], load1I10;\n",
+        j6c14  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 3*128 + 64 +  0>], load1I11;\n",
+        j6c16  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I12;\n",
+        j6c18  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 9*128 + 64 + 16>], load1I13;\n",
+        j6c20  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<10*128 + 64 + 16>], load1I14;\n",
+        j6c22  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<11*128 + 64 + 16>], load1I15;\n",
+
+        j7c8   => "04:-:-:-:1 \@!P0 STS [writeIs + 4x< 0*128 + 64 +  0>], load1I00;\n",
+        j7c10  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 1*128 + 64 +  0>], load1I01;\n",
+        j7c12  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 2*128 + 64 +  0>], load1I02;\n",
+        j7c14  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 3*128 + 64 +  0>], load1I03;\n",
+        j7c16  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I04;\n",
+        j7c18  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 9*128 + 64 + 16>], load1I05;\n",
+        j7c20  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x<10*128 + 64 + 16>], load1I06;\n",
+        j7c22  => "--:3:-:-:1 \@!P0 STS [writeIs + 4x<11*128 + 64 + 16>], load1I07;\n",
+
+        j7c24  => "--:-:-:-:1      ISETP.NE.AND P2, PT, track1I0, -1, P1;\n",
+        j7c26  => "--:-:-:-:1      ISETP.EQ.AND P3, PT, track1I0, -1, P1;\n",
+
+        j8c8   => "04:-:-:-:1  \@P2 LDG.E.CI.128 load1I00, [track1I + 4x< 0>];\n",
+        j8c10  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load1I04, [track1I + 4x< 8>];\n",
+        j8c12  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load1I08, [track1I + 4x<16>];\n",
+        j8c14  => "--:5:3:-:1  \@P2 LDG.E.CI.128 load1I12, [track1I + 4x<24>];\n",
+
+        j9c8   => "--:-:-:-:1  \@P3 LDS.U.128 load1I00, [addr_zero];\n",
+        j9c10  => "--:-:-:-:1  \@P3 LDS.U.128 load1I04, [addr_zero];\n",
+        j10c8  => "--:-:-:-:1  \@P3 LDS.U.128 load1I08, [addr_zero];\n",
+        j10c10 => "--:-:-:-:1  \@P3 LDS.U.128 load1I12, [addr_zero];\n",
+
+        j10c57 => "10:-:-:-:1  \@P2 IADD   track1I0.CC, track1I0, 4x<32>;\n",
+        j10c62 => "--:-:-:-:1  \@P2 IADD.X track1I1,    track1I1, RZ;\n",
+
+
+        j11c8   => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 0*64 +  0>], loadE08;\n",
+        j11c10  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 1*64 +  0>], loadE09;\n",
+        j11c12  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 2*64 +  0>], loadE10;\n",
+        j11c14  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 3*64 +  0>], loadE11;\n",
+        j11c16  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 8*64 + 16>], loadE12;\n",
+        j11c18  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 9*64 + 16>], loadE13;\n",
+        j11c20  => "--:-:-:-:1  \@P0 STS [writeEs + 4x<10*64 + 16>], loadE14;\n",
+        j11c22  => "--:-:-:-:1  \@P0 STS [writeEs + 4x<11*64 + 16>], loadE15;\n",
+
+        j12c8   => "08:-:-:-:1 \@!P0 STS [writeEs + 4x< 0*64 +  0>], loadE00;\n",
+        j12c10  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 1*64 +  0>], loadE01;\n",
+        j12c12  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 2*64 +  0>], loadE02;\n",
+        j12c14  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 3*64 +  0>], loadE03;\n",
+        j12c16  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 8*64 + 16>], loadE04;\n",
+        j12c18  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 9*64 + 16>], loadE05;\n",
+        j12c20  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x<10*64 + 16>], loadE06;\n",
+        j12c22  => "--:4:-:-:1 \@!P0 STS [writeEs + 4x<11*64 + 16>], loadE07;\n",
+
+        j12c24  => "--:-:-:-:1      ISETP.LT.AND P2, PT, k, param_K,  P1;\n",
+
+        j13c8   => "08:-:-:-:1  \@P2 LDG.E.CI.128 loadE00, [trackE + 4x< 0>];\n",
+        j13c10  => "--:-:-:-:1  \@P2 LDG.E.CI.128 loadE04, [trackE + 4x< 8>];\n",
+        j13c12  => "--:-:-:-:1  \@P2 LDG.E.CI.128 loadE08, [trackE + 4x<16>];\n",
+        j13c14  => "--:5:4:-:1  \@P2 LDG.E.CI.128 loadE12, [trackE + 4x<24>];\n",
+
+        j15c57  => "10:-:-:-:1  \@P2 IADD   trackE0.CC, trackE0, 4x<32>;\n",
+        j15c62  => "--:-:-:-:1  \@P2 IADD.X trackE1,    trackE1, RZ;\n",
+
+        # p0 = N >= 16 and not (N == 32 and (p or q))
+        j14c8   => "--:-:-:-:1      ISETP.EQ.AND  P0, PT, loopN, 32, PT;\n",
+        j14c10  => "--:-:-:-:1      ISETP.GE.AND  P1, PT, loopN, 16, PT;\n",
+        j14c22  => "--:-:-:-:1      PSETP.OR.AND  P0, PT, P5, P6, P0;\n",
+        j14c35  => "--:-:-:-:1      PSETP.AND.AND P0, PT, !P0, P1, PT;\n",
+
+        j14c63 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                  "20:-:-:-:1      IADD readEs,  readEs, -swapBuf;\n" .
+                  "--:-:-:-:1      IADD readIs,  readIs, -swapBuf;\n" .
+                  "--:-:-:-:1      IADD writeEs, writeEs, swapBuf;\n" .
+                  "--:-:-:-:1      IADD writeIs, writeIs, swapBuf;\n" .
+                  "--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j15c63 => "--:-:-:Y:5  \@P0 BRA.U NEXT_16N;\n" .
+                  "--:-:-:-:0  \@P5 IADD q, q, param_grid_Q;\n" .
+                  "01:-:-:Y:5  \@P5 BRA.U NEXT_PQ;\n" .
+                  "--:-:-:-:1  \@P6 MOV  q, qq;\n" .
+                  "--:-:-:-:0  \@P6 IADD p, p, param_grid_P;\n" .
+                  "--:-:-:Y:5  \@P6 BRA.U NEXT_PQ;\n" .
+                  "--:-:-:Y:5      BRA.U FINISH;\n",
+    );
+
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    my $out;
+    foreach my $j (0 .. 15)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = 1 - $odd;
+        my $rsOffset = ($j + 1) & 15;
+        my $rsPred   = $j == 15 ? '@P0' : '   ';
+        my $shift    = $rsOffset < 4 ? 0 : $rsOffset < 12 ? 1 : 2;
+        my $barrier  = $j == 14 ? '6' : '-';
+
+        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64  + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64  + 32 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c6"} = sprintf "--:%s:1:-:1  %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /LDS|I2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+FIRST_LOAD:
+
+--:-:-:-:0      PSETP.AND.AND P0, PT, PT, PT, !PT;
+
+--:-:-:-:1  @P2 LDG.E.CI.128 load0I00, [track0I + 4x< 0>];
+--:-:-:-:1  @P2 LDG.E.CI.128 load0I04, [track0I + 4x< 8>];
+--:-:-:-:1  @P2 LDG.E.CI.128 load0I08, [track0I + 4x<16>];
+--:-:1:-:1  @P2 LDG.E.CI.128 load0I12, [track0I + 4x<24>];
+--:-:-:-:1 @!P2 LDS.U.128    load0I00, [addr_zero];
+--:-:-:-:1 @!P2 LDS.U.128    load0I04, [addr_zero];
+--:-:-:-:1 @!P2 LDS.U.128    load0I08, [addr_zero];
+--:-:4:-:1 @!P2 LDS.U.128    load0I12, [addr_zero];
+
+// p1 = N == 32 and (p or q)
+--:-:-:-:0      ISETP.EQ.AND  P1, PT, loopN, 32, PT;
+
+--:-:-:-:1  @P3 LDG.E.CI.128 load1I00, [track1I + 4x< 0>];
+--:-:-:-:1  @P3 LDG.E.CI.128 load1I04, [track1I + 4x< 8>];
+--:-:-:-:1  @P3 LDG.E.CI.128 load1I08, [track1I + 4x<16>];
+--:-:2:-:1  @P3 LDG.E.CI.128 load1I12, [track1I + 4x<24>];
+--:-:-:-:1 @!P3 LDS.U.128    load1I00, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.128    load1I04, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.128    load1I08, [addr_zero];
+--:-:5:-:1 @!P3 LDS.U.128    load1I12, [addr_zero];
+
+--:-:-:-:1  @P4 LDG.E.CI.128 loadE00, [trackE + 4x< 0>];
+--:-:-:-:1  @P4 LDG.E.CI.128 loadE04, [trackE + 4x< 8>];
+--:-:-:-:1  @P4 LDG.E.CI.128 loadE08, [trackE + 4x<16>];
+--:-:3:-:1  @P4 LDG.E.CI.128 loadE12, [trackE + 4x<24>];
+--:-:-:-:1 @!P4 LDS.U.128    loadE00, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128    loadE04, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128    loadE08, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128    loadE12, [addr_zero];
+
+--:-:-:-:0      PSETP.OR.AND  P1, PT, P5, P6, P1;
+
+09:-:-:-:1      STS [writeIs + 4x< 0*128 +  0 +  0>], load0I00;
+--:-:-:-:1      STS [writeIs + 4x< 1*128 +  0 +  0>], load0I01;
+--:-:-:-:1      STS [writeIs + 4x< 2*128 +  0 +  0>], load0I02;
+--:-:-:-:1      STS [writeIs + 4x< 3*128 +  0 +  0>], load0I03;
+--:-:-:-:1      STS [writeIs + 4x< 8*128 +  0 + 16>], load0I04;
+--:-:-:-:1      STS [writeIs + 4x< 9*128 +  0 + 16>], load0I05;
+--:-:-:-:1      STS [writeIs + 4x<10*128 +  0 + 16>], load0I06;
+--:-:-:-:1      STS [writeIs + 4x<11*128 +  0 + 16>], load0I07;
+
+--:-:-:-:6  @P2 IADD   track0I0.CC, track0I0, 4x<32>;
+--:-:-:-:0  @P2 IADD.X track0I1,    track0I1, RZ;
+
+12:-:-:-:1      STS [writeIs + 4x< 0*128 + 64 +  0>], load1I00;
+--:-:-:-:1      STS [writeIs + 4x< 1*128 + 64 +  0>], load1I01;
+--:-:-:-:1      STS [writeIs + 4x< 2*128 + 64 +  0>], load1I02;
+--:-:-:-:1      STS [writeIs + 4x< 3*128 + 64 +  0>], load1I03;
+--:-:-:-:1      STS [writeIs + 4x< 8*128 + 64 + 16>], load1I04;
+--:-:-:-:1      STS [writeIs + 4x< 9*128 + 64 + 16>], load1I05;
+--:-:-:-:1      STS [writeIs + 4x<10*128 + 64 + 16>], load1I06;
+--:-:-:-:1      STS [writeIs + 4x<11*128 + 64 + 16>], load1I07;
+
+--:-:-:-:3  @P3 IADD   track1I0.CC, track1I0, 4x<32>;
+--:-:-:-:2      PSETP.AND.AND P5, PT, P5, P1, PT;
+--:-:-:-:1      PSETP.AND.AND P6, PT, P6, P1, PT;
+--:-:-:-:0  @P3 IADD.X track1I1,    track1I1, RZ;
+
+24:-:-:-:1      STS [writeEs + 4x< 0*64 +  0>], loadE00;
+--:-:-:-:1      STS [writeEs + 4x< 1*64 +  0>], loadE01;
+--:-:-:-:1      STS [writeEs + 4x< 2*64 +  0>], loadE02;
+--:-:-:-:1      STS [writeEs + 4x< 3*64 +  0>], loadE03;
+--:-:-:-:1      STS [writeEs + 4x< 8*64 + 16>], loadE04;
+--:-:-:-:1      STS [writeEs + 4x< 9*64 + 16>], loadE05;
+--:-:-:-:1      STS [writeEs + 4x<10*64 + 16>], loadE06;
+--:1:-:-:1      STS [writeEs + 4x<11*64 + 16>], loadE07;
+
+--:-:-:-:6  @P4 IADD   trackE0.CC, trackE0, 4x<32>;
+--:-:-:-:1  @P4 IADD.X trackE1,    trackE1, RZ;
+
+--:-:-:-:1      IADD readEs,  readEs, -swapBuf;
+--:-:-:-:0      IADD readIs,  readIs, -swapBuf;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeEs, writeEs, swapBuf;
+--:-:-:-:1      IADD writeIs, writeIs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;
+
+--:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+
+--:-:-:-:0  @P5 IADD q, q, param_grid_Q;
+--:-:-:Y:5  @P5 BRA.U NEXT_PQ;
+--:-:-:-:0  @P6 IADD p, p, param_grid_P;
+--:-:-:Y:5  @P6 BRA.U NEXT_PQ;
+
+--:-:-:-:2      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
+--:-:-:-:0      ISETP.LT.AND P6, PT, nextP, param_P, PT;
+
+--:-:-:Y:5      BRA.U INIT_LOOP;
+
+
+FINISH:
+
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
+--:-:-:-:5  @P0 BRA.U CTAID2;
+--:-:2:-:1      S2R blkI,    SR_CTAID.Y;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Z;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.X;
+--:-:-:-:5      BRA.U END_CTAID2;
+CTAID2:
+--:-:2:-:1      S2R blkI,    SR_CTAID.X;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Y;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.Z;
+END_CTAID2:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readEs,  readEs, -4x<szShareI>;
+--:-:-:-:1  @P0 IADD readIs,  readIs, -swapBuf;
+--:-:-:-:1  @P0 IADD readEs,  readEs, -swapBuf;
+
+// writeCs = (readIs / 4) * 64 + readEs;
+--:-:-:-:1      ISCADD  writeCs, readIs, readEs, 4;
+
+
+// readCs = ((tid & 96) << 3) | (tid & 31)
+01:-:-:-:1      LOP.AND tid31, tid, 31;
+01:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 3;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+
+// kk = blkE*64 + tid31;
+04:-:-:-:1      ISCADD kk, blkE, tid31, 6;
+--:-:-:-:1      IADD   kk, kk, param_offset_K;
+
+
+// crst = blkI*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 crst00, tid96,  1;
+02:-:-:-:1      ISCADD  crst00, blkI,   crst00, 7;
+--:-:-:-:1      IADD    crst04, crst00, 4;
+--:-:-:-:1      IADD    crst08, crst00, 8;
+--:-:-:-:1      IADD    crst12, crst00, 12;
+
+--:-:-:-:1      MOV K, param_K;
+--:-:-:-:1      SHL K1, K, 2;
+--:-:-:-:1      SHL K4, K, 4;
+--:-:-:-:1      ISCADD K60, K, -K4, 8;
+
+// trackF += crst*K + k;
+--:-:-:-:1      VMAD.U16.U16 tf, crst00, K, kk;
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+--:-:-:-:1      MOV CRSTK, param_CRSTK;
+08:-:-:-:1      XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ;
+        };
+    }
+    return '';
++]
+--:-:-:-:1      LEA      track00F0.CC, tf, param_F[0], 0x2;
+--:-:-:-:1      LEA.HI.X track00F1,    tf, param_F[1], RZ, 0x2;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+// kk < K
+--:-:-:-:1      ISETP.LT.AND P5, PT, kk, param_K, PT;
+--:-:-:-:1      IADD kk, kk, 32;
+--:-:-:-:1      ISETP.LT.AND P6, PT, kk, param_K, PT;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:6      IADD   track04F0.CC, track00F0, K4;
+--:-:-:-:1      IADD.X track04F1,    track00F1, RZ;
+--:-:-:-:6      IADD   track08F0.CC, track04F0, K4;
+--:-:-:-:1      IADD.X track08F1,    track04F1, RZ;
+--:-:-:-:6      IADD   track12F0.CC, track08F0, K4;
+--:-:-:-:1      IADD.X track12F1,    track08F1, RZ;
+
+<CODE>
+
+    my $out;
+    foreach my $y (0..7)
+    {
+        $out .=
+            "--:-:-:-:5      IADD   track00F0.CC, track00F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst00,       crst00,     60;\n" .
+            "--:-:-:-:1      IADD.X track00F1,    track00F1,  RZ;\n" .
+            "--:-:-:-:5      IADD   track04F0.CC, track04F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst04,       crst04,     60;\n" .
+            "--:-:-:-:1      IADD.X track04F1,    track04F1,  RZ;\n" .
+            "--:-:-:-:5      IADD   track08F0.CC, track08F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst08,       crst08,     60;\n" .
+            "--:-:-:-:1      IADD.X track08F1,    track08F1,  RZ;\n" .
+            "--:-:-:-:5      IADD   track12F0.CC, track12F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst12,       crst12,     60;\n" .
+            "--:-:-:-:1      IADD.X track12F1,    track12F1,  RZ;\n\n"  if $y == 4;
+
+        $out .= sprintf(
+            "--:-:-:-:1      FMUL f0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f2, cx2y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f3, cx3y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f4, cx4y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f5, cx5y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f6, cx6y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL f7, cx7y%d, alpha;\n",
+            ($y) x 8);
+
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K
+--:-:-:-:1      IADD         crst00, crst00, 1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K
+--:-:-:-:1      IADD         crst04, crst04, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K
+--:-:-:-:1      IADD         crst08, crst08, 1;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K
+--:-:-:-:0      IADD         crst12, crst12, 1;
+
+// Warp shuffle to drop the awkward readAs/readBs mapping
+--:-:-:-:1      STS.128 [writeCs+4x<00>], f0;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], f4;
+
+--:-:1:-:1      LDS f0, [readCs + 4x<0*64 + 00>];
+--:-:2:-:1      LDS f2, [readCs + 4x<1*64 + 00>];
+--:-:3:-:1      LDS f4, [readCs + 4x<2*64 + 00>];
+--:-:4:-:1      LDS f6, [readCs + 4x<3*64 + 00>];
+
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+01:-:-:-:1  @P0 STG.E.CG [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 STG.E.CG [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 STG.E.CG [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 STG.E.CG [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
+    else
+    {
+        return q{
+01:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
++]
+
+--:-:1:-:1      LDS f1, [readCs + 4x<0*64 + 32>];
+--:-:2:-:1      LDS f3, [readCs + 4x<1*64 + 32>];
+--:-:3:-:1      LDS f5, [readCs + 4x<2*64 + 32>];
+--:-:4:-:1      LDS f7, [readCs + 4x<3*64 + 32>];
+
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+01:1:-:-:1  @P0 STG.E.CG [track00F + 4x<32>], f1;
+02:2:-:-:1  @P1 STG.E.CG [track04F + 4x<32>], f3;
+04:3:-:-:1  @P2 STG.E.CG [track08F + 4x<32>], f5;
+08:4:-:-:1  @P3 STG.E.CG [track12F + 4x<32>], f7;
+        };
+    }
+    else
+    {
+        return q{
+01:1:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<32>], f1;
+02:2:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<32>], f3;
+04:3:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<32>], f5;
+08:4:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<32>], f7;
+        };
+    }
++]
+
+01:-:-:-:6      IADD   track00F0.CC, track00F0, K1;
+--:-:-:-:1      IADD.X track00F1,    track00F1, RZ;
+02:-:-:-:6      IADD   track04F0.CC, track04F0, K1;
+--:-:-:-:1      IADD.X track04F1,    track04F1, RZ;
+04:-:-:-:6      IADD   track08F0.CC, track08F0, K1;
+--:-:-:-:1      IADD.X track08F1,    track08F1, RZ;
+08:-:-:-:6      IADD   track12F0.CC, track12F0, K1;
+--:-:-:-:0      IADD.X track12F1,    track12F1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/Convolution/Maxwell/sconv_xprop_X128_N128.sass b/Kernel/Convolution/Maxwell/sconv_xprop_X128_N128.sass
new file mode 100644
index 0000000..8f91aba
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/sconv_xprop_X128_N128.sass
@@ -0,0 +1,233 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $prefix = 's';
+    our $shareI = 128;
+    our $shareF = 128;
+    our $stepI  = 32;
+    our $stepF  = 64;
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<128*8*2 + 128*8*2 + 0>
+    szShareF  : (128*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<128*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<128*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<128*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<128*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<128*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<128*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<128*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<128*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne
+     72-111 ~ tid1, tid128, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-107 : loadI<0-3>,  loadF<0-3>
+
+    108-111 ~ offsetF, offsetI, offsetFc, offsetIc
+
+    112-113 : sliceI, sliceF
+    112-113 : sliceIF<0-1>
+
+    114-122 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset
+    123-127 ~ readFs, readIs, tid, idx_N
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-122  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidX = (tid & 31) << 2
+// tidY = tid >> 5
+--:-:-:-:1      LOP.AND tidX, tid,  31;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  5;
+
+// trackF += blkF*128 + tidX
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 7;
+
+// trackI += blkI*128 + tidX
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 7;
+
+// writeS = (128*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+
+// readFs = (((tid & 112) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    112;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = ((((tid & 128) >> 3) | ((tid >> 1) & 7)) << 4) + 4x<szShareF>
+--:-:-:-:1      LOP.AND tid128, tid,    128;
+--:-:-:-:1      SHR.U32 tid128, tid128, 3;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid128;
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF, [trackF];
+--:-:5:-:1 @!P1 LDS.U.128    loadF, [addr_zero];
+
+--:-:3:-:1  @P1 LDG.E.128    loadI, [trackI];
+--:-:6:-:1 @!P1 LDS.U.128    loadI, [addr_zero];
+
+12:-:-:-:1      STS.128 [writeS], loadF;
+24:1:-:-:1      STS.128 [writeS + 4x<szShareF>], loadI;
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.128 loadF, [trackF];
+--:-:3:-:1  @P1 LDG.E.128    loadI, [trackI];
+
+[-
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+
+        j1c40 => "02:2:-:-:1  \@P0 STS.128 [writeS], loadF;\n",
+
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j2c29 => "20:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     2;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;\n",
+
+        j2c40 => "02:-:2:-:1  \@P1 LDG.E.CI.128 loadF, [trackF];\n",
+
+
+        j6c8  => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<szShareF>], loadI;\n",
+
+        j6c54 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     2;\n",
+        j6c59 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;\n",
+
+        j6c61 => "04:-:3:-:1  \@P1 LDG.E.128 loadI, [trackI];\n",
+
+        j6c62 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<szShareF + szShareI>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<szShareF + szShareI>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<szShareF + szShareI>;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+
+<SCHEDULE_BLOCK>
+
+// tidOX = ((tid & 7) << 2) | ((tid & 128) >> 1)
+// tidOY = (tid & 127) >> 3
+--:-:-:-:1      LOP.AND tidOX,  tid,    7;
+--:-:-:-:1      SHL     tidOX,  tidOX,  2;
+--:-:-:-:1      LOP.AND tidOX2, tid,    128;
+--:-:-:-:1      SHR.U32 tidOX2, tidOX2, 1;
+--:-:-:-:1      LOP.OR  tidOX,  tidOX,  tidOX2;
+--:-:-:-:1      LOP.AND tidOY,  tid,    127;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY,  3;
+
+--:-:-:-:1      LOP.AND readIs, readIs, 0x1ff;
+--:-:-:-:1      LOP.AND readFs, readFs, 0x0ff;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*128 + tidOX;
+--:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*128 + tidOY * 4
+--:-:-:-:1      SHL tidOY, tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 7;
+
+[+ output_setup(63, 1, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
\ No newline at end of file
diff --git a/Kernel/Convolution/Maxwell/sconv_xprop_X128_N64.sass b/Kernel/Convolution/Maxwell/sconv_xprop_X128_N64.sass
new file mode 100644
index 0000000..d7bd0a1
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/sconv_xprop_X128_N64.sass
@@ -0,0 +1,246 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $prefix = 's';
+    our $shareI = 64;
+    our $shareF = 128;
+    our $stepI  = 32;
+    our $stepF  = 64;
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<64*8*2 + 128*8*2 + 0>
+    szShareF  : (128*8)
+    szShareI  : (64*8)
+
+    addr_zero  : 4x<64*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-111 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-111 : loadI<0-3>, loadF<0-7>
+
+    112-113 : sliceI, sliceF
+    112-113 : sliceIF<0-1>
+
+    104-107 ~ offsetF, offsetIc, offsetFc
+
+    114-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI
+    125-127 ~ readFs, readIs, swapBuf
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-124  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidX = (tid & 15) << 2
+// tidY = tid >> 4
+--:-:-:-:1      LOP.AND tidX, tid,  15;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  4;
+
+// trackF += blkF*128 + tidX
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 7;
+
+// trackI += blkI*64 + tidX
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 6;
+
+// writeFs = (128*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeFs, tidY, tidX, 7;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// writeIs = (64*tidY + tidX) * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD  writeIs, tidY, tidX, 6;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = (((tid >> 1) & 7) << 4) + 4x<szShareF>
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>];
+--:-:3:-:1  @P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>];
+--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero];
+--:-:5:-:1 @!P1 LDS.U.128 loadF4, [addr_zero];
+
+--:-:4:-:1  @P1 LDG.E.128 loadI, [trackI];
+--:-:6:-:1 @!P1 LDS.U.128 loadI, [addr_zero];
+
+12:-:-:-:1      STS.128 [writeFs + 4x<00>], loadF0;
+04:-:-:-:1      STS.128 [writeFs + 4x<64>], loadF4;
+
+28:1:-:-:1      STS.128 [writeIs], loadI;
+
+[+ loop_setup() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>];
+--:5:3:-:1  @P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>];
+--:-:4:-:1  @P1 LDG.E.128 loadI, [trackI];
+
+[-
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+
+        j1c40 => "02:2:-:-:1  \@P0 STS.128 [writeFs + 4x<00>], loadF0;\n",
+
+        j2c10 => "02:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j2c29 => "20:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "10:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     2;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;\n",
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>];\n",
+
+        j4c8  => "04:3:-:-:1  \@P0 STS.128 [writeFs + 4x<64>], loadF4;\n",
+
+        j4c60 => "04:5:3:-:1  \@P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>];\n",
+
+        j6c8  => "08:4:-:-:1  \@P0 STS.128 [writeIs], loadI;\n",
+
+        j6c55 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     2;\n",
+        j6c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;\n",
+
+        j6c62 => "08:5:4:-:1  \@P1 LDG.E.128 loadI, [trackI];\n",
+
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                   "--:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+--:-:2:-:1      S2R tid,   SR_TID.X;
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = (tid & 7) << 2
+// tidOY = tid >> 3
+02:-:-:-:1      LOP.AND tidOX,  tid,   7;
+--:-:-:-:1      SHL     tidOX,  tidOX, 2;
+--:-:-:-:1      SHR.U32 tidOY,  tid,   3;
+
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readIs,  readIs, -4x<szShareF>;
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf;
+--:-:-:-:1  @P2 IADD readIs,  readIs, -swapBuf;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 64 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 4;
+
+// readCs  = 4 * (tidOX + (tidOY * 64))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*64 + tidOX;
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 6;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*128 + tidOY * 4
+--:-:-:-:1      SHL    tidOY,   tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 7;
+
+[+ output_setup(63, 0, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
\ No newline at end of file
diff --git a/Kernel/Convolution/Maxwell/sconv_xprop_X32_N128.sass b/Kernel/Convolution/Maxwell/sconv_xprop_X32_N128.sass
new file mode 100644
index 0000000..568e714
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/sconv_xprop_X32_N128.sass
@@ -0,0 +1,262 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $prefix = 's';
+    our $shareI = 128;
+    our $shareF = 32;
+    our $stepI  = 32;
+    our $stepF  = 16;
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<32*8*2 + 128*8*2 + 0>
+    szShareF  : (32*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<32*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<32*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<32*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<32*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<32*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<32*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<32*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<32*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-69 : m, p, q
+      64-69 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne
+     70-113 ~ tid1, tid32, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     70-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-119 : loadI<00-15>,  loadF<0-3>
+
+    120-121 : sliceI, sliceF
+    120-121 : sliceIF<0-1>
+
+    122-140 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetF, offsetIc, offsetFc
+    141-155 ~ readFs, readIs, swapBuf, tid, idx_N
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-140  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+
+[+ get_mpqk() +]
+
+// tidX = (tid & 7) << 2
+// tidY = tid >> 3
+--:-:-:-:1      LOP.AND tidX, tid,  7;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  3;
+
+// trackF += blkF*32 + tidX + offset_K
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 5;
+
+// trackI += blkI*128 + tidX
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 7;
+
+// writeFs = (32*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeFs, tidY, tidX, 5;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// writeIs = (128*tidY + tidX) * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD  writeIs, tidY, tidX, 7;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+
+// readFs  = (((tid & 16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:0      SHL     readFs, readFs, 4;
+
+// readIs = ((((tid & 32) >> 1) | ((tid >> 1) & 7)) << 4) + 4x<szShareF>
+--:-:-:-:1      LOP.AND tid32, tid,  32;
+--:-:-:-:1      SHR.U32 tid32, tid32, 1;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid32;
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:1:-:1  @P1 LDG.E.CI.128 loadF0, [trackF];
+--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero];
+
+--:-:2:-:1  @P1 LDG.E.128 loadI00, [trackI + 4x< 0>];
+--:-:3:-:1  @P1 LDG.E.128 loadI04, [trackI + 4x<32>];
+--:-:4:-:1  @P1 LDG.E.128 loadI08, [trackI + 4x<64>];
+--:-:5:-:1  @P1 LDG.E.128 loadI12, [trackI + 4x<96>];
+--:-:-:-:1 @!P1 LDS.U.128 loadI00, [addr_zero];
+--:-:-:-:1 @!P1 LDS.U.128 loadI04, [addr_zero];
+--:-:-:-:1 @!P1 LDS.U.128 loadI08, [addr_zero];
+--:-:6:-:2 @!P1 LDS.U.128 loadI12, [addr_zero];
+
+21:-:-:-:1      STS.128 [writeFs], loadF0;
+
+02:-:-:-:1      STS.128 [writeIs + 4x< 0>], loadI00;
+04:-:-:-:1      STS.128 [writeIs + 4x<32>], loadI04;
+08:-:-:-:1      STS.128 [writeIs + 4x<64>], loadI08;
+10:1:-:-:1      STS.128 [writeIs + 4x<96>], loadI12;
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.128 loadF0, [trackF];
+--:-:-:-:1  @P1 LDG.E.128 loadI00, [trackI + 4x< 0>];
+--:-:3:-:1  @P1 LDG.E.128 loadI04, [trackI + 4x<32>];
+--:-:-:-:1  @P1 LDG.E.128 loadI08, [trackI + 4x<64>];
+--:5:4:-:1  @P1 LDG.E.128 loadI12, [trackI + 4x<96>];
+
+[-
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c40 => "02:-:-:-:1  \@P0 STS.128 [writeFs], loadF0;\n",
+
+        j1c62 => "--:-:2:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j2c29 => "02:-:-:-:1  \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     2;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;\n",
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.128 loadF0, [trackF];\n",
+
+        j3c8  => "04:-:-:-:1  \@P0 STS.128 [writeIs + 4x< 0>], loadI00;\n",
+        j3c10 => "--:3:-:-:1  \@P0 STS.128 [writeIs + 4x<32>], loadI04;\n",
+
+        j3c55 => "10:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     2;\n",
+        j3c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;\n",
+
+        j4c8  => "04:-:-:-:1  \@P1 LDG.E.128 loadI00, [trackI + 4x< 0>];\n",
+        j4c10 => "--:-:3:-:1  \@P1 LDG.E.128 loadI04, [trackI + 4x<32>];\n",
+
+        j6c8  => "08:-:-:-:1  \@P0 STS.128 [writeIs + 4x<64>], loadI08;\n",
+        j6c10 => "--:4:-:-:1  \@P0 STS.128 [writeIs + 4x<96>], loadI12;\n",
+
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                   "08:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c8  => "--:-:-:-:1  \@P1 LDG.E.128 loadI08, [trackI + 4x<64>];\n",
+        j7c10 => "--:5:4:-:1  \@P1 LDG.E.128 loadI12, [trackI + 4x<96>];\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+
+<SCHEDULE_BLOCK>
+
+// tidOX = ((tid & 7) << 2) + ((tid & 32) << 1)
+// tidOY = (tid & 31) >> 3
+--:-:-:-:1      LOP.AND tidOX,  tid,    7;
+--:-:-:-:1      LOP.AND tidOX2, tid,    32;
+--:-:-:-:1      SHL     tidOX,  tidOX,  2;
+--:-:-:-:1      ISCADD  tidOX,  tidOX2, tidOX, 1;
+--:-:-:-:1      LOP.AND tidOY,  tid,    31;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY,  3;
+
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readIs,  readIs, -4x<szShareF>;
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf;
+--:-:-:-:1  @P2 IADD readIs,  readIs, -swapBuf;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*128 + tidOX;
+--:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*32 + tidOY * 4
+--:-:-:-:1      SHL       tidOY, tidOY, 2;
+--:-:-:-:1      ISCADD k, idx_K, tidOY, 5;
+
+[+ output_setup(63, 1, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
\ No newline at end of file
diff --git a/Kernel/Convolution/Maxwell/sconv_xprop_X64_N128.sass b/Kernel/Convolution/Maxwell/sconv_xprop_X64_N128.sass
new file mode 100644
index 0000000..b782b8a
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/sconv_xprop_X64_N128.sass
@@ -0,0 +1,253 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $prefix = 's';
+    our $shareI = 128;
+    our $shareF = 64;
+    our $stepI  = 64;
+    our $stepF  = 32;
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<64*8*2 + 128*8*2 + 0>
+    szShareF  : (64*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<64*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-111 ~ tid1, tid64, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-111 : loadI<0-7>,  loadF<0-3>
+
+    112-113 : sliceI, sliceF
+    112-113 : sliceIF<0-1>
+
+    108-111 ~ offsetF, offsetIc, offsetFc
+
+    114-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI
+    125-127 ~ readFs, readIs, swapBuf
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-124  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidX = (tid & 15) << 2
+// tidY = tid >> 4
+--:-:-:-:1      LOP.AND tidX, tid,  15;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  4;
+
+// trackF += blkF*64 + tidX
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 6;
+
+// trackI += blkI*128 + tidX
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 7;
+
+// writeFs = (64*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeFs, tidY, tidX, 6;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// writeIs = (128*tidY + tidX) * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD  writeIs, tidY, tidX, 7;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+// readFs = (((tid & 48) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    48;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = ((((tid & 64) >> 3) | ((tid >> 1) & 7)) << 4) + 4x<szShareF>
+--:-:-:-:1      LOP.AND tid64,  tid,    64;
+--:-:-:-:1      SHR.U32 tid64,  tid64,  3;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid64;
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+// First tile: lanes with P1 read global memory, the others fill the same registers from addr_zero. P1 is not assigned in the visible code above; presumably load_lut() sets it - confirm.
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF0, [trackF];
+--:-:5:-:1 @!P1 LDS.U.128    loadF0, [addr_zero];
+
+--:-:3:-:1  @P1 LDG.E.128 loadI0, [trackI + 4x<00>];
+--:-:4:-:1  @P1 LDG.E.128 loadI4, [trackI + 4x<64>];
+--:-:-:-:1 @!P1 LDS.U.128 loadI0, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 loadI4, [addr_zero];
+// Store the tile to shared memory. Wait masks: 12 = barriers 2+5 (loadF0), 24 = barriers 3+6, 08 = barrier 4 (loadI4 LDG).
+12:-:-:-:1      STS.128 [writeFs], loadF0;
+
+24:-:-:-:1      STS.128 [writeIs + 4x<00>], loadI0;
+08:1:-:-:1      STS.128 [writeIs + 4x<64>], loadI4;
+
+[+ loop_setup() +]
+// Issue the loads again after loop_setup(); presumably trackF/trackI now address the next tile - confirm in loop_setup().
+--:-:2:-:2  @P1 LDG.E.CI.128 loadF0, [trackF];
+--:-:3:-:1  @P1 LDG.E.128 loadI0, [trackI + 4x<00>];
+--:5:4:-:1  @P1 LDG.E.128 loadI4, [trackI + 4x<64>];
+
+[-
+    our %insert =   # instruction text keyed by j<N>c<M>; presumably spliced into the loop at that position by main_loop() - confirm
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",   # P1 = (posCRST >= 0)
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",   # P0 = (posCRST >= -8)
+        # channel = trunc(posCRST * lutSizeRcp * (1 + 2^-24)); 5.9604644775390625e-08 is 2^-24
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+        # lutOffset = (posCRST - channel*lutSize) * 8, i.e. the byte offset of an 8-byte LUT entry
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+        # sliceIF is the 64-bit pair (sliceI, sliceF) read from the LUT
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+        # store the previously loaded F registers to shared memory (P0 lanes only)
+        j1c40 => "02:2:-:-:1  \@P0 STS.128 [writeFs], loadF0;\n",
+        # offsetFc = channel * param_KRST; offsetIc = channel * param_DHWN (built with two XMADs)
+        j2c10 => "02:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",   # unconditional: advance posCRST by -8
+        # offsetF = offsetFk + offsetFc + sliceF; trackF = param_F + (offsetF << 2); offsetI = offsetIn + offsetIc + sliceI
+        j2c29 => "20:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     2;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;\n",
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.128 loadF0, [trackF];\n",
+
+        j3c8  => "04:3:-:-:1  \@P0 STS.128 [writeIs + 4x<00>], loadI0;\n",
+        # trackI = param_I + (offsetI << 2)
+        j3c55 => "10:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     2;\n",
+        j3c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;\n",
+
+        j4c8  => "04:-:3:-:1  \@P1 LDG.E.128 loadI0, [trackI + 4x<00>];\n",
+
+        j6c8  => "08:4:-:-:1  \@P0 STS.128 [writeIs + 4x<64>], loadI4;\n",
+        # after the barrier: read addresses move by -swapBuf, write addresses by +swapBuf, then swapBuf is negated
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                   "08:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c8  => "--:5:4:-:1  \@P1 LDG.E.128 loadI4, [trackI + 4x<64>];\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",   # repeat while P0
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+// After the loop: reload mpqk, tid and idx_N, then compute the addresses and coordinates used by the output code.
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+--:-:2:-:1      S2R tid,   SR_TID.X;
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = ((tid & 7) << 2) | ((tid & 64) >> 1)
+// tidOY = (tid & 63) >> 3
+02:-:-:-:1      LOP.AND tidOX,  tid,    7;
+--:-:-:-:1      SHL     tidOX,  tidOX,  2;
+--:-:-:-:1      LOP.AND tidOX2, tid,    64;
+--:-:-:-:1      SHR.U32 tidOX2, tidOX2, 1;
+--:-:-:-:1      LOP.OR  tidOX,  tidOX,  tidOX2;
+--:-:-:-:1      LOP.AND tidOY,  tid,    63;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY,  3;
+// Remove the 4x<szShareF> base from readIs; if swapBuf > 0 (P2) also subtract swapBuf from both read addresses.
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readIs,  readIs, -4x<szShareF>;
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf;
+--:-:-:-:1  @P2 IADD readIs,  readIs, -swapBuf;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = idx_N*128 + tidOX;
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = idx_K*64 + tidOY * 4
+--:-:-:-:1      SHL    tidOY,   tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 6;
+
+[+ output_setup(31, 1, 5) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
\ No newline at end of file
diff --git a/Kernel/Convolution/Maxwell/sconv_xprop_X64_N64.sass b/Kernel/Convolution/Maxwell/sconv_xprop_X64_N64.sass
new file mode 100644
index 0000000..b42fbea
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/sconv_xprop_X64_N64.sass
@@ -0,0 +1,240 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $prefix = 's';   # NOTE(review): presumably read by the included xconv_xprop_common.sass - confirm
+    our $shareI = 64;    # same 64 as the row width in szShareI (64*8) below
+    our $shareF = 64;    # same 64 as the row width in szShareF (64*8) below
+    our $stepI  = 32;    # equals the 4x<32> spacing between the loadI0 and loadI4 addresses in this file
+    our $stepF  = 32;    # equals the 4x<32> spacing between the loadF0 and loadF4 addresses in this file
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    // Tile sizes in 4-byte words. The addr_ entries below sit after two F tiles plus two I tiles (64*8*2 + 64*8*2 words).
+    szShareF  : (64*8)
+    szShareI  : (64*8)
+
+    addr_zero  : 4x<64*8*2 + 64*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 64*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 64*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 64*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 64*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 64*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 64*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 64*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // Registers 0-63 under two sets of names - cx<0-7>y<0-7> (permuted order) and czero<00-63> (linear order).
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+    // Registers 64-113 carry several overlapping name sets; presumably each set is live in a different phase - confirm.
+      64-67 : mpqk<0-3>
+      64-66 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-113 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+    // Names used by the main loop: shared-memory fragments (j0/j1), global track pointers and load registers.
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-115 : loadI<0-7>,  loadF<0-7>
+    // 108-115 overlap loadF<0-7>; sliceIF<0-1> is the 64-bit view of the pair (sliceI, sliceF).
+    108-113 ~ offsetF, offsetIc, offsetFc
+    114-115 : sliceI, sliceF
+    114-115 : sliceIF<0-1>
+
+    116-125 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI
+    126-127 ~ readFs, readIs
+    // Names used after the loop (output code); they reuse registers 72-125.
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-125  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;   // thread index within the block
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X; // block index, X
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y; // block index, Y
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z; // block index, Z
+// Per-thread setup: derive the global offsets and the shared-memory read/write addresses from tid and the block indices.
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT; // P0 = (tid >= 32); wait mask 01 = barrier 1 (S2R tid)
+
+[+ load_zeros() +]
+
+[+ get_mpqk() +]
+
+// tidX = (tid & 7) << 2
+// tidY = tid >> 3
+--:-:-:-:1      LOP.AND tidX, tid,  7;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  3;
+
+// offsetFk = idx_K*64 + tidX   (summed into offsetF / trackF by the loop inserts)
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 6;
+
+// offsetIn = idx_N*64 + tidX   (summed into offsetI / trackI by the loop inserts); wait mask 08 = barrier 4 (S2R idx_N)
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 6;
+
+// writeS = (64*tidY + tidX) * 4   (one base for both tiles; the I stores add 4x<8*64>)
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 6;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+
+// readFs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:0      SHL     readFs, readFs, 4;
+
+// readIs = (((tid >> 1) & 7) << 4) + 4x<szShareF>;   (szShareF = 64*8)
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+// First tile: lanes with P1 read global memory, the others fill the same registers from addr_zero. P1 is not assigned in the visible code above; presumably load_lut() sets it - confirm.
+--:-:1:-:1  @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>];
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>];
+--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero];
+--:-:5:-:2 @!P1 LDS.U.128 loadF4, [addr_zero];
+
+--:-:3:-:1  @P1 LDG.E.128 loadI0, [trackI + 4x< 0>];
+--:-:4:-:1  @P1 LDG.E.128 loadI4, [trackI + 4x<32>];
+--:-:-:-:1 @!P1 LDS.U.128 loadI0, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 loadI4, [addr_zero];
+// Store the tile to shared memory: F at writeS, I at writeS + 4x<8*64>. Wait masks: 11 = barriers 1+5, 02 = barrier 2, 24 = barriers 3+6, 08 = barrier 4.
+11:-:-:-:1      STS.128 [writeS + 4x<0*64 +  0>], loadF0;
+02:-:-:-:1      STS.128 [writeS + 4x<0*64 + 32>], loadF4;
+
+24:-:-:-:1      STS.128 [writeS + 4x<8*64 +  0>], loadI0;
+08:1:-:-:1      STS.128 [writeS + 4x<8*64 + 32>], loadI4;
+
+[+ loop_setup() +]
+// Issue the loads again after loop_setup(); presumably trackF/trackI now address the next tile - confirm in loop_setup().
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>];
+--:-:3:-:1  @P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>];
+--:-:4:-:1  @P1 LDG.E.128 loadI0, [trackI + 4x< 0>];
+--:-:5:-:1  @P1 LDG.E.128 loadI4, [trackI + 4x<32>];
+
+[-
+    our %insert =   # instruction text keyed by j<N>c<M>; presumably spliced into the loop at that position by main_loop() - confirm
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",   # P1 = (posCRST >= 0)
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",   # P0 = (posCRST >= -8)
+        # channel = trunc(posCRST * lutSizeRcp * (1 + 2^-24)); 5.9604644775390625e-08 is 2^-24
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+        # lutOffset = (posCRST - channel*lutSize) * 8, i.e. the byte offset of an 8-byte LUT entry
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+        # store the previously loaded F registers to shared memory (P0 lanes only)
+        j1c37 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*64 +  0>], loadF0;\n",
+        j1c39 => "04:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*64 + 32>], loadF4;\n",
+        # sliceIF is the 64-bit pair (sliceI, sliceF) read from the LUT
+        j1c62 => "02:-:2:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+        # offsetFc = channel * param_KRST; offsetIc = channel * param_DHWN (built with two XMADs)
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",   # unconditional: advance posCRST by -8
+        # offsetF = offsetFk + offsetFc + sliceF; trackF = param_F + (offsetF << 2); offsetI = offsetIn + offsetIc + sliceI
+        j2c29 => "02:-:-:-:1  \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     2;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;\n",
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>];\n",
+        j2c42 => "--:-:3:-:1  \@P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>];\n",
+        # store the previously loaded I registers to shared memory (P0 lanes only)
+        j6c8  => "08:-:-:-:1  \@P0 STS.128 [writeS + 4x<8*64 +  0>], loadI0;\n",
+        j6c10 => "10:4:-:-:1  \@P0 STS.128 [writeS + 4x<8*64 + 32>], loadI4;\n",
+        # trackI = param_I + (offsetI << 2)
+        j6c55 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     2;\n",
+        j6c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;\n",
+        # after the barrier: toggle the 4x<64*8*2> bit in all three shared addresses to switch buffers
+        j6c62 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "08:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n",
+
+        j7c8  => "--:-:4:-:1  \@P1 LDG.E.128 loadI0, [trackI + 4x< 0>];\n",
+        j7c10 => "--:-:5:-:1  \@P1 LDG.E.128 loadI4, [trackI + 4x<32>];\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",   # repeat while P0
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+// After the loop: reload mpqk, tid and idx_N, then compute the addresses and coordinates used by the output code.
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+--:-:2:-:1      S2R tid,   SR_TID.X;
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = (tid & 7) << 2
+// tidOY = tid >> 3
+02:-:-:-:1      LOP.AND tidOX, tid,   7;
+--:-:-:-:1      SHL     tidOX, tidOX, 2;
+--:-:-:-:1      SHR.U32 tidOY, tid,   3;
+// Keep the low 11 bits: clears the 4x<64*8*2> (0x1000) buffer-toggle bit and, for readIs, the 4x<szShareF> (0x800) base.
+--:-:-:-:1      LOP.AND readIs, readIs, 0x7ff;
+--:-:-:-:1      LOP.AND readFs, readFs, 0x7ff;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 64 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 4;
+
+// readCs  = 4 * (tidOX + (tidOY * 64))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = idx_N*64 + tidOX;
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 6;
+
+// Mul by 4 here expands k stride back out
+// k = idx_K*64 + tidOY * 4
+--:-:-:-:1      SHL    tidOY,   tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 6;
+
+[+ output_setup(63, 0, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
\ No newline at end of file
diff --git a/Kernel/Convolution/Maxwell/xconv_direct_updat_64x32.sass b/Kernel/Convolution/Maxwell/xconv_direct_updat_64x32.sass
new file mode 100644
index 0000000..803487e
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/xconv_direct_updat_64x32.sass
@@ -0,0 +1,1077 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+our ($type, $SN, $D);   # declared only; their values must be supplied before this section runs
+our $determ = $D;       # selects the store instruction in output_op()
+our $largeN = !$SN;
+our $dtype        = $type eq 'h' ?        '.U16' : '';     # each setting below has an 'h' (16-bit) value and a default value
+our $convert_in   = $type eq 'h' ? 'F2F.F32.F16' : '';     # empty means no input conversion code is emitted
+our $convert_out  = $type eq 'h' ? 'F2F.F16.F32' : '';
+our $vec_size     = $type eq 'h' ?          '64' : '128';  # suffix of the LDG/LDS vector loads in DO_LOADS
+our $dtype_shift  = $type eq 'h' ?           '1' : '2';    # log2 of $dtype_size
+our $dtype_size   = $type eq 'h' ?           '2' : '4';
+sub dtype       { return $dtype;       }   # accessors for use inside [+ +] template expressions
+sub dtype_shift { return $dtype_shift; }
+sub vec_size    { return $vec_size; }
+sub output_op   { return $determ ? 'STG.E.CG' : 'RED.E.ADD.F32.FTZ.RN'; }   # plain store when $determ is true, atomic add otherwise
+-]
+
+<CONSTANT_MAPPING>
+    // Shared memory sizes in 4-byte words. addr_zero = 4 * 2 * (szShareI + szShareE), just past the two I+E buffers.
+    addr_zero  : 4x<(32 + 64)*33*2>
+    szShareI   : (64*33)
+    szShareE   : (32*33)
+    // Kernel parameters, consecutive 4-byte slots of constant bank 0 from 0x140 to 0x264.
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_C            : c[0x0][0x15c]
+    param_D            : c[0x0][0x160]
+    param_H            : c[0x0][0x164]
+    param_W            : c[0x0][0x168]
+    param_N            : c[0x0][0x16c]
+    param_K            : c[0x0][0x170]
+    param_M            : c[0x0][0x174]
+    param_P            : c[0x0][0x178]
+    param_Q            : c[0x0][0x17c]
+    param_str_d        : c[0x0][0x180]
+    param_str_h        : c[0x0][0x184]
+    param_str_w        : c[0x0][0x188]
+    param_pad_d        : c[0x0][0x18c]
+    param_pad_h        : c[0x0][0x190]
+    param_pad_w        : c[0x0][0x194]
+    param_dil_d        : c[0x0][0x198]
+    param_dil_h        : c[0x0][0x19c]
+    param_dil_w        : c[0x0][0x1a0]
+    param_DHWN         : c[0x0][0x1a4]
+    param_HWN          : c[0x0][0x1a8]
+    param_WN           : c[0x0][0x1ac]
+    param_MPQN16p      : c[0x0][0x1b0]
+    param_MPQN         : c[0x0][0x1b4]
+    param_PQN          : c[0x0][0x1b8]
+    param_QN           : c[0x0][0x1bc]
+    param_PQkc         : c[0x0][0x1c0]
+    param_Qkc          : c[0x0][0x1c4]
+    param_kc           : c[0x0][0x1c8]
+    param_c            : c[0x0][0x1cc]
+    param_k            : c[0x0][0x1d0]
+    param_magic_PQkc   : c[0x0][0x1d4]
+    param_shift_PQkc   : c[0x0][0x1d8]
+    param_magic_Qkc    : c[0x0][0x1dc]
+    param_shift_Qkc    : c[0x0][0x1e0]
+    param_magic_kc     : c[0x0][0x1e4]
+    param_shift_kc     : c[0x0][0x1e8]
+    param_magic_c      : c[0x0][0x1ec]
+    param_shift_c      : c[0x0][0x1f0]
+    param_CTRSK        : c[0x0][0x1f4]
+    param_CTRS         : c[0x0][0x1f8]
+    param_TRS          : c[0x0][0x1fc]
+    param_RS           : c[0x0][0x200]
+    param_S            : c[0x0][0x204]
+    param_magic_TRS    : c[0x0][0x208]
+    param_shift_TRS    : c[0x0][0x20c]
+    param_magic_RS     : c[0x0][0x210]
+    param_shift_RS     : c[0x0][0x214]
+    param_magic_S      : c[0x0][0x218]
+    param_shift_S      : c[0x0][0x21c]
+    param_superM       : c[0x0][0x220]
+    param_superP       : c[0x0][0x224]
+    param_superQ       : c[0x0][0x228]
+    param_superN       : c[0x0][0x22c]
+    param_shiftM       : c[0x0][0x230]
+    param_shiftP       : c[0x0][0x234]
+    param_shiftQ       : c[0x0][0x238]
+    param_strideP      : c[0x0][0x23c]
+    param_strideQ      : c[0x0][0x240]
+    param_stridePQ     : c[0x0][0x244]
+    param_gridP        : c[0x0][0x248]
+    param_gridQ        : c[0x0][0x24c]
+    param_loopX        : c[0x0][0x250]
+    param_loopXp       : c[0x0][0x254]
+    param_loopQ        : c[0x0][0x258]
+    param_loopQp       : c[0x0][0x25c]
+    param_loopN        : c[0x0][0x260]
+    param_loopNp       : c[0x0][0x264]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // Registers 0-63 under two sets of names - czero<00-63> (linear order) and cx<0-7>y<0-7> (permuted order).
+       0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+    // 64-95 are shared by the two shuffle halves and by the j0/j1 shared-memory fragments.
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+
+      64-79 : j0Ex<0-7>, j0Iy<0-7>
+      80-95 : j1Ex<0-7>, j1Iy<0-7>
+    // Global load registers (I0-I3, E0-E1) and their 64-bit track pointers.
+     96-119 : I0<0-3>, I1<0-3>, I2<0-3>, I3<0-3>, E0<0-3>, E1<0-3>
+    120-131 : track0I<0-1>,  track1I<0-1>, track2I<0-1>,  track3I<0-1>, track0E<0-1>, track1E<0-1>
+    // Setup temporaries; this range overlaps the fragment, load and track registers above.
+     64-131 ~ tid, idx_MPQkc, idx_PQkc, idx_Qkc, idx_kc, idx_k, idx_c, magic_PQkc, magic_Qkc, neg_PQkc, neg_Qkc, neg_kc, neg_c, div1, div2, div3, tidX, tidX4, tidY, tid1, readEs2, tid32, tid32_2, neg_TRS, neg_RS, neg_S, super_m, m, mt, k, k16, ctrs<0-3>, trs<0-3>, rs<0-3>, c<0-3>, t<0-3>, z<0-3>
+
+      80-81 : super_p, super_q
+      80-81 : pr, qs
+      82-95 ~ p, te, pIn, qIn, predEt, ti<0-3>, y<0-3>
+      80-95 ~ loopN, N
+    // 132-167 overlaps none of the other ranges in this mapping.
+    132-167 ~ tid7, q, n, idx_K, idx_C, idx_M, idx_P, start_P, idx_Q, start_Q, writeIs, writeEs, readIs, readEs, swapBuf, writeFs, predI, predE, init, x<0-3>, czOffset<0-3>, r<0-3>, s<0-3>, kmOffset
+    // Further names reusing 96-131; presumably for the output phase (track..F, f.._, alpha) - confirm.
+     96-103 : track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
+    104-119 ~ f00_<0-3>, f04_<0-3>, f08_<0-3>, f12_<0-3>
+    104-119 ~ Tid, tid_31, tid_32, K, K16, tf, idx_MPQ, xmad_determ
+    120-131 ~ alpha, readFs, K1, kk, crst<00|04|08|12>
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,       SR_TID.X;
+--:-:2:-:1      S2R idx_MPQkc, SR_CTAID.X;
+--:-:3:-:1      S2R idx_C,     SR_CTAID.Y;
+--:-:4:-:1      S2R idx_K,     SR_CTAID.Z;
+// Split the X block index into idx_M, idx_P, idx_Q, idx_k, idx_c, then fold idx_c / idx_k into idx_C / idx_K.
+<SCHEDULE_BLOCK>
+// Zero 16 bytes at addr_zero, then read them back into czero00, czero04, ..., czero60 (16 generated LDS.U.128).
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// idx_M = idx_MPQkc / blk_PQkc   (magic-number multiply + shift; a magic of 1 means plain shift)
+--:-:-:-:1      MOV  magic_PQkc, param_magic_PQkc;
+--:-:-:-:1      ISETP.NE.AND P0, PT,   magic_PQkc, 1, PT;
+02:-:-:-:1  @P0 XMAD     div1, idx_MPQkc,    magic_PQkc,    RZ;
+--:-:-:-:1  @P0 XMAD     div2, idx_MPQkc,    magic_PQkc.H1, RZ;
+--:-:-:-:1  @P0 XMAD     div3, idx_MPQkc.H1, magic_PQkc.H1, RZ;
+--:-:-:-:1  @P0 XMAD.CHI div1, idx_MPQkc.H1, magic_PQkc,    div1;
+--:-:-:-:1  @P0 IADD3.RS idx_M, div1, div2, div3;
+--:-:-:-:1  @P0 SHR.U32  idx_M, idx_M,     param_shift_PQkc;
+--:-:-:-:1 @!P0 SHR.U32  idx_M, idx_MPQkc, param_shift_PQkc;
+
+// idx_PQkc = idx_MPQkc % blk_PQkc   (computed as idx_MPQkc - idx_M * param_PQkc)
+--:-:-:-:1      IADD neg_PQkc, RZ, -param_PQkc;
+--:-:-:-:1      XMAD.LO2 idx_PQkc, neg_PQkc, idx_M, idx_MPQkc;
+
+// idx_P = idx_PQkc / blk_Qkc
+--:-:-:-:1      MOV  magic_Qkc, param_magic_Qkc;
+--:-:-:-:1      ISETP.NE.AND P1, PT,  magic_Qkc, 1, PT;
+--:-:-:-:1  @P1 XMAD     div1, idx_PQkc,    magic_Qkc,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, idx_PQkc,    magic_Qkc.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, idx_PQkc.H1, magic_Qkc.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, idx_PQkc.H1, magic_Qkc,    div1;
+--:-:-:-:1  @P1 IADD3.RS idx_P, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  idx_P, idx_P,    param_shift_Qkc;
+--:-:-:-:1 @!P1 SHR.U32  idx_P, idx_PQkc, param_shift_Qkc;
+
+// idx_Qkc = idx_PQkc % blk_Qkc   (computed as idx_PQkc - idx_P * param_Qkc)
+--:-:-:-:1      IADD neg_Qkc, RZ, -param_Qkc;
+--:-:-:-:1      XMAD.LO2 idx_Qkc, neg_Qkc, idx_P, idx_PQkc;
+
+// idx_Q  = idx_Qkc / kc
+--:-:-:-:1      XMAD.LO2C idx_Q, idx_Qkc, param_magic_kc, RZ;
+--:-:-:-:1      SHR.U32   idx_Q, idx_Q,   param_shift_kc;
+// idx_kc = idx_Qkc % kc
+--:-:-:-:1      IADD neg_kc, RZ, -param_kc;
+--:-:-:-:1      XMAD.S16.U16  idx_kc, neg_kc, idx_Q, idx_Qkc;
+
+// idx_k = idx_kc / c
+--:-:-:-:1      XMAD    idx_k,  idx_kc, param_magic_c, RZ;
+--:-:-:-:1      SHR.U32 idx_k,  idx_k,  param_shift_c;
+// idx_c = idx_kc % c
+--:-:-:-:1      IADD neg_c, RZ, -param_c;
+--:-:-:-:1      XMAD.S16.U16 idx_c, neg_c, idx_k, idx_kc;
+
+// idx_C = idx_C * blk_c + idx_c
+// idx_K = idx_K * blk_k + idx_k
+04:-:-:-:1      XMAD idx_C, idx_C, param_c, idx_c;
+08:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+// Remember the starting P/Q; DO_LOADS resets idx_Q to start_Q when it wraps.
+--:-:-:-:1      MOV start_P, idx_P;
+--:-:-:-:1      MOV start_Q, idx_Q;
+
+// tidX   = tid >> 3
+// tidY   = (tid & 7) << 2
+// shiftX = tidY
+01:-:-:-:1      SHR.U32 tidX,   tid,  3;
+--:-:-:-:1      LOP.AND tid7,   tid,  7;
+--:-:-:-:1      SHL     tidY,   tid7, 2;
+
+// writeIs = (tidY*64 + tidX + shiftX) * 4
+--:-:-:-:1      ISCADD writeIs, tidY, tidX, 6;
+--:-:-:-:1      IADD   writeIs, writeIs, tidY;
+--:-:-:-:1      SHL    writeIs, writeIs, 2;
+
+// writeEs = (tidY*32 + tidX + shiftX) * 4 + 4x<szShareI>
+--:-:-:-:1      ISCADD writeEs, tidY, tidX, 5;
+--:-:-:-:1      IADD   writeEs, writeEs, tidY;
+--:-:-:-:1      ISCADD writeEs, writeEs, 4x<szShareI>, 2;
+
+// readEs  = ((tid >> 1) & 3) << 4   (only the bit-field extract here; the << 4 is applied below)
+--:-:-:-:1      BFE.U32 readEs, tid, 0x201; // 2 bits at position 1
+
+// readIs = (((tid & 24) >> 2) | (tid & 1)) << 4   (the << 4 is applied below)
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readIs, tid,   24;
+--:-:-:-:1      SHR.U32 readIs, readIs, 2;
+--:-:-:-:1      LOP.OR  readIs, readIs, tid1;
+
+// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5)
+// tid32 = tid & -32
+--:-:-:-:1      LOP.AND tid32, tid, -32;
+
+// readEs2 = readEs + (tid32 >> 2) + (readIs << 2)   (using the still-unshifted readEs / readIs)
+--:-:-:-:1      SHR.U32 tid32_2, tid32, 2;
+--:-:-:-:1      IADD    readEs2, tid32_2, readEs;
+--:-:-:-:1      ISCADD  readEs2, readIs, readEs2, 2;
+// Apply the deferred << 4 to all three read indices.
+--:-:-:-:1      SHL readIs,  readIs,  4;
+--:-:-:-:1      SHL readEs,  readEs,  4;
+--:-:-:-:1      SHL readEs2, readEs2, 4;
+
+// writeFs = readIs*32*4 + readEs2
+--:-:-:-:1      ISCADD writeFs, readIs, readEs2, 7;
+
+// Each block of 32 threads works on 8 lines,
+// Also shift over each 8 lines by 8 (cumulative)
+// readIs += tid32/4 * 64 * 4 + tid32/4 * 4
+// readEs += tid32/4 * 32 * 4 + tid32/4 * 4 + 4x<szShareI>
+--:-:-:-:1      ISCADD readIs, tid32,  readIs, 6;
+--:-:-:-:1      ISCADD readEs, tid32,  readEs, 5;
+--:-:-:-:1      IADD   readIs, readIs, tid32;
+--:-:-:-:1      IADD3  readEs, readEs, 4x<szShareI>, tid32;
+// swapBuf = byte size of one I+E buffer; added to the write addresses (and then negated) after each fill
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareI + szShareE>;
+
+// Remap ctrs for better L1 cache performance with small N
+// Maximize the amount of overlapping data requested within a warp.
+// The L1 is partitioned in to 2 groups of 2 warps.
+// ctrs<0-3> = idx_C*64 + tidX*4 + {0,1,2,3}
+--:-:-:-:1      SHL    tidX4, tidX,  2;
+--:-:-:-:1      ISCADD ctrs0, idx_C, tidX4, 6;
+--:-:-:-:1      IADD   ctrs1, ctrs0, 1;
+--:-:-:-:1      IADD   ctrs2, ctrs0, 2;
+--:-:-:-:1      IADD   ctrs3, ctrs0, 3;
+
+// c   = ctrs / TRS   (multiply by param_magic_TRS, shift right by param_shift_TRS)
+--:-:-:-:1      XMAD.LO2C c0, ctrs0, param_magic_TRS, RZ;
+--:-:-:-:1      XMAD.LO2C c1, ctrs1, param_magic_TRS, RZ;
+--:-:-:-:1      XMAD.LO2C c2, ctrs2, param_magic_TRS, RZ;
+--:-:-:-:1      XMAD.LO2C c3, ctrs3, param_magic_TRS, RZ;
+--:-:-:-:1      SHR.U32   c0,    c0, param_shift_TRS;
+--:-:-:-:1      SHR.U32   c1,    c1, param_shift_TRS;
+--:-:-:-:1      SHR.U32   c2,    c2, param_shift_TRS;
+--:-:-:-:1      SHR.U32   c3,    c3, param_shift_TRS;
+// trs = ctrs % TRS   (computed as ctrs - c * param_TRS)
+--:-:-:-:1      IADD neg_TRS, RZ, -param_TRS;
+--:-:-:-:1      XMAD.S16.U16 trs0, neg_TRS, c0, ctrs0;
+--:-:-:-:1      XMAD.S16.U16 trs1, neg_TRS, c1, ctrs1;
+--:-:-:-:1      XMAD.S16.U16 trs2, neg_TRS, c2, ctrs2;
+--:-:-:-:1      XMAD.S16.U16 trs3, neg_TRS, c3, ctrs3;
+
+// t =  trs / RS
+--:-:-:-:1      XMAD    t0, trs0, param_magic_RS, RZ;
+--:-:-:-:1      XMAD    t1, trs1, param_magic_RS, RZ;
+--:-:-:-:1      XMAD    t2, trs2, param_magic_RS, RZ;
+--:-:-:-:1      XMAD    t3, trs3, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32 t0,   t0, param_shift_RS;
+--:-:-:-:1      SHR.U32 t1,   t1, param_shift_RS;
+--:-:-:-:1      SHR.U32 t2,   t2, param_shift_RS;
+--:-:-:-:1      SHR.U32 t3,   t3, param_shift_RS;
+// rs = trs % RS   (computed as trs - t * param_RS)
+--:-:-:-:1      IADD neg_RS, RZ, -param_RS;
+--:-:-:-:1      XMAD.S16.U16 rs0, neg_RS, t0, trs0;
+--:-:-:-:1      XMAD.S16.U16 rs1, neg_RS, t1, trs1;
+--:-:-:-:1      XMAD.S16.U16 rs2, neg_RS, t2, trs2;
+--:-:-:-:1      XMAD.S16.U16 rs3, neg_RS, t3, trs3;
+
+// r = rs / S
+--:-:-:-:1      XMAD    r0, rs0, param_magic_S, RZ;
+--:-:-:-:1      XMAD    r1, rs1, param_magic_S, RZ;
+--:-:-:-:1      XMAD    r2, rs2, param_magic_S, RZ;
+--:-:-:-:1      XMAD    r3, rs3, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32 r0,  r0, param_shift_S;
+--:-:-:-:1      SHR.U32 r1,  r1, param_shift_S;
+--:-:-:-:1      SHR.U32 r2,  r2, param_shift_S;
+--:-:-:-:1      SHR.U32 r3,  r3, param_shift_S;
+// s = rs % S   (computed as rs - r * param_S)
+--:-:-:-:1      IADD neg_S, RZ, -param_S;
+--:-:-:-:1      XMAD.S16.U16 s0, neg_S, r0, rs0;
+--:-:-:-:1      XMAD.S16.U16 s1, neg_S, r1, rs1;
+--:-:-:-:1      XMAD.S16.U16 s2, neg_S, r2, rs2;
+--:-:-:-:1      XMAD.S16.U16 s3, neg_S, r3, rs3;
+
+--:-:-:-:1      LOP.AND n, tid, param_superN; // n = (tid & superN) << 2. NOTE(review): DO_LOADS recomputes n from tid7 (tid & 7); identical only if param_superN <= 7 - confirm
+--:-:-:-:1      SHL n, n, 2;
+
+// M,C,K are static coords so compute offsets and predicates once
+--:-:-:-:1      SHL m, idx_M, param_shiftM;
+--:-:-:-:1      BFE.U32 super_m, tid7, param_superM;
+--:-:-:-:1      IADD m, m, super_m;
+
+// z = m * str_d - pad_d + (t * dil_d)
+--:-:-:-:1      XMAD  mt, m,   param_str_d, RZ;
+
+--:-:-:-:1      XMAD  z0, t0,  param_dil_d, mt;
+--:-:-:-:1      XMAD  z1, t1,  param_dil_d, mt;
+--:-:-:-:1      XMAD  z2, t2,  param_dil_d, mt;
+--:-:-:-:1      XMAD  z3, t3,  param_dil_d, mt;
+--:-:-:-:1      IADD  z0, z0, -param_pad_d;
+--:-:-:-:1      IADD  z1, z1, -param_pad_d;
+--:-:-:-:1      IADD  z2, z2, -param_pad_d;
+--:-:-:-:1      IADD  z3, z3, -param_pad_d;
+
+// czOffset = c*DHWN + z*HWN
+--:-:-:-:1      XMAD.LO2C czOffset0, c0, param_DHWN, RZ;
+--:-:-:-:1      XMAD.LO2C czOffset1, c1, param_DHWN, RZ;
+--:-:-:-:1      XMAD.LO2C czOffset2, c2, param_DHWN, RZ;
+--:-:-:-:1      XMAD.LO2C czOffset3, c3, param_DHWN, RZ;
+--:-:-:-:1      XMAD.S16.U16.LO2C czOffset0, z0, param_HWN,  czOffset0;
+--:-:-:-:1      XMAD.S16.U16.LO2C czOffset1, z1, param_HWN,  czOffset1;
+--:-:-:-:1      XMAD.S16.U16.LO2C czOffset2, z2, param_HWN,  czOffset2;
+--:-:-:-:1      XMAD.S16.U16.LO2C czOffset3, z3, param_HWN,  czOffset3;
+// P<i> = (c<i> < C) && (z<i> < D) && (z<i> >= 0); the four bits are packed into predI and shifted left by 8
+--:-:-:-:1      ISETP.LT.AND P0, PT, c0, param_C, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, c1, param_C, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, c2, param_C, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, c3, param_C, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, z0, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_D, P1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_D, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_D, P3;
+--:-:-:-:1      ISETP.GE.AND P0, PT, z0, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;
+--:-:-:-:1      P2R predI, PR, RZ, 0x0f;
+--:-:-:-:1      SHL predI, predI, 8;
+
+// k = idx_K*32 + tidX
+--:-:-:-:1      ISCADD k, idx_K, tidX, 5;
+
+// kmOffset = k*MPQN + m*PQN
+--:-:-:-:1      XMAD.LO2C kmOffset, k, param_MPQN, RZ;
+--:-:-:-:1      XMAD.LO2C kmOffset, m, param_PQN,  kmOffset;
+// P0 = (k < K) && (m < M), P1 = (k + 16 < K) && (m < M); the two bits are packed into predE and shifted left by 2
+--:-:-:-:1      IADD k16, k, 16;
+--:-:-:-:1      ISETP.LT.AND P4, PT, m,   param_M, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, k,   param_K, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k16, param_K, P4;
+--:-:-:-:1      P2R predE, PR, RZ, 0x03;
+--:-:-:-:1      SHL predE, predE, 2;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      CAL CALC_OFFSETS;
+--:-:-:-:5      CAL DO_LOADS;
+--:-:-:-:5      CAL CALC_OFFSETS;
+// Optional input conversion: emitted only when $convert_in is non-empty. Each 16-bit half is expanded highest destination first, so no source is overwritten before it is read.
+[+
+    our $convert_in;
+    return $convert_in ? qq{
+02:-:-:-:1      $convert_in I03, I01.H1;
+--:-:-:-:1      $convert_in I02, I01.H0;
+--:-:-:-:1      $convert_in I01, I00.H1;
+--:-:-:-:1      $convert_in I00, I00.H0;
+
+--:-:-:-:1      $convert_in I13, I11.H1;
+--:-:-:-:1      $convert_in I12, I11.H0;
+--:-:-:-:1      $convert_in I11, I10.H1;
+--:-:2:-:1      $convert_in I10, I10.H0;
+
+04:-:-:-:1      $convert_in I23, I21.H1;
+--:-:-:-:1      $convert_in I22, I21.H0;
+--:-:-:-:1      $convert_in I21, I20.H1;
+--:-:-:-:1      $convert_in I20, I20.H0;
+
+--:-:-:-:1      $convert_in I33, I31.H1;
+--:-:-:-:1      $convert_in I32, I31.H0;
+--:-:-:-:1      $convert_in I31, I30.H1;
+--:-:3:-:1      $convert_in I30, I30.H0;
+
+08:-:-:-:1      $convert_in E03, E01.H1;
+--:-:-:-:1      $convert_in E02, E01.H0;
+--:-:-:-:1      $convert_in E01, E00.H1;
+--:-:4:-:1      $convert_in E00, E00.H0;
+
+10:-:-:-:1      $convert_in E13, E11.H1;
+--:-:-:-:1      $convert_in E12, E11.H0;
+--:-:-:-:1      $convert_in E11, E10.H1;
+--:-:5:-:1      $convert_in E10, E10.H0;
+        } : '';
++]
+// Scatter the loaded registers to shared memory: I<g><r> goes to row r (64-word stride) at column offset g*16.
+02:-:-:-:1      STS [writeIs + 4x<0*64 + 0*16>], I00;
+--:-:-:-:1      STS [writeIs + 4x<1*64 + 0*16>], I01;
+--:-:-:-:1      STS [writeIs + 4x<2*64 + 0*16>], I02;
+--:-:-:-:1      STS [writeIs + 4x<3*64 + 0*16>], I03;
+
+--:-:-:-:1      STS [writeIs + 4x<0*64 + 1*16>], I10;
+--:-:-:-:1      STS [writeIs + 4x<1*64 + 1*16>], I11;
+--:-:-:-:1      STS [writeIs + 4x<2*64 + 1*16>], I12;
+--:-:-:-:1      STS [writeIs + 4x<3*64 + 1*16>], I13;
+
+04:-:-:-:1      STS [writeIs + 4x<0*64 + 2*16>], I20;
+--:-:-:-:1      STS [writeIs + 4x<1*64 + 2*16>], I21;
+--:-:-:-:1      STS [writeIs + 4x<2*64 + 2*16>], I22;
+--:-:-:-:1      STS [writeIs + 4x<3*64 + 2*16>], I23;
+
+--:-:-:-:1      STS [writeIs + 4x<0*64 + 3*16>], I30;
+--:-:-:-:1      STS [writeIs + 4x<1*64 + 3*16>], I31;
+--:-:-:-:1      STS [writeIs + 4x<2*64 + 3*16>], I32;
+--:-:-:-:1      STS [writeIs + 4x<3*64 + 3*16>], I33;
+// Same for E: E<g><r> goes to row r (32-word stride) at column offset g*16.
+08:-:-:-:1      STS [writeEs + 4x<0*32 + 0*16>], E00;
+--:-:-:-:1      STS [writeEs + 4x<1*32 + 0*16>], E01;
+--:-:-:-:1      STS [writeEs + 4x<2*32 + 0*16>], E02;
+--:-:-:-:1      STS [writeEs + 4x<3*32 + 0*16>], E03;
+
+10:-:-:-:1      STS [writeEs + 4x<0*32 + 1*16>], E10;
+--:-:-:-:1      STS [writeEs + 4x<1*32 + 1*16>], E11;
+--:-:-:-:1      STS [writeEs + 4x<2*32 + 1*16>], E12;
+--:-:-:-:1      STS [writeEs + 4x<3*32 + 1*16>], E13;
+
+// init = bNextY ? 1 : 0   (bNextY is P6, which DO_LOADS sets to idx_P < param_gridP)
+--:-:-:-:0      SEL init, RZ, 1, !P6;
+
+--:-:-:-:5      BAR.SYNC 0;
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD writeIs, writeIs, swapBuf;
+--:-:-:-:1      IADD writeEs, writeEs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;
+// Read the first j0 fragments from the buffer that was just written.
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy4, [readIs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*32 + 16>];
+</SCHEDULE_BLOCK>
+--:-:-:-:5      CAL DO_LOADS;
+
+// init += bNextY ? 1 : 0   (P6 as updated by the DO_LOADS call just above)
+--:-:-:-:0  @P6 IADD init, init, 1;
+
+--:-:-:-:5      CAL CALC_OFFSETS;
+--:-:-:-:5      BRA.U MAIN_LOOP;
+
+DO_LOADS:
+// Subroutine: issue the I and E loads under the predI / predE masks, then advance n, idx_Q and idx_P and refresh P4-P6.
+<SCHEDULE_BLOCK>
+<ORDERED>
+--:-:-:-:1      R2P PR, predI, 0x0f;
+--:-:2:-:1  @P0 LDG.E.CI.[+ vec_size() +] I0, [track0I];
+--:-:2:-:1  @P1 LDG.E.CI.[+ vec_size() +] I1, [track1I];
+--:-:3:-:1  @P2 LDG.E.CI.[+ vec_size() +] I2, [track2I];
+--:-:3:-:1  @P3 LDG.E.CI.[+ vec_size() +] I3, [track3I];
+--:-:-:-:1 @!P0 LDS.U.[+ vec_size() +]    I0, [addr_zero];
+--:-:-:-:1 @!P1 LDS.U.[+ vec_size() +]    I1, [addr_zero];
+--:-:-:-:1 @!P2 LDS.U.[+ vec_size() +]    I2, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.[+ vec_size() +]    I3, [addr_zero];
+// Same pattern for E: predE bits 0-1 choose between LDG from track<n>E and LDS from addr_zero.
+--:-:-:-:1      R2P PR, predE, 0x03;
+--:-:4:-:1  @P0 LDG.E.CI.[+ vec_size() +] E0, [track0E];
+--:6:5:-:1  @P1 LDG.E.CI.[+ vec_size() +] E1, [track1E];
+--:-:-:-:1 @!P0 LDS.U.[+ vec_size() +]    E0, [addr_zero];
+--:-:2:-:1 @!P1 LDS.U.[+ vec_size() +]    E1, [addr_zero];
+</ORDERED>
+
+// Advance offset/preds
+--:-:-:-:1      IADD n, n, param_loopN;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+// n reached param_N: reset n to (tid7 & param_superN) << 2 and step idx_Q.
+--:-:-:-:1 @!P4 LOP.AND n, tid7, param_superN;
+--:-:-:-:1 @!P4 SHL n, n, 2;
+--:-:-:-:1 @!P4 IADD idx_Q, idx_Q, param_strideQ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, idx_Q, param_gridQ, PT;
+// idx_Q reached param_gridQ: rewind it to start_Q and step idx_P.
+--:-:-:-:1 @!P5 MOV  idx_Q, start_Q;
+--:-:-:-:1 @!P5 IADD idx_P, idx_P, param_strideP;
+// P6 = idx_P < param_gridP; P5 is then re-evaluated with P6 ANDed in.
+--:-:-:-:1      ISETP.LT.AND P6, PT, idx_P, param_gridP, PT;
+--:-:-:-:0      ISETP.LT.AND P5, PT, idx_Q, param_gridQ, P6;
+// When P6 is false both load masks are cleared.
+--:-:-:-:1 @!P6 MOV predI, RZ;
+--:-:-:-:1 @!P6 MOV predE, RZ;
+
+</SCHEDULE_BLOCK>
+--:-:-:-:5      RET;
+
+--:-:-:-:1      NOP;
+--:-:-:-:1      NOP;
+--:-:-:-:1      NOP;
+--:-:-:-:1      NOP;
+
+CALC_OFFSETS:
+// Subroutine: from idx_P / idx_Q derive p and q, the y/x input coordinates, the trackI / trackE addresses and the predI / predE masks.
+<SCHEDULE_BLOCK>
+// Calc superblock coordinates in m,p,q space
+--:-:-:-:1      SHL p, idx_P, param_shiftP;
+--:-:-:-:1      SHL q, idx_Q, param_shiftQ;
+
+// Calc this thread's offset within the superblock
+--:-:-:-:1      BFE.U32 super_p, tid7, param_superP;
+--:-:-:-:1      BFE.U32 super_q, tid7, param_superQ;
+
+// Combine offsets for final m,p,q coordinate
+--:-:-:-:1      IADD p, p, super_p;
+--:-:-:-:1      IADD q, q, super_q;
+
+// y = p * str_h - pad_h + (r * dil_h)
+// x = q * str_w - pad_w + (s * dil_w)
+--:-:-:-:1      XMAD  pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD  qs, q,   param_str_w, RZ;
+
+--:-:-:-:1      XMAD  y0, r0,  param_dil_h, pr;
+--:-:-:-:1      XMAD  y1, r1,  param_dil_h, pr;
+--:-:-:-:1      XMAD  y2, r2,  param_dil_h, pr;
+--:-:-:-:1      XMAD  y3, r3,  param_dil_h, pr;
+--:-:-:-:1      IADD  y0, y0, -param_pad_h;
+--:-:-:-:1      IADD  y1, y1, -param_pad_h;
+--:-:-:-:1      IADD  y2, y2, -param_pad_h;
+--:-:-:-:1      IADD  y3, y3, -param_pad_h;
+
+--:-:-:-:1      XMAD  x0, s0,  param_dil_w, qs;
+--:-:-:-:1      XMAD  x1, s1,  param_dil_w, qs;
+--:-:-:-:1      XMAD  x2, s2,  param_dil_w, qs;
+--:-:-:-:1      XMAD  x3, s3,  param_dil_w, qs;
+--:-:-:-:1      IADD  x0, x0, -param_pad_w;
+--:-:-:-:1      IADD  x1, x1, -param_pad_w;
+--:-:-:-:1      IADD  x2, x2, -param_pad_w;
+--:-:-:-:1      IADD  x3, x3, -param_pad_w;
+
+// trackI = c*DHWN + z*HWN + y*WN + x*N + n
+--:-:-:-:1      XMAD.S16.U16.LO2C ti0, y0, param_WN, n;
+--:-:-:-:1      XMAD.S16.U16.LO2C ti1, y1, param_WN, n;
+--:-:-:-:1      XMAD.S16.U16.LO2C ti2, y2, param_WN, n;
+--:-:-:-:1      XMAD.S16.U16.LO2C ti3, y3, param_WN, n;
+--:-:-:-:1      XMAD.S16.U16 ti0, x0, param_N,  ti0;
+--:-:-:-:1      XMAD.S16.U16 ti1, x1, param_N,  ti1;
+--:-:-:-:1      XMAD.S16.U16 ti2, x2, param_N,  ti2;
+--:-:-:-:1      XMAD.S16.U16 ti3, x3, param_N,  ti3;
+--:-:-:-:1      IADD ti0, ti0, czOffset0;
+--:-:-:-:1      IADD ti1, ti1, czOffset1;
+--:-:-:-:1      IADD ti2, ti2, czOffset2;
+--:-:-:-:1      IADD ti3, ti3, czOffset3;
+// Low word via LEA with carry; ISET(ti < 0) then supplies the word IADD.X adds to param_I[1] (NOTE(review): presumably -1/0 sign extension of ti - confirm).
+20:-:-:-:1      LEA    track0I0.CC, ti0, param_I[0], [+ dtype_shift() +];
+--:-:-:-:1      ISET.LT.AND    ti0, ti0, RZ, PT;
+--:-:-:-:1      IADD.X track0I1,    ti0, param_I[1];
+--:-:-:-:1      LEA    track1I0.CC, ti1, param_I[0], [+ dtype_shift() +];
+--:-:-:-:1      ISET.LT.AND    ti1, ti1, RZ, PT;
+--:-:-:-:1      IADD.X track1I1,    ti1, param_I[1];
+--:-:-:-:1      LEA    track2I0.CC, ti2, param_I[0], [+ dtype_shift() +];
+--:-:-:-:1      ISET.LT.AND    ti2, ti2, RZ, PT;
+--:-:-:-:1      IADD.X track2I1,    ti2, param_I[1];
+--:-:-:-:1      LEA    track3I0.CC, ti3, param_I[0], [+ dtype_shift() +];
+--:-:-:-:1      ISET.LT.AND    ti3, ti3, RZ, PT;
+--:-:-:-:1      IADD.X track3I1,    ti3, param_I[1];
+// Rebuild predI: drop the low byte, load P0-P3 from the next nibble, AND in the y bounds and write them back.
+--:-:-:-:1      SHR.U32 predI, predI, 8;
+--:-:-:-:1      R2P PR, predI, 0x0f;
+--:-:-:-:1      SHL     predI, predI, 4;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, y0, param_H, P0;
+--:-:-:-:1      ISETP.LT.AND P1, PT, y1, param_H, P1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y2, param_H, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y3, param_H, P3;
+--:-:-:-:1      ISETP.GE.AND P0, PT, y0, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, y1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, y2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, y3, RZ, P3;
+--:-:-:-:1      P2R predI, PR, predI, 0x0f;
+--:-:-:-:1      SHL predI, predI, 4;
+// The y result was shifted up a nibble; the x bounds are ANDed on top and stored in the low nibble.
+--:-:-:-:1      ISETP.LT.AND P0, PT, x0, param_W, P0;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_W, P1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_W, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_W, P3;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x0, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      P2R predI, PR, predI, 0x0f;
+
+// trackE = k*MPQN + m*PQN + p*QN + q*N + n
+--:-:-:-:1      XMAD.LO2C te, p, param_QN,   n;
+--:-:-:-:1      XMAD      te, q, param_N,    te;
+--:-:-:-:1      IADD      te, te, kmOffset;
+
+--:-:-:-:1      LEA      track0E0.CC, te, param_E[0],     [+ dtype_shift() +];
+--:-:-:-:1      LEA.HI.X track0E1,    te, param_E[1], RZ, [+ dtype_shift() +];
+--:-:-:-:1      IADD     track1E0.CC, track0E0, param_MPQN16p;
+--:-:-:-:0      IADD.X   track1E1,    track0E1, RZ;
+// NOTE(review): qIn tests p < P and pIn tests q < Q (names look swapped); both feed the same LOP3 below, so this routine's result does not depend on which is which.
+--:-:-:-:1      ISET.LT.AND qIn, p, param_P, PT;
+--:-:-:-:1      ISET.LT.AND pIn, q, param_Q, PT;
+--:-:-:-:1      SHR.U32  predEt, predE, 2;
+--:-:-:-:1      LOP3.LUT predEt, predEt, pIn, qIn, 0x80;
+--:-:-:-:1      BFI predE, predEt, 0x200, predE;
+
+</SCHEDULE_BLOCK>
+--:-:-:-:5      RET;
+
+
+MAIN_LOOP:
+[+
+    our ($vec_size, $convert_in, $largeN);
+    my %insert = (
+        # Extra instructions keyed "j<J>c<C>": the text is emitted right after FFMA number C of unroll step J.
+        j0c8  => "--:-:-:-:1      R2P PR, predI, 0x0f;\n",
+        # DEPBAR placement: j1/j2 wait on SB1 and j3/j4 on SB2 in both branches, one wait per I vector ahead of its STS.
+        $convert_in ? (
+            j1c5  => "--:-:-:-:1      DEPBAR.LE SB1, 1;\n",
+            j1c8  => "--:-:-:-:1      $convert_in I03, I01.H1;\n",
+            j1c10 => "--:-:-:-:1      $convert_in I02, I01.H0;\n",
+            j1c12 => "--:-:-:-:1      $convert_in I01, I00.H1;\n",
+            j1c14 => "--:-:6:-:1      $convert_in I00, I00.H0;\n",
+
+            j2c5  => "--:-:-:-:1      DEPBAR.LE SB1, 1;\n",
+            j2c8  => "--:-:-:-:1      $convert_in I13, I11.H1;\n",
+            j2c10 => "--:-:-:-:1      $convert_in I12, I11.H0;\n",
+            j2c12 => "--:-:-:-:1      $convert_in I11, I10.H1;\n",
+            j2c14 => "--:-:6:-:1      $convert_in I10, I10.H0;\n",
+
+            j3c5  => "--:-:-:-:1      DEPBAR.LE SB2, 1;\n",
+            j3c8  => "--:-:-:-:1      $convert_in I23, I21.H1;\n",
+            j3c10 => "--:-:-:-:1      $convert_in I22, I21.H0;\n",
+            j3c12 => "--:-:-:-:1      $convert_in I21, I20.H1;\n",
+            j3c14 => "--:-:6:-:1      $convert_in I20, I20.H0;\n",
+
+            j4c5  => "--:-:-:-:1      DEPBAR.LE SB2, 1;\n",
+            j4c8  => "--:-:-:-:1      $convert_in I33, I31.H1;\n",
+            j4c10 => "--:-:-:-:1      $convert_in I32, I31.H0;\n",
+            j4c12 => "--:-:-:-:1      $convert_in I31, I30.H1;\n",
+            j4c14 => "--:-:6:-:1      $convert_in I30, I30.H0;\n",
+
+            j5c8  => "08:-:-:-:1      $convert_in E03, E01.H1;\n",
+            j5c10 => "--:-:-:-:1      $convert_in E02, E01.H0;\n",
+            j5c12 => "--:-:-:-:1      $convert_in E01, E00.H1;\n",
+            j5c14 => "--:-:4:-:1      $convert_in E00, E00.H0;\n",
+
+            j6c8  => "10:-:-:-:1      $convert_in E13, E11.H1;\n",
+            j6c10 => "--:-:-:-:1      $convert_in E12, E11.H0;\n",
+            j6c12 => "--:-:-:-:1      $convert_in E11, E10.H1;\n",
+            j6c14 => "--:-:5:-:1      $convert_in E10, E10.H0;\n",
+        ) : (
+            j1c27 => "--:-:-:-:1      DEPBAR.LE SB1, 1;\n",
+            j2c27 => "--:-:-:-:1      DEPBAR.LE SB1, 1;\n",
+            j3c27 => "--:-:-:-:1      DEPBAR.LE SB2, 1;\n",
+            j4c27 => "--:-:-:-:1      DEPBAR.LE SB2, 1;\n",
+        ),
+        # j1..j4: write I0..I3 out through writeIs, then refill each from track<n>I, or from addr_zero when its predicate is off.
+        j1c30 => "20:-:-:-:1      STS [writeIs + 4x<0*64 + 0*16>], I00;\n",
+        j1c32 => "--:-:-:-:1      STS [writeIs + 4x<1*64 + 0*16>], I01;\n",
+        j1c34 => "--:-:-:-:1      STS [writeIs + 4x<2*64 + 0*16>], I02;\n",
+        j1c36 => "--:6:-:-:1      STS [writeIs + 4x<3*64 + 0*16>], I03;\n",
+        j1c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero];\n",
+        j1c60 => "20:-:2:-:1  \@P0 LDG.E.CI.$vec_size I0, [track0I];\n",
+
+        j2c30 => "20:-:-:-:1      STS [writeIs + 4x<0*64 + 1*16>], I10;\n",
+        j2c32 => "--:-:-:-:1      STS [writeIs + 4x<1*64 + 1*16>], I11;\n",
+        j2c34 => "--:-:-:-:1      STS [writeIs + 4x<2*64 + 1*16>], I12;\n",
+        j2c36 => "--:6:-:-:1      STS [writeIs + 4x<3*64 + 1*16>], I13;\n",
+        j2c38 => "--:-:-:-:1 \@!P1 LDS.U.$vec_size I1, [addr_zero];\n",
+        j2c60 => "20:-:2:-:1  \@P1 LDG.E.CI.$vec_size I1, [track1I];\n",
+
+        j3c30 => "20:-:-:-:1      STS [writeIs + 4x<0*64 + 2*16>], I20;\n",
+        j3c32 => "--:-:-:-:1      STS [writeIs + 4x<1*64 + 2*16>], I21;\n",
+        j3c34 => "--:-:-:-:1      STS [writeIs + 4x<2*64 + 2*16>], I22;\n",
+        j3c36 => "--:6:-:-:1      STS [writeIs + 4x<3*64 + 2*16>], I23;\n",
+        j3c38 => "--:-:-:-:1 \@!P2 LDS.U.$vec_size I2, [addr_zero];\n",
+        j3c60 => "20:-:3:-:1  \@P2 LDG.E.CI.$vec_size I2, [track2I];\n",
+
+        j4c30 => "20:-:-:-:1      STS [writeIs + 4x<0*64 + 3*16>], I30;\n",
+        j4c32 => "--:-:-:-:1      STS [writeIs + 4x<1*64 + 3*16>], I31;\n",
+        j4c34 => "--:-:-:-:1      STS [writeIs + 4x<2*64 + 3*16>], I32;\n",
+        j4c36 => "--:6:-:-:1      STS [writeIs + 4x<3*64 + 3*16>], I33;\n",
+        j4c38 => "--:-:-:-:1 \@!P3 LDS.U.$vec_size I3, [addr_zero];\n",
+        j4c60 => "20:-:3:-:1  \@P3 LDG.E.CI.$vec_size I3, [track3I];\n",
+        # j5/j6: the same store-then-refill sequence for E0/E1, with the predicates reloaded from predE first.
+        j5c7  => "--:-:-:-:1      R2P PR, predE, 0x0f;\n",
+
+        j5c30 => "08:-:-:-:1      STS [writeEs + 4x<0*32 + 0*16>], E00;\n",
+        j5c32 => "--:-:-:-:1      STS [writeEs + 4x<1*32 + 0*16>], E01;\n",
+        j5c34 => "--:-:-:-:1      STS [writeEs + 4x<2*32 + 0*16>], E02;\n",
+        j5c36 => "--:4:-:-:1      STS [writeEs + 4x<3*32 + 0*16>], E03;\n",
+        j5c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size E0, [addr_zero];\n",
+        j5c60 => "08:-:4:-:1  \@P0 LDG.E.CI.$vec_size E0, [track0E];\n",
+
+        j6c30 => "10:-:-:-:1      STS [writeEs + 4x<0*32 + 1*16>], E10;\n",
+        j6c32 => "--:-:-:-:1      STS [writeEs + 4x<1*32 + 1*16>], E11;\n",
+        j6c34 => "--:-:-:-:1      STS [writeEs + 4x<2*32 + 1*16>], E12;\n",
+        j6c36 => "--:5:-:-:1      STS [writeEs + 4x<3*32 + 1*16>], E13;\n",
+        j6c38 => "--:-:-:-:1 \@!P1 LDS.U.$vec_size E1, [addr_zero];\n",
+        j6c60 => "10:6:5:-:1  \@P1 LDG.E.CI.$vec_size E1, [track1E];\n",
+        # End of j6: barrier, then move the read pointers by -swapBuf and the write pointers by +swapBuf, and negate swapBuf.
+        j6c63 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readIs,  readIs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD readEs,  readEs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD writeIs, writeIs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD writeEs, writeEs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;\n",
+        # Loop condition: P4 starts as (P5 or P6), n advances by param_loopN, and P4 is ANDed with n < param_N.
+        j7c15 => "--:-:-:-:1      PSETP.OR.AND P4, PT, P5, P6, PT;\n",
+        j7c17 => "--:-:-:-:1      IADD n, n, param_loopN;\n",
+        j7c27 => "--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, P4;\n",
+        # Only when $largeN is set: advance all six 64-bit track pointers by param_loopNp.
+        $largeN ? (
+            j7c30 => "20:-:-:-:1      IADD   track0I0.CC, track0I0, param_loopNp;\n",
+            j7c35 => "--:-:-:-:1      IADD.X track0I1,    track0I1, RZ;\n" .
+                     "--:-:-:-:1      IADD   track1I0.CC, track1I0, param_loopNp;\n",
+            j7c40 => "--:-:-:-:1      IADD.X track1I1,    track1I1, RZ;\n" .
+                     "--:-:-:-:1      IADD   track2I0.CC, track2I0, param_loopNp;\n",
+            j7c45 => "--:-:-:-:1      IADD.X track2I1,    track2I1, RZ;\n" .
+                     "--:-:-:-:1      IADD   track3I0.CC, track3I0, param_loopNp;\n",
+            j7c50 => "--:-:-:-:1      IADD.X track3I1,    track3I1, RZ;\n" .
+                     "--:-:-:-:1      IADD   track0E0.CC, track0E0, param_loopNp;\n",
+            j7c55 => "--:-:-:-:1      IADD.X track0E1,    track0E1, RZ;\n" .
+                     "--:-:-:-:1      IADD   track1E0.CC, track1E0, param_loopNp;\n",
+            j7c60 => "--:-:-:-:1      IADD.X track1E1,    track1E1, RZ;\n",
+        ) : (),
+
+        j7c63 => "--:-:-:Y:5  \@P4 BRA.U MAIN_LOOP;\n",
+    );
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) & 7;
+        my $shift    = ((($j + 1) & 7) >> 2) << 2;
+        # Loads for step (j+1)&7 go into the other register set (j<nOdd>) while step j computes from j<odd>.
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy0, [readIs + 4x<%d*64 + 00 + %d>];\n", $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1      LDS.U.128 j%dEx0, [readEs + 4x<%d*32 + 00 + %d>];\n", $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy4, [readIs + 4x<%d*64 + 32 + %d>];\n", $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1      LDS.U.128 j%dEx4, [readEs + 4x<%d*32 + 16 + %d>];\n", $nOdd, $rsOffset, $shift;
+        # @cOrder maps FFMA slot c to its (x, y) accumulator; the inserted text, if any, follows that FFMA.
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Stall is 0 when the first inserted line holds one of these opcodes, otherwise 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1;
+            # Yield only on FFMA 25, and only when its stall is 1.
+            my $yield  = $c == 25 && $stall ? 'Y' : '-';
+            # The first FFMA of every step carries wait mask 01.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+// Advance x/q offsets+preds (reached when the @P4 branch back to MAIN_LOOP is not taken)
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD x0, x0, param_loopX;
+--:-:-:-:1      IADD x1, x1, param_loopX;
+--:-:-:-:1      IADD x2, x2, param_loopX;
+--:-:-:-:1      IADD x3, x3, param_loopX;
+20:-:-:-:1      IADD   track0I0.CC, track0I0, param_loopXp;
+--:-:-:-:1      IADD.X track0I1,    track0I1, RZ;
+--:-:-:-:1      IADD   track1I0.CC, track1I0, param_loopXp;
+--:-:-:-:1      IADD.X track1I1,    track1I1, RZ;
+--:-:-:-:1      IADD   track2I0.CC, track2I0, param_loopXp;
+--:-:-:-:1      IADD.X track2I1,    track2I1, RZ;
+--:-:-:-:1      IADD   track3I0.CC, track3I0, param_loopXp;
+--:-:-:-:1      IADD.X track3I1,    track3I1, RZ;
+// Shift predI down a nibble, reload P0-P3 from it when P6 is set, and recompute the x bounds on top.
+--:-:-:-:1      SHR.U32 predI, predI, 4;
+--:-:-:-:1  @P6 R2P PR, predI, 0x0f;
+--:-:-:-:1      SHL     predI, predI, 4;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, x0, param_W, P0;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_W, P1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_W, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_W, P3;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x0, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      P2R predI, PR, predI, 0x0f;
+// Step q; once q >= param_Q only bits 2-3 of predE are kept.
+--:-:-:-:1      IADD q, q, param_loopQ;
+--:-:-:-:1      ISETP.LT.AND P4, PT, q, param_Q, PT;
+--:-:-:-:1 @!P4 LOP.AND predE, predE, 0xc;
+
+--:-:-:-:1      IADD   track0E0.CC, track0E0, param_loopQp;
+--:-:-:-:1      IADD.X track0E1,    track0E1, RZ;
+--:-:-:-:1      IADD   track1E0.CC, track1E0, param_loopQp;
+// Step idx_Q (P5 = idx_Q < param_gridQ, gated by P6) and reset n to (tid7 & param_superN) << 2.
+--:-:-:-:1      IADD idx_Q, idx_Q, param_strideQ;
+--:-:-:-:1      ISETP.LT.AND P5, PT, idx_Q, param_gridQ, P6;
+
+--:-:-:-:1      LOP.AND n, tid7, param_superN;
+--:-:-:-:1      SHL n, n, 2;
+
+</SCHEDULE_BLOCK>
+--:-:-:-:0      IADD.X track1E1,    track1E1, RZ;
+--:-:-:Y:5  @P5 BRA.U MAIN_LOOP;
+
+// Advance y/p offsets+preds
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV  idx_Q, start_Q;
+--:-:-:-:1      IADD idx_P, idx_P, param_strideP;
+
+--:-:-:-:1      PSETP.AND.AND P5, PT, PT, PT, PT;
+--:-:-:Y:d      ISETP.LT.AND  P6, PT, idx_P, param_gridP, PT;
+</SCHEDULE_BLOCK>
+--:-:-:Y:5 @!P6 BRA.U FINISH_LOOP;
+--:-:-:-:5      CAL CALC_OFFSETS;
+--:-:-:Y:5  @P6 BRA.U MAIN_LOOP;
+
+// Set n to loop the remaining times: clear both load masks, n = N - init*loopN (16-bit VMAD), zero init, and re-enter MAIN_LOOP only if (init & 3) != 0
+FINISH_LOOP:
+--:-:-:-:1      LOP.AND.NZ P5, RZ, init, 3;
+--:-:-:-:1      MOV predI, RZ;
+--:-:-:-:1      MOV predE, RZ;
+--:-:-:-:1      MOV loopN, param_loopN;
+--:-:-:Y:8      MOV N, param_N;
+--:-:-:-:1      VMAD.U16.U16 n, -init, loopN, N;
+--:-:-:-:0      MOV init, RZ;
+01:-:-:Y:5  @P5 BRA.U MAIN_LOOP;
+
+// Epilogue: scale the 64 accumulators by alpha and write them out through shared memory in two halves (y0-y3, then y4-y7), two STORE_F calls per half.
+--:-:1:-:2      S2R Tid, SR_TID.X;
+<SCHEDULE_BLOCK>
+01:-:-:-:1      SHR.U32 tid_32, Tid, 5;
+--:-:-:-:1      LOP.AND tid_31, Tid, 31;
+
+// readFs = (tid_32 << 7 + tid_31) << 2
+--:-:-:-:1      ISCADD readFs, tid_32, tid_31, 7;
+--:-:-:-:1      SHL    readFs, readFs, 2;
+
+// kk = idx_K*32 + tid31;
+--:-:-:-:1      ISCADD kk, idx_K, tid_31, 5;
+// kk < K
+--:-:-:-:1      ISETP.LT.AND P4, PT, kk, param_K, PT;
+
+// crst = idx_C*64 + tid_32*4
+--:-:-:-:1      SHL     tid_32, tid_32, 2;
+--:-:-:-:1      ISCADD  crst00, idx_C, tid_32, 6;
+--:-:-:-:1      IADD    crst04, crst00, 16;
+--:-:-:-:1      IADD    crst08, crst00, 32;
+--:-:-:-:1      IADD    crst12, crst00, 48;
+
+--:-:-:-:1      MOV K, param_K;
+--:-:-:-:1      SHL K1,  K, 2;
+--:-:-:-:1      SHL K16, K, 6;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+// trackF += crst*K + k;
+--:-:-:-:1      XMAD.LO2 tf, crst00, K, kk;
+[+
+    our $determ;
+    return $determ ? q{
+// idx_MPQ = idx_M * grid_PQ + idx_P * grid_Q + idx_Q
+// trackF += idx_MPQ * CRSTK
+--:-:-:-:1      XMAD      idx_MPQ, start_P, param_strideQ, start_Q;
+--:-:-:-:1      XMAD.LO2C idx_MPQ, idx_M,  param_stridePQ, idx_MPQ;
+--:-:-:-:1      XMAD.LO   tf, idx_MPQ, param_CTRSK, tf, xmad_determ;
+    } : '';
++]
+--:-:-:-:1      LEA      track00F0.CC, tf, param_F[0],     2;
+--:-:-:-:1      LEA.HI.X track00F1,    tf, param_F[1], RZ, 2;
+--:-:-:-:1      IADD     track04F0.CC, track00F0, K16;
+--:-:-:-:1      IADD.X   track04F1,    track00F1, RZ;
+--:-:-:-:1      IADD     track08F0.CC, track04F0, K16;
+--:-:-:-:1      IADD.X   track08F1,    track04F1, RZ;
+--:-:-:-:1      IADD     track12F0.CC, track08F0, K16;
+--:-:-:-:1      IADD.X   track12F1,    track08F1, RZ;
+// First half: accumulators y0..y3 times alpha, then stored at writeFs.
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:1      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:1      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:1      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:1      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:1      STS.128 [writeFs+4x<0*128 + 00>], shuffle_x0y0;
+--:-:-:-:1      STS.128 [writeFs+4x<0*128 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeFs+4x<1*128 + 00>], shuffle_x0y1;
+--:-:-:-:1      STS.128 [writeFs+4x<1*128 + 16>], shuffle_x4y1;
+--:-:-:-:1      STS.128 [writeFs+4x<2*128 + 00>], shuffle_x0y2;
+--:-:-:-:1      STS.128 [writeFs+4x<2*128 + 16>], shuffle_x4y2;
+--:-:-:-:1      STS.128 [writeFs+4x<3*128 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeFs+4x<3*128 + 16>], shuffle_x4y3;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+// Two STORE_F passes per half; readFs is moved by 4x<16*128 + 4*16> between them.
+--:-:-:-:5      CAL STORE_F;
+--:-:-:-:0      IADD readFs, readFs, 4x<16*128 + 4*16>;
+--:-:-:-:5      CAL STORE_F;
+// Second half (y4..y7): the first four FMULs are issued ahead of the barrier.
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha;
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:0      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+<SCHEDULE_BLOCK>
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:1      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:1      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:1      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:1      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:1      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:1      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:1      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:1      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:1      STS.128 [writeFs+4x<0*128 + 00>], shuffle_x0y4;
+--:-:-:-:1      STS.128 [writeFs+4x<0*128 + 16>], shuffle_x4y4;
+--:-:-:-:1      STS.128 [writeFs+4x<1*128 + 00>], shuffle_x0y5;
+--:-:-:-:1      STS.128 [writeFs+4x<1*128 + 16>], shuffle_x4y5;
+--:-:-:-:1      STS.128 [writeFs+4x<2*128 + 00>], shuffle_x0y6;
+--:-:-:-:1      STS.128 [writeFs+4x<2*128 + 16>], shuffle_x4y6;
+--:-:-:-:1      STS.128 [writeFs+4x<3*128 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeFs+4x<3*128 + 16>], shuffle_x4y7;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+// Move readFs back to its first position, then run the same two STORE_F passes.
+--:-:-:-:0      IADD readFs, readFs, -4x<16*128 + 4*16>;
+--:-:-:-:5      CAL STORE_F;
+--:-:-:-:0      IADD readFs, readFs,  4x<16*128 + 4*16>;
+--:-:-:-:5      CAL STORE_F;
+
+--:-:-:-:5      EXIT;
+
+STORE_F:
+// Subroutine: for each of the four tracks, load four values at readFs, sum them, store the sum under P0-P3, then step crstNN by 1 and the track pointer by K1.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CTRS, P4; // crst00 < CRST && k < K
+--:-:-:-:1      IADD crst00, crst00, 1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CTRS, P4; // crst04 < CRST && k < K
+--:-:-:-:1      IADD crst04, crst04, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CTRS, P4; // crst08 < CRST && k < K
+--:-:-:-:1      IADD crst08, crst08, 1;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CTRS, P4; // crst12 < CRST && k < K
+--:-:-:-:1      IADD crst12, crst12, 1;
+<ORDERED>
+--:-:-:-:1      LDS f00_0, [readFs + 4x< 0*128 + 0*32 + 0*16>];
+--:-:-:-:1      LDS f00_1, [readFs + 4x< 0*128 + 1*32 + 0*16>];
+--:-:-:-:1      LDS f00_2, [readFs + 4x< 0*128 + 2*32 + 0*16>];
+--:-:1:Y:1      LDS f00_3, [readFs + 4x< 0*128 + 3*32 + 0*16>];
+--:-:-:-:1      LDS f04_0, [readFs + 4x< 4*128 + 0*32 + 1*16>];
+--:-:-:-:1      LDS f04_1, [readFs + 4x< 4*128 + 1*32 + 1*16>];
+--:-:-:-:1      LDS f04_2, [readFs + 4x< 4*128 + 2*32 + 1*16>];
+--:-:2:Y:1      LDS f04_3, [readFs + 4x< 4*128 + 3*32 + 1*16>];
+--:-:-:-:1      LDS f08_0, [readFs + 4x< 8*128 + 0*32 + 2*16>];
+--:-:-:-:1      LDS f08_1, [readFs + 4x< 8*128 + 1*32 + 2*16>];
+--:-:-:-:1      LDS f08_2, [readFs + 4x< 8*128 + 2*32 + 2*16>];
+--:-:3:Y:1      LDS f08_3, [readFs + 4x< 8*128 + 3*32 + 2*16>];
+--:-:-:-:1      LDS f12_0, [readFs + 4x<12*128 + 0*32 + 3*16>];
+--:-:-:-:1      LDS f12_1, [readFs + 4x<12*128 + 1*32 + 3*16>];
+--:-:-:-:1      LDS f12_2, [readFs + 4x<12*128 + 2*32 + 3*16>];
+--:-:4:Y:1      LDS f12_3, [readFs + 4x<12*128 + 3*32 + 3*16>];
+</ORDERED>
+</SCHEDULE_BLOCK>
+// Pairwise reduction of the four values loaded for each track; the total ends up in fNN_0.
+01:-:-:-:1      FADD f00_0, f00_0, f00_1;
+--:-:-:-:1      FADD f00_2, f00_2, f00_3;
+02:-:-:-:1      FADD f04_0, f04_0, f04_1;
+--:-:-:-:1      FADD f04_2, f04_2, f04_3;
+04:-:-:-:1      FADD f08_0, f08_0, f08_1;
+--:-:-:-:1      FADD f08_2, f08_2, f08_3;
+08:-:-:-:1      FADD f12_0, f12_0, f12_1;
+--:-:-:-:1      FADD f12_2, f12_2, f12_3;
+
+--:-:-:-:1      FADD f00_0, f00_0, f00_2;
+--:-:-:-:2      FADD f04_0, f04_0, f04_2;
+--:-:-:-:2      FADD f08_0, f08_0, f08_2;
+--:-:-:-:0      FADD f12_0, f12_0, f12_2;
+// Predicated output: P0-P3 were set at the top of the routine (crstNN < param_CTRS, ANDed with P4).
+01:1:-:-:1  @P0 [+ output_op() +] [track00F], f00_0;
+02:2:-:-:1  @P1 [+ output_op() +] [track04F], f04_0;
+04:3:-:-:1  @P2 [+ output_op() +] [track08F], f08_0;
+08:4:-:-:1  @P3 [+ output_op() +] [track12F], f12_0;
+// Advance each 64-bit track pointer by K1.
+01:-:-:-:6      IADD   track00F0.CC, track00F0, K1;
+--:-:-:-:1      IADD.X track00F1,    track00F1, RZ;
+02:-:-:-:6      IADD   track04F0.CC, track04F0, K1;
+--:-:-:-:1      IADD.X track04F1,    track04F1, RZ;
+04:-:-:-:6      IADD   track08F0.CC, track08F0, K1;
+--:-:-:-:1      IADD.X track08F1,    track08F1, RZ;
+08:-:-:-:6      IADD   track12F0.CC, track12F0, K1;
+--:-:-:-:0      IADD.X track12F1,    track12F1, RZ;
+
+--:-:-:-:5      RET;
\ No newline at end of file
diff --git a/Kernel/Convolution/Maxwell/xconv_direct_xprop_64x32.sass b/Kernel/Convolution/Maxwell/xconv_direct_xprop_64x32.sass
new file mode 100644
index 0000000..4720ab8
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/xconv_direct_xprop_64x32.sass
@@ -0,0 +1,2477 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our ($type, $SN, $N2, $N1);  # Package variables read below; none of them is assigned in this block.
+    our $LN = !($SN || $N2 || $N1);  # True only when none of $SN, $N2, $N1 is set.
+    our $dtype        = $type eq 'h' ?         'U16' : '32';  # In each pair below, type 'h' picks the left value, any other type the right.
+    our $convert_in   = $type eq 'h' ? 'F2F.F32.F16' : '';
+    our $convert_out  = $type eq 'h' ? 'F2F.F16.F32' : '';
+    our $vsize        = $type eq 'h' ?          '64' : '128';
+    our $dshift       = $type eq 'h' ?           '1' : '2';  # Equals log2 of $dsize on the next line.
+    our $dsize        = $type eq 'h' ?           '2' : '4';
+    our $slice_scale  = $N1 ? 4 : $N2 ? 3 : 2;  # 4 with $N1, else 3 with $N2, else 2.
+    our $slice_offset = 1 << $slice_scale;  # 16, 8 or 4 respectively.
+    our $slice_load   = 8 << $slice_scale;  # 128, 64 or 32 respectively.
+    sub dtype       { return $dtype;       }  # Accessor subs returning the values chosen above.
+    sub dshift      { return $dshift; }
+    sub vsize       { return $vsize; }
+    our $vsizeI;  # Assigned by the if/else below from $type and $N1/$N2.
+    if ($type eq 'h')
+        { $vsizeI = $N1 ? 'U16' : $N2 ? '32' : '64';  }  # Choices for type 'h'.
+    else
+        { $vsizeI = $N1 ? '32'  : $N2 ? '64' : '128'; }  # Choices for every other type.
+-]
+
+<CONSTANT_MAPPING>
+
+    addr_zero  : 4x<(32 + 64)*32*2>
+    addr_szLut : 4x<(32 + 64)*32*2 + 4>
+    addr_lut4  : 4x<(32 + 64)*32*2 + 4>
+    addr_lut   : 4x<(32 + 64)*32*2 + 6>
+
+    szShareF   : (64*32)
+    szShareI   : (32*32)
+
+    param_Sum[0]       : c[0x0][0x140]
+    param_Sum[1]       : c[0x0][0x144]
+    param_X[0]         : c[0x0][0x148]
+    param_X[1]         : c[0x0][0x14c]
+    param_O[0]         : c[0x0][0x150]
+    param_O[1]         : c[0x0][0x154]
+    param_I[0]         : c[0x0][0x158]
+    param_I[1]         : c[0x0][0x15c]
+    param_F[0]         : c[0x0][0x160]
+    param_F[1]         : c[0x0][0x164]
+    param_alpha        : c[0x0][0x168]
+    param_beta         : c[0x0][0x16c]
+    param_flags        : c[0x0][0x170]
+    param_C            : c[0x0][0x174]
+    param_D            : c[0x0][0x178]
+    param_H            : c[0x0][0x17c]
+    param_W            : c[0x0][0x180]
+    param_N            : c[0x0][0x184]
+    param_K            : c[0x0][0x188]
+    param_M            : c[0x0][0x18c]
+    param_P            : c[0x0][0x190]
+    param_Q            : c[0x0][0x194]
+    param_str_d        : c[0x0][0x198]
+    param_str_h        : c[0x0][0x19c]
+    param_str_w        : c[0x0][0x1a0]
+    param_pad_d        : c[0x0][0x1a4]
+    param_pad_h        : c[0x0][0x1a8]
+    param_pad_w        : c[0x0][0x1ac]
+    param_dil_d        : c[0x0][0x1b0] 
+    param_dil_h        : c[0x0][0x1b4] 
+    param_dil_w        : c[0x0][0x1b8] 
+    param_DHWN         : c[0x0][0x1bc]
+    param_HWN          : c[0x0][0x1c0]
+    param_WN           : c[0x0][0x1c4]
+    param_MPQN         : c[0x0][0x1c8]
+    param_PQN          : c[0x0][0x1cc]
+    param_QN           : c[0x0][0x1d0]
+    param_PQnk         : c[0x0][0x1d4]
+    param_Qnk          : c[0x0][0x1d8]
+    param_nk           : c[0x0][0x1dc]
+    param_n            : c[0x0][0x1e0]
+    param_k            : c[0x0][0x1e4]
+    param_magic_PQnk   : c[0x0][0x1e8]
+    param_shift_PQnk   : c[0x0][0x1ec]
+    param_magic_Qnk    : c[0x0][0x1f0]
+    param_shift_Qnk    : c[0x0][0x1f4]
+    param_magic_nk     : c[0x0][0x1f8]
+    param_shift_nk     : c[0x0][0x1fc]
+    param_magic_k      : c[0x0][0x200]
+    param_shift_k      : c[0x0][0x204]
+    param_Km32         : c[0x0][0x208]
+    param_K32p         : c[0x0][0x20c]
+    param_TRSK         : c[0x0][0x210]
+    param_TRS          : c[0x0][0x214]
+    param_RS           : c[0x0][0x218]
+    param_S            : c[0x0][0x21c]
+    param_magic_RS     : c[0x0][0x220]
+    param_shift_RS     : c[0x0][0x224]
+    param_magic_S      : c[0x0][0x228]
+    param_shift_S      : c[0x0][0x22c]
+    param_gridP2       : c[0x0][0x230]
+    param_gridQ        : c[0x0][0x234]
+    param_gridN        : c[0x0][0x238]
+    param_gridQN       : c[0x0][0x23c]
+    param_gridPQN      : c[0x0][0x240]
+    param_gridMPQN     : c[0x0][0x244]
+    param_superM       : c[0x0][0x248]
+    param_superP       : c[0x0][0x24c]
+    param_superQ       : c[0x0][0x250]
+    param_superN       : c[0x0][0x254]
+    param_shiftM       : c[0x0][0x258]
+    param_shiftP       : c[0x0][0x25c]
+    param_shiftQ       : c[0x0][0x260]
+    param_shiftN       : c[0x0][0x264]
+    param_SuperM       : c[0x0][0x268]
+    param_SuperP       : c[0x0][0x26c]
+    param_SuperQ       : c[0x0][0x270]
+    param_SuperN       : c[0x0][0x274]
+    param_magic_str_d  : c[0x0][0x278]
+    param_shift_str_d  : c[0x0][0x27c]
+    param_magic_str_h  : c[0x0][0x280]
+    param_shift_str_h  : c[0x0][0x284]
+    param_magic_str_w  : c[0x0][0x288]
+    param_shift_str_w  : c[0x0][0x28c]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+       0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+      64-79 : j0Fy<0-7>, j0Ix<0-7>
+      80-95 : j1Fy<0-7>, j1Ix<0-7>
+
+     96-119 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3>, I0<0-3>, I1<0-3>
+    120-131 : track0F<0-1>,  track1F<0-1>, track2F<0-1>,  track3F<0-1>, track0I<0-1>, track1I<0-1>
+
+      64-83 ~ tidY, m, p, q, negOne, trs, lutStore2, lut_size, warp_count, warp_inc, neg_RS, neg_S, dep_thd_mask, qs, pr, mt, neg_str_w, neg_str_h, neg_str_d
+
+     84-131 ~ idx_MPQnk, idx_PQnk, idx_Qnk, idx_nk, idx_n, idx_k, magic_PQnk, magic_Qnk, neg_PQnk, neg_Qnk, neg_nk, neg_k, div1, div2, div3, idx_P2, idx_Q2, super_m, super_p, super_q, super_n, tid1, tid2, tid3, tid7, tid8, tid31, tid32, readIs2, tidX, k<0|1|2|3>, sb, warp_mask, mask_shr, shiftSB, maskSB, q<1|2|3>
+
+     84-131 ~ rs, t, r, s, z, y, x, x<1|2|3>, z_prime, y_prime, x_prime, x_prime<1|2|3>, z_mod, y_mod, x_mod, x_mod<1|2|3>, lutStore, ballot, warp_slices, dep_thd_bits, dep_thd_cnt, tidY1
+
+[+
+    our ($SN, $N2, $N1);
+    return $N1 ? q{
+        132-135 : slice0I<0-3>
+        168-171 : slice1I<0-3>
+        172-183 : track0I<2-3>, track0I<4-5>, track0I<6-7>, track1I<2-3>, track1I<4-5>, track1I<6-7>
+        184-185 ~ predsI
+
+    } : $N2 ? q{
+        132-135 : slice0I<0-1>, slice1I<0-1>
+        168-171 : track0I<2-3>, track1I<2-3>
+
+    } : $SN ? q{
+        132-135 ~ slice0I, slice1I
+
+    } : q{
+        132-133 : sliceI, sliceF
+        132-133 : sliceIF<0-1>
+        132-135 : sliceI0, sliceF0, sliceI1, sliceF1
+        132-135 : slice0IF<0-1>, slice1IF<0-1>
+    };
++]
+
+    136-151 ~ posCTRS, endCTRS, endCTRS32, lutSize, lutSizeRcp, lutSizeM1, posCTRSf, channel, lutOffset0, lutOffset1, offsetIc0, offsetIc1, offsetFc0, offsetFc1, partial
+    152-167 ~ tid, idx_K, idx_M, idx_P, idx_Q, idx_N, k, n, writeFs, writeIs, readFs, readIs, swapBuf, writeOs, preds, sb_offset
+
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+
+      64-95 ~ o00_<0-3>, o04_<0-3>, o08_<0-3>, o12_<0-3>, b<00|04|08|12>, x<00|04|08|12>, bsum<00|04|08|12>
+     96-131 ~ tid_31, tid_32, alpha, readOs, MPQN16, MPQN4, k<00|04|08|12>, offset, one, M, P, Q, N, super_M, super_P, super_Q, super_N, bsum_offset
+        0-7 : Out00_<0-1>, Out04_<0-1>, Out08_<0-1>, Out12_<0-1>
+       8-15 : Sum00_<0-1>, Sum04_<0-1>, Sum08_<0-1>, Sum12_<0-1>
+      16-31 ~ out<00|04|08|12>, sum<00|04|08|12>
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,       SR_TID.X;
+--:-:2:-:1      S2R idx_MPQnk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,     SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,     SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+// tidX = (tid & 7) << 2
+// tidY = tid >> 3 << 1
+01:-:-:-:1      LOP.AND tid7,  tid,  7;
+--:-:-:-:1      SHL     tidX,  tid7, 2;
+--:-:-:-:1      SHR.U32 tid3,  tid,  3;
+--:-:-:-:1      SHL     tidY,  tid3, 1;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// idx_M = idx_MPQnk / blk_PQnk
+--:-:-:-:1      MOV  magic_PQnk, param_magic_PQnk;
+--:-:-:-:1      ISETP.NE.AND P0, PT,   magic_PQnk, 1, PT;
+02:-:-:-:1  @P0 XMAD     div1, idx_MPQnk,    magic_PQnk,    RZ;
+--:-:-:-:1  @P0 XMAD     div2, idx_MPQnk,    magic_PQnk.H1, RZ;
+--:-:-:-:1  @P0 XMAD     div3, idx_MPQnk.H1, magic_PQnk.H1, RZ;
+--:-:-:-:1  @P0 XMAD.CHI div1, idx_MPQnk.H1, magic_PQnk,    div1;
+--:-:-:-:1  @P0 IADD3.RS idx_M, div1, div2, div3;
+--:-:-:-:1  @P0 SHR.U32  idx_M, idx_M,     param_shift_PQnk;
+--:-:-:-:1 @!P0 SHR.U32  idx_M, idx_MPQnk, param_shift_PQnk;
+
+// idx_PQnk = idx_MPQnk % blk_PQnk
+--:-:-:-:1      IADD neg_PQnk, RZ, -param_PQnk;
+--:-:-:-:1      XMAD.LO2 idx_PQnk, neg_PQnk, idx_M, idx_MPQnk;
+
+// idx_P2 = idx_PQnk / blk_Qnk
+--:-:-:-:1      MOV  magic_Qnk, param_magic_Qnk;
+--:-:-:-:1      ISETP.NE.AND P1, PT,  magic_Qnk, 1, PT;
+--:-:-:-:1  @P1 XMAD     div1, idx_PQnk,    magic_Qnk,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, idx_PQnk,    magic_Qnk.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, idx_PQnk.H1, magic_Qnk.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, idx_PQnk.H1, magic_Qnk,    div1;
+--:-:-:-:1  @P1 IADD3.RS idx_P2, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  idx_P2, idx_P2,   param_shift_Qnk;
+--:-:-:-:1 @!P1 SHR.U32  idx_P2, idx_PQnk, param_shift_Qnk;
+
+// idx_Qnk = idx_PQnk % blk_Qnk
+--:-:-:-:1      IADD neg_Qnk, RZ, -param_Qnk;
+--:-:-:-:1      XMAD.LO2 idx_Qnk, neg_Qnk, idx_P2, idx_PQnk;
+
+// idx_Q2  = idx_Qnk / nk
+--:-:-:-:1      XMAD.LO2C idx_Q2, idx_Qnk, param_magic_nk, RZ;
+--:-:-:-:1      SHR.U32   idx_Q2, idx_Q2,   param_shift_nk;
+// idx_nk = idx_Qnk % nk
+--:-:-:-:1      IADD neg_nk, RZ, -param_nk;
+--:-:-:-:1      XMAD.S16.U16  idx_nk, neg_nk, idx_Q2, idx_Qnk;
+
+// idx_n = idx_nk / k
+--:-:-:-:1      XMAD    idx_n,  idx_nk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32 idx_n,  idx_n,  param_shift_k;
+// idx_k = idx_nk % k
+--:-:-:-:1      IADD neg_k, RZ, -param_k;
+--:-:-:-:1      XMAD.S16.U16 idx_k, neg_k, idx_n, idx_nk;
+
+// idx_N = idx_N * blk_n + idx_n
+// idx_K = idx_K * blk_k + idx_k
+08:-:-:-:1      XMAD idx_N, idx_N, param_n, idx_n;
+04:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+--:-:-:-:1      ISCADD k, idx_K, tidX, 6;
+
+
+// Implement a square wave block id remapping (for all but last row (if odd number of rows))
+// idx_P = idx_P2 * 2
+// idx_Q = idx_Q2
+// if idx_P2 != gridP2:
+//     idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1)
+//     idx_Q  = idx_Q2 >> 1
+--:-:-:-:1      ISETP.NE.AND P1, PT, idx_P2, param_gridP2, PT;
+--:-:-:-:1      SHL idx_P, idx_P2, 1;
+--:-:-:-:1  @P1 LOP.AND q1, idx_Q2, 1;
+--:-:-:-:1  @P1 BFE.U32 q2, idx_Q2, 0x101; // 1 bit at position 1
+--:-:-:-:1  @P1 LOP.XOR q1, q1, q2;
+--:-:-:-:1  @P1 IADD idx_P, idx_P, q1;
+--:-:-:-:1  @P1 SHR.U32 idx_Q, idx_Q2, 1;
+--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2;
+
+// Scan backwards on odd rows
+// if idx_P2 & 1:
+//     idx_Q = gridQ - idx_Q - 1
+--:-:-:-:1      LOP.AND.NZ P0, RZ, idx_P2, 1;
+--:-:-:-:1      MOV negOne, -1;
+--:-:-:-:1  @P0 IADD3 idx_Q, -idx_Q, param_gridQ, negOne;
+
+// writeFs = (tidY*64 + tidX) * 4
+--:-:-:-:1      ISCADD writeFs, tidY, tidX, 6;
+--:-:-:-:1      SHL    writeFs, writeFs, 2;
+
+// writeIs = (tidY*32 + tidX) * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD writeIs, tidY, tidX, 5;
+--:-:-:-:1      ISCADD writeIs, writeIs, 4x<szShareF>, 2;
+
+
+// readIs  = ((tid >> 1) & 3) << 4
+--:-:-:-:1      BFE.U32 readIs, tid, 0x201; // 2 bits at position 1
+
+// readFs = (((tid & 24) >> 2) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   24;
+--:-:-:-:1      SHR.U32 readFs, readFs, 2;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+
+// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5)
+// tid32 = tid & -32
+--:-:-:-:1      LOP.AND tid32, tid, -32;
+
+// readIs2 = readIs + (tid32 >> 2) + (readFs << 2)
+--:-:-:-:1      SHR.U32 readIs2, tid32, 2;
+--:-:-:-:1      IADD    readIs2, readIs2, readIs;
+--:-:-:-:1      ISCADD  readIs2, readFs, readIs2, 2;
+
+--:-:-:-:1      SHL readFs,  readFs,  4;
+--:-:-:-:1      SHL readIs,  readIs,  4;
+--:-:-:-:1      SHL readIs2, readIs2, 4;
+
+// writeOs = readFs*32*4 + readIs2
+--:-:-:-:1      ISCADD writeOs, readFs, readIs2, 7;
+
+// Each block of 32 threads works on 8 lines,
+// readFs += tid32/4 * 64 * 4
+// readIs += tid32/4 * 32 * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD readFs, tid32,  readFs, 6;
+--:-:-:-:1      ISCADD readIs, tid32,  readIs, 5;
+--:-:-:-:1      IADD   readIs, readIs, 4x<szShareF>;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+[+
+    our $K1;
+    return $K1 ? q{
+--:-:-:-:1      IADD k0, k, 32;
+--:-:-:-:1      IADD k1, k, 33;
+--:-:-:-:1      IADD k2, k, 34;
+--:-:-:-:1      IADD k3, k, 35;
+--:-:-:-:1      ISETP.LT.AND P0, PT, k0, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, param_K, PT;
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+--:-:-:-:1      SHL preds, preds, 4;
+
+--:-:-:-:1      IADD k1, k, 1;
+--:-:-:-:1      IADD k2, k, 2;
+--:-:-:-:1      IADD k3, k, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, k,  param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, param_K, PT;
+--:-:-:-:1      P2R preds, PR, preds, 0x0f;
+    } : '';
++]
+
+[+
+    our ($SN, $N2, $N1);
+    return $N1 ? q{
+--:-:-:-:1      SHL m, idx_M, param_shiftM;
+--:-:-:-:1      SHL p, idx_P, param_shiftP;
+--:-:-:-:1      SHL q, idx_Q, param_shiftQ;
+
+--:-:-:-:1      BFE.U32 super_m, tid7, param_superM;
+--:-:-:-:1      BFE.U32 super_p, tid7, param_superP;
+--:-:-:-:1      BFE.U32 super_q, tid7, param_superQ;
+
+--:-:-:-:1      IADD m, m, super_m;
+--:-:-:-:1      IADD p, p, super_p;
+--:-:-:-:1      ISCADD  q, super_q, q, 2;
+--:-:-:-:1      IADD q1, q, 1;
+--:-:-:-:1      IADD q2, q, 2;
+--:-:-:-:1      IADD q3, q, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, m,  param_M, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, p,  param_P, P4;
+--:-:-:-:1      ISETP.LT.AND P0, PT, q,  param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, q1, param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, q2, param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, q3, param_Q, P4;
+--:-:-:-:1      P2R predsI, PR, RZ, 0x0f;
+
+// warp_count = 16
+// warp_inc = 16
+// trs = tid3
+--:-:-:-:1      MOV warp_count, 16;
+--:-:-:-:1      MOV warp_inc,   16;
+--:-:-:-:1      MOV trs, tid3;
+// compute shared memory super-block offset into the lookup table
+// sb_offset = tid7 * TRS * 4 * 4
+--:-:-:-:1      XMAD sb_offset, tid7, param_TRS, RZ;
+--:-:-:-:1      SHL  sb_offset, sb_offset, 4;
+
+    } : $N2 ? q{
+
+--:-:-:-:1      SHL m, idx_M, param_shiftM;
+--:-:-:-:1      SHL p, idx_P, param_shiftP;
+--:-:-:-:1      SHL q, idx_Q, param_shiftQ;
+
+--:-:-:-:1      BFE.U32 super_m, tid7, param_superM;
+--:-:-:-:1      BFE.U32 super_p, tid7, param_superP;
+--:-:-:-:1      BFE.U32 super_q, tid7, param_superQ;
+
+--:-:-:-:1      IADD m, m, super_m;
+--:-:-:-:1      IADD p, p, super_p;
+--:-:-:-:1      ISCADD  q, super_q, q, 1;
+--:-:-:-:1      IADD q1, q, 1;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, m,  param_M, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, p,  param_P, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, q,  param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, q1, param_Q, P4;
+
+// warp_count = 16
+// warp_inc = 16
+// trs = tid3
+--:-:-:-:1      MOV warp_count, 16;
+--:-:-:-:1      MOV warp_inc,   16;
+--:-:-:-:1      MOV trs, tid3;
+// compute shared memory super-block offset into the lookup table
+// sb_offset = tid7 * TRS * 4 * 2
+--:-:-:-:1      XMAD sb_offset, tid7, param_TRS, RZ;
+--:-:-:-:1      SHL  sb_offset, sb_offset, 3;
+
+    } : $SN ? q{
+--:-:-:-:1      SHL m, idx_M, param_shiftM;
+--:-:-:-:1      SHL p, idx_P, param_shiftP;
+--:-:-:-:1      SHL q, idx_Q, param_shiftQ;
+--:-:-:-:1      SHL n, idx_N, param_shiftN;
+
+--:-:-:-:1      BFE.U32 super_m, tid7, param_superM;
+--:-:-:-:1      BFE.U32 super_p, tid7, param_superP;
+--:-:-:-:1      BFE.U32 super_q, tid7, param_superQ;
+--:-:-:-:1      LOP.AND super_n, tid7, param_superN;
+
+--:-:-:-:1      IADD m, m, super_m;
+--:-:-:-:1      IADD p, p, super_p;
+--:-:-:-:1      IADD q, q, super_q;
+--:-:-:-:1      ISCADD  n, super_n, n, 2;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, m, param_M, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, p, param_P, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, q, param_Q, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, n, param_N, P0;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P2;
+
+// sb = tid7 >> (shiftN - 2): 0-1,0-3,0-7
+--:-:-:-:1      MOV  shiftSB, param_shiftN;
+--:-:-:-:1      IADD shiftSB, shiftSB, -2;
+--:-:-:-:1      SHR.U32 sb, tid7, shiftSB;
+// warp_count = 4 << shiftN:  64,32,16
+--:-:-:-:1      MOV warp_count, 4;
+--:-:-:-:1      SHL warp_count, warp_count, param_shiftN;
+--:-:-:-:1      MOV warp_inc,   warp_count;
+// maskSB = (1 << shiftSB) - 1: 3,1,0
+--:-:-:-:1      MOV  maskSB, 1;
+--:-:-:-:1      SHL  maskSB, maskSB, shiftSB;
+--:-:-:-:1      IADD maskSB, maskSB, -1;
+// trs = tid3 << shiftSB + (tid7 & mask)
+--:-:-:-:1      LOP.AND maskSB, tid7, maskSB;
+--:-:-:-:1      SHL  trs, tid3, shiftSB;
+--:-:-:-:1      IADD trs, trs,  maskSB;
+// compute shared memory super-block offset into the lookup table
+// sb_offset = sb * TRS * 4
+--:-:-:-:1      XMAD sb_offset, sb, param_TRS, RZ;
+--:-:-:-:1      SHL  sb_offset, sb_offset, 2;
+
+    } : q{
+--:-:-:-:1      SHL n, idx_N, 5;
+--:-:-:-:1      ISCADD n, tid7, n, 2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+
+--:-:-:-:1      MOV trs,        tid;
+--:-:-:-:1      MOV lutStore2,  RZ;
+--:-:-:-:1      MOV lut_size,   RZ;
+--:-:-:-:1      MOV warp_count, 32;
+--:-:-:-:1      MOV warp_inc,   32;
+
+--:-:-:-:1      IADD    mask_shr, -tid, 32;
+--:-:-:-:1      SHR.U32 dep_thd_mask, negOne, mask_shr;
+
+--:-:-:-:1      ISETP.GE.AND P6, PT, tid, 32, PT;
+
+    };
++]
+--:-:-:-:1      IADD neg_RS, RZ, -param_RS;
+--:-:-:-:1      IADD neg_S,  RZ, -param_S;
+
+[+
+    our ($LN, $prop);
+    my ($m, $p, $q) = $LN ? qw(idx_M idx_P idx_Q) : qw(m p q);
+    return $prop eq 'f' ? qq{
+// mt = m * str_d - pad_d
+// pr = p * str_h - pad_h
+// qs = q * str_w - pad_w
+--:-:-:-:1      XMAD mt, $m,  param_str_d, RZ;
+--:-:-:-:1      XMAD pr, $p,  param_str_h, RZ;
+--:-:-:-:1      XMAD qs, $q,  param_str_w, RZ;
+--:-:-:-:1      IADD mt, mt, -param_pad_d;
+--:-:-:-:1      IADD pr, pr, -param_pad_h;
+--:-:-:-:1      IADD qs, qs, -param_pad_w;
+    } : qq{
+// mt = m - pad_d
+// pr = p - pad_h
+// qs = q - pad_w
+--:-:-:-:1      IADD mt, $m, -param_pad_d;
+--:-:-:-:1      IADD pr, $p, -param_pad_h;
+--:-:-:-:1      IADD qs, $q, -param_pad_w;
+
+--:-:-:-:1      IADD neg_str_d, RZ, -param_str_d;
+--:-:-:-:1      IADD neg_str_h, RZ, -param_str_h;
+--:-:-:-:1      IADD neg_str_w, RZ, -param_str_w;
+    };
++]
+</SCHEDULE_BLOCK>
+
+[+
+    our $LN; return $LN ? q{
+--:-:-:-:5  @P6 BRA.U END_SETUP;
+    } : '';
++]
+
+LUT_LOOP:
+
+<SCHEDULE_BLOCK>
+// warp synchronous loop while warp_count < TRS
+--:-:-:-:1      ISETP.LT.AND P6, PT, warp_count, param_TRS, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, trs, param_TRS, PT;
+
+--:-:-:-:1      IADD warp_count, warp_count, warp_inc;
+// t =  trs / RS
+// rs = trs % RS
+--:-:-:-:1      XMAD.U16.U16 t, trs, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32      t,   t, param_shift_RS;
+--:-:-:-:1      XMAD.U16.S16 rs,  t, neg_RS, trs;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.U16.U16 r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32      r,  r, param_shift_S;
+--:-:-:-:1      XMAD.U16.S16 s,  r, neg_S, rs;
+
+[+
+    our ($SN, $N2, $N1, $prop);
+    if ($prop eq 'f')
+    {
+        return $N1 ? q{
+// x = qs + (s * dil_w)
+// y = pr + (r * dil_h)
+// z = mt + (t * dil_d)
+--:-:-:-:1      XMAD z,  t,  param_dil_d, mt;
+--:-:-:-:1      XMAD y,  r,  param_dil_h, pr;
+--:-:-:-:1      XMAD x,  s,  param_dil_w, qs;
+--:-:-:-:1      IADD x1, x,  param_str_w;
+--:-:-:-:1      IADD x2, x1, param_str_w;
+--:-:-:-:1      IADD x3, x2, param_str_w;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y, RZ, PT;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P5;
+--:-:-:-:1  @P4 R2P PR, predsI, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,    0x0f;
+--:-:-:-:1      ISETP.GE.AND  P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND  P3, PT, x3, RZ, P3;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, x,  param_W, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, x1, param_W, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x2, param_W, P2;
+--:-:-:-:1      ISETP.LT.AND  P3, PT, x3, param_W, P3;
+
+// sliceI = z*HWN + y*WN + x
+01:-:-:-:1      XMAD.LO2C slice0I0, z, param_HWN, x;
+--:-:-:-:1      XMAD.LO2C slice0I0, y, param_WN,  slice0I0;
+--:-:-:-:1      IADD slice0I1, slice0I0, param_str_w;
+--:-:-:-:1      IADD slice0I2, slice0I1, param_str_w;
+--:-:-:-:1      IADD slice0I3, slice0I2, param_str_w;
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I0, -1;
+--:-:-:-:1 @!P1 MOV slice0I1, -1;
+--:-:-:-:1 @!P2 MOV slice0I2, -1;
+--:-:-:-:1 @!P3 MOV slice0I3, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 4;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+--:1:-:-:1  @P5 STS.128 [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : $N2 ? q{
+
+--:-:-:-:1      XMAD z,  t, param_dil_d, mt;
+--:-:-:-:1      XMAD y,  r, param_dil_h, pr;
+--:-:-:-:1      XMAD x,  s, param_dil_w, qs;
+--:-:-:-:1      IADD x1, x, param_str_w;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y, RZ, P5;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P3;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, x1, RZ, P4;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, x,  param_W, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, x1, param_W, P1;
+
+// sliceI = z*HWN + y*WN + x*2
+01:-:-:-:1      XMAD.LO2C slice0I0, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C slice0I0, y, param_WN,  slice0I0;
+--:-:-:-:1      ISCADD slice0I1, x1, slice0I0, 1;
+--:-:-:-:1      ISCADD slice0I0, x,  slice0I0, 1;
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I0, -1;
+--:-:-:-:1 @!P1 MOV slice0I1, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 3;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+--:1:-:-:1  @P5 STS.64 [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : $SN ? q{
+
+--:-:-:-:1      XMAD z, t, param_dil_d, mt;
+--:-:-:-:1      XMAD y, r, param_dil_h, pr;
+--:-:-:-:1      XMAD x, s, param_dil_w, qs;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x, RZ, P5;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x, param_W, P2;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+// sliceI = z*HWN + y*WN + x*N
+01:-:-:-:1      XMAD.LO2C slice0I, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C slice0I, y, param_WN,  slice0I;
+--:-:-:-:1      XMAD      slice0I, x, param_N,   slice0I;
+
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 2;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+
+--:1:-:-:1  @P5 STS [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : q{
+
+--:-:-:-:1      XMAD z, t, param_dil_d, mt;
+--:-:-:-:1      XMAD y, r, param_dil_h, pr;
+--:-:-:-:1      XMAD x, s, param_dil_w, qs;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x, RZ, P5;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x, param_W, P2;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+<ORDERED>
+// sliceI = z*HWN + y*WN + x*N
+01:-:-:-:1      XMAD.LO2C sliceI, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y, param_WN,  sliceI;
+--:-:-:-:1      XMAD      sliceI, x, param_N,   sliceI;
+// sliceF = trs * K
+--:-:-:-:1      XMAD sliceF, trs, param_K, RZ;
+</ORDERED>
+
+<ORDERED>
+// Get a mask of all valid slices in the warp
+--:-:-:-:1      VOTE.ANY ballot, PT, P0;
+// Count the total valid slices
+--:-:2:-:1      POPC warp_slices, ballot;
+// Prepare lutStore for this and next loop
+--:-:-:-:1  @P0 MOV    lutStore, lutStore2;
+02:-:-:-:1      ISCADD lutStore2, warp_slices, lutStore2, 3;
+// Count all the valid slices below this threadid
+--:-:-:-:1  @P0 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
+--:-:3:-:1  @P0 POPC dep_thd_cnt, dep_thd_bits;
+// use the trs increment to space the barrier sync
+--:-:-:-:1      IADD trs, trs, warp_inc;
+// Update the lutStore address from this count
+04:-:-:-:1  @P0 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
+// Store both slice offsets in the lut
+--:1:-:-:1  @P0 STS.64 [lutStore + addr_lut], sliceIF;
+</ORDERED>
+// Keep track of the total size of the lut
+--:-:-:-:1      IADD lut_size, lut_size, warp_slices;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+// Share the lut size with the other warp
+--:1:-:-:2      STS [addr_szLut], lut_size;
+        };
+    }
+    else  # bprop
+    {
+        return $N1 ? q{
+
+// x_prime = qs + s
+// y_prime = pr + r
+// z_prime = mt + t
+--:-:-:-:1      XMAD z_prime, t, param_dil_d, mt;
+--:-:-:-:1      XMAD y_prime, r, param_dil_h, pr;
+--:-:-:-:1      XMAD x_prime, s, param_dil_w, qs;
+--:-:-:-:1      IADD3 x_prime1, qs, 1, s;
+--:-:-:-:1      IADD3 x_prime2, qs, 2, s;
+--:-:-:-:1      IADD3 x_prime3, qs, 3, s;
+
+// z     = z_prime / str_d
+// z_mod = z_prime % str_d
+--:-:-:-:1      XMAD    z, z_prime, param_magic_str_d, RZ;
+--:-:-:-:1      SHR.U32 z, z,       param_shift_str_d;
+--:-:-:-:1      XMAD.U16.S16 z_mod, z, neg_str_d, z_prime;
+// y     = y_prime / str_h
+// y_mod = y_prime % str_h
+--:-:-:-:1      XMAD    y, y_prime, param_magic_str_h, RZ;
+--:-:-:-:1      SHR.U32 y, y,       param_shift_str_h;
+--:-:-:-:1      XMAD.U16.S16 y_mod, y, neg_str_h, y_prime;
+// x     = x_prime / str_w
+// x_mod = x_prime % str_w
+--:-:-:-:1      XMAD    x, x_prime, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x, x,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod, x, neg_str_w, x_prime;
+
+--:-:-:-:1      XMAD    x1, x_prime1, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x1, x1,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod1, x1, neg_str_w, x_prime1;
+
+--:-:-:-:1      XMAD    x2, x_prime2, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x2, x2,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod2, x2, neg_str_w, x_prime2;
+
+--:-:-:-:1      XMAD    x3, x_prime3, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x3, x3,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod3, x3, neg_str_w, x_prime3;
+
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z_prime, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y_prime, RZ, PT;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, z_mod, RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, y_mod, RZ, P1;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P5;
+--:-:-:-:1  @P4 R2P PR, predsI, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,     0x0f;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, x_prime,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, x_prime1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x_prime2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND  P3, PT, x_prime3, RZ, P3;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, x,  param_W, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, x1, param_W, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x2, param_W, P2;
+--:-:-:-:1      ISETP.LT.AND  P3, PT, x3, param_W, P3;
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, x_mod,  RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, x_mod1, RZ, P1;
+--:-:-:-:1      ISETP.EQ.AND  P2, PT, x_mod2, RZ, P2;
+--:-:-:-:1      ISETP.EQ.AND  P3, PT, x_mod3, RZ, P3;
+
+// sliceI = z*HWN + y*WN + x
+01:-:-:-:1      XMAD.LO2C slice0I0, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C slice0I0, y, param_WN,  slice0I0;
+--:-:-:-:1      IADD slice0I1, slice0I0, x1;
+--:-:-:-:1      IADD slice0I2, slice0I0, x2;
+--:-:-:-:1      IADD slice0I3, slice0I0, x3;
+--:-:-:-:1      IADD slice0I0, slice0I0, x;
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I0, -1;
+--:-:-:-:1 @!P1 MOV slice0I1, -1;
+--:-:-:-:1 @!P2 MOV slice0I2, -1;
+--:-:-:-:1 @!P3 MOV slice0I3, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 4;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+--:1:-:-:1  @P5 STS.128 [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : $N2 ? q{
+
+// x_prime = qs + s
+// y_prime = pr + r
+// z_prime = mt + t
+--:-:-:-:1      XMAD  z_prime, t, param_dil_d, mt;
+--:-:-:-:1      XMAD  y_prime, r, param_dil_h, pr;
+--:-:-:-:1      XMAD  x_prime, s, param_dil_w, qs;
+--:-:-:-:1      IADD3 x_prime1, qs, 1, s;
+--:-:-:-:1      IADD3 x_prime2, qs, 2, s;
+--:-:-:-:1      IADD3 x_prime3, qs, 3, s;
+
+// z     = z_prime / str_d
+// z_mod = z_prime % str_d
+--:-:-:-:1      XMAD    z, z_prime, param_magic_str_d, RZ;
+--:-:-:-:1      SHR.U32 z, z,       param_shift_str_d;
+--:-:-:-:1      XMAD.U16.S16 z_mod, z, neg_str_d, z_prime;
+// y     = y_prime / str_h
+// y_mod = y_prime % str_h
+--:-:-:-:1      XMAD    y, y_prime, param_magic_str_h, RZ;
+--:-:-:-:1      SHR.U32 y, y,       param_shift_str_h;
+--:-:-:-:1      XMAD.U16.S16 y_mod, y, neg_str_h, y_prime;
+// x     = x_prime / str_w
+// x_mod = x_prime % str_w
+--:-:-:-:1      XMAD    x, x_prime, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x, x,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod, x, neg_str_w, x_prime;
+
+--:-:-:-:1      XMAD    x1, x_prime1, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x1, x1,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod1, x1, neg_str_w, x_prime1;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z_prime, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y_prime, RZ, PT;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, z_mod, RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, y_mod, RZ, P1;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P3;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, x_prime,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, x_prime1, RZ, P4;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, x,  param_W, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, x1, param_W, P1;
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, x_mod,  RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, x_mod1, RZ, P1;
+
+// sliceI = z*HWN + y*WN + x*2
+01:-:-:-:1      XMAD.LO2C slice0I0, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C slice0I0, y, param_WN,  slice0I0;
+--:-:-:-:1      ISCADD slice0I1, x1, slice0I0, 1;
+--:-:-:-:1      ISCADD slice0I0, x,  slice0I0, 1;
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I0, -1;
+--:-:-:-:1 @!P1 MOV slice0I1, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 3;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+--:1:-:-:1  @P5 STS.64 [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : $SN ? q{
+// x_prime = qs + s
+// y_prime = pr + r
+// z_prime = mt + t
+--:-:-:-:1      XMAD z_prime, t, param_dil_d, mt;
+--:-:-:-:1      XMAD y_prime, r, param_dil_h, pr;
+--:-:-:-:1      XMAD x_prime, s, param_dil_w, qs;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z_prime, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y_prime, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x_prime, RZ, P5;
+
+// z       = z_prime / str_d
+// z_prime = z_prime % str_d
+--:-:-:-:1      XMAD    z, z_prime, param_magic_str_d, RZ;
+--:-:-:-:1      SHR.U32 z, z,       param_shift_str_d;
+--:-:-:-:1      XMAD.U16.S16 z_prime, z, neg_str_d, z_prime;
+// y       = y_prime / str_h
+// y_prime = y_prime % str_h
+--:-:-:-:1      XMAD    y, y_prime, param_magic_str_h, RZ;
+--:-:-:-:1      SHR.U32 y, y,       param_shift_str_h;
+--:-:-:-:1      XMAD.U16.S16 y_prime, y, neg_str_h, y_prime;
+// x       = x_prime / str_w
+// x_prime = x_prime % str_w
+--:-:-:-:1      XMAD    x, x_prime, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x, x,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_prime, x, neg_str_w, x_prime;
+
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, z_prime, RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, y_prime, RZ, P1;
+--:-:-:-:1      ISETP.EQ.AND  P2, PT, x_prime, RZ, P2;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x, param_W, P2;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+// sliceI = z*HWN + y*WN + x*N
+01:-:-:-:1      XMAD.LO2C slice0I, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C slice0I, y, param_WN,  slice0I;
+--:-:-:-:1      XMAD      slice0I, x, param_N,   slice0I;
+
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 2;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+
+--:1:-:-:1  @P5 STS [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : q{
+// x_prime = qs + s
+// y_prime = pr + r
+// z_prime = mt + t
+--:-:-:-:1      XMAD z_prime, t, param_dil_d, mt;
+--:-:-:-:1      XMAD y_prime, r, param_dil_h, pr;
+--:-:-:-:1      XMAD x_prime, s, param_dil_w, qs;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z_prime, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y_prime, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x_prime, RZ, P5;
+
+// z       = z_prime / str_d
+// z_prime = z_prime % str_d
+--:-:-:-:1      XMAD    z, z_prime, param_magic_str_d, RZ;
+--:-:-:-:1      SHR.U32 z, z,       param_shift_str_d;
+--:-:-:-:1      XMAD.U16.S16 z_prime, z, neg_str_d, z_prime;
+// y       = y_prime / str_h
+// y_prime = y_prime % str_h
+--:-:-:-:1      XMAD    y, y_prime, param_magic_str_h, RZ;
+--:-:-:-:1      SHR.U32 y, y,       param_shift_str_h;
+--:-:-:-:1      XMAD.U16.S16 y_prime, y, neg_str_h, y_prime;
+// x       = x_prime / str_w
+// x_prime = x_prime % str_w
+--:-:-:-:1      XMAD    x, x_prime, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x, x,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_prime, x, neg_str_w, x_prime;
+
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, z_prime, RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, y_prime, RZ, P1;
+--:-:-:-:1      ISETP.EQ.AND  P2, PT, x_prime, RZ, P2;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x, param_W, P2;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+<ORDERED>
+// sliceI = z*HWN + y*WN + x*N
+01:-:-:-:1      XMAD.LO2C sliceI, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y, param_WN,  sliceI;
+--:-:-:-:1      XMAD      sliceI, x, param_N,   sliceI;
+// sliceF = trs * K
+--:-:-:-:1      XMAD sliceF, trs, param_K, RZ;
+</ORDERED>
+
+<ORDERED>
+// Get a mask of all valid slices in the warp
+--:-:-:-:1      VOTE.ANY ballot, PT, P0;
+// Count the total valid slices
+--:-:2:-:1      POPC warp_slices, ballot;
+// Prepare lutStore for this and next loop
+--:-:-:-:1  @P0 MOV    lutStore, lutStore2;
+02:-:-:-:1      ISCADD lutStore2, warp_slices, lutStore2, 3;
+// Count all the valid slices below this threadid
+--:-:-:-:1  @P0 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
+--:-:3:-:1  @P0 POPC dep_thd_cnt, dep_thd_bits;
+// use the trs increment to space the barrier sync
+--:-:-:-:1      IADD trs, trs, warp_inc;
+// Update the lutStore address from this count
+04:-:-:-:1  @P0 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
+// Store both slice offsets in the lut
+--:1:-:-:1  @P0 STS.64 [lutStore + addr_lut], sliceIF;
+</ORDERED>
+// Keep track of the total size of the lut
+--:-:-:-:1      IADD lut_size, lut_size, warp_slices;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+// Share the lut size with the other warp
+--:1:-:-:2      STS [addr_szLut], lut_size;
+        };
+    }
++]
+
+END_SETUP:
+
+01:-:-:-:5      BAR.SYNC 0;
+
+// Grab the calculated lut size and get its reciprocal
+// Get the total reduction depth
+[+
+    our $LN; return $LN ? q{
+--:-:1:-:2      LDS lutSize, [addr_szLut];
+    } : q{
+--:-:-:-:6      MOV lutSize, param_TRS;
+    };
++]
+01:-:-:-:0      XMAD endCTRS, lutSize, param_C, RZ;
+--:-:1:-:2      I2F.F32.S32 lutSizeRcp, lutSize;
+--:-:-:-:0      IADD lutSizeM1, lutSize, -1;
+01:-:1:-:1      MUFU.RCP lutSizeRcp, lutSizeRcp;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD endCTRS32, endCTRS, 32;
+// posCTRS would start at tidY; the MOV is disabled and tidY is read directly until posCTRS = tidY + partial is formed below
+//--:-:-:-:1      MOV posCTRS, tidY;
+// partial = endCTRS & 31: if endCTRS is not a multiple of 32 we want to grab the partial amount on the first fetch.
+// If endCTRS is a multiple of 32 (the AND result is zero, P5 set) then make a full 32 line fetch.
+--:-:-:-:1      LOP.AND.Z P5, partial, endCTRS, 31;
+--:-:-:-:1  @P5 MOV partial, 32;
+// channel = tidY / lutSize, evaluated in floating point as float(tidY) * lutSizeRcp
+// Add an epsilon (channel * 2^-24) scaled to the size of the channel estimate, then truncate back to an integer
+--:-:2:-:1      I2F.F32.S32 posCTRSf, tidY;
+03:-:-:-:1      FMUL channel, posCTRSf, lutSizeRcp;
+--:-:-:-:1      FFMA channel, channel, 5.9604644775390625e-08, channel;
+--:-:2:-:1      F2I.S32.F32.TRUNC channel, channel;
+// lutOffset0 = tidY - channel * lutSize, i.e. tidY % lutSize (the scaling to a lut byte offset is applied further down)
+02:-:-:-:1      VMAD.U16.U16 lutOffset0, -channel, lutSize, tidY;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT;
+
+// posCTRS = tidY + partial, tidY1 = tidY + 1
+--:-:-:-:1      IADD posCTRS, tidY, partial;
+--:-:-:-:1      IADD tidY1, tidY, 1;
+[+
+    our ($N1, $N2, $LN, $dshift, $slice_scale, $slice_offset, $slice_load);
+    return $LN ? q{
+// P5 = (tidY < partial) && (lutSize != 0), P6 = (tidY1 < partial) && (lutSize != 0)
+--:-:-:-:1      LOP.AND.NZ P6, RZ, lutSize, -1;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidY,  partial, P6;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidY1, partial, P6;
+
+--:-:-:-:1      SHL lutOffset0, lutOffset0, 3;
+
+// offsetIc0 = channel * DHWN
+// offsetFc0 = channel * TRSK
+--:-:-:-:1      XMAD.LO2C offsetIc0, channel, param_DHWN, RZ;
+--:-:-:-:1      XMAD      offsetFc0, channel, param_TRSK, RZ;
+
+--:-:-:-:1  @P0 IADD lutOffset1, lutOffset0, 8;
+--:-:-:-:1  @P0 MOV  offsetFc1, offsetFc0;
+--:-:-:-:1  @P0 MOV  offsetIc1, offsetIc0;
+--:-:-:-:1 @!P0 MOV  lutOffset1, RZ;
+--:-:-:-:1 @!P0 IADD offsetFc1, offsetFc0, param_TRSK;
+--:-:-:-:1 @!P0 IADD offsetIc1, offsetIc0, param_DHWN;
+
+--:-:5:-:1  @P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut];
+--:-:6:-:1  @P6 LDS.U.64 slice1IF, [lutOffset1 + addr_lut];
+    } : qq{
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidY,  partial, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidY1, partial, PT;
+
+--:-:-:-:1      XMAD offsetFc0, tidY,  param_K, RZ;
+--:-:-:-:1      XMAD offsetFc1, tidY1, param_K, RZ;
+
+--:-:-:-:1      XMAD partial, partial,  param_K, RZ;
+--:-:-:-:1      SHL partial, partial, $dshift;
+
+--:-:-:-:1      ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale;
+--:-:-:-:1      XMAD.LO2C offsetIc0, channel, param_DHWN, RZ;
+
+--:-:-:-:1  \@P0 IADD lutOffset1, lutOffset0, $slice_offset;
+--:-:-:-:1  \@P0 MOV  offsetIc1, offsetIc0;
+--:-:-:-:1 \@!P0 MOV  lutOffset1, sb_offset;
+--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;
+
+--:-:5:-:1  \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4];
+--:-:6:-:1  \@P6 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4];
+    };
++]
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    our ($LN, $dshift);
+    return $LN ? qq{
+10:-:-:-:1      IADD3 offsetFc0, offsetFc0, sliceF0, k;
+--:-:-:-:1      LEA      track0F0.CC, offsetFc0, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0F1,    offsetFc0, param_F[1], RZ, $dshift;
+
+20:-:-:-:1      IADD3 offsetFc1, offsetFc1, sliceF1, k;
+--:-:-:-:1      LEA      track1F0.CC, offsetFc1, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1F1,    offsetFc1, param_F[1], RZ, $dshift;
+    } : qq{
+--:-:-:-:1      IADD  offsetFc0, offsetFc0, k;
+--:-:-:-:1      IADD  offsetFc1, offsetFc1, k;
+--:-:-:-:1      LEA      track0F0.CC, offsetFc0, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0F1,    offsetFc0, param_F[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1F0.CC, offsetFc1, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1F1,    offsetFc1, param_F[1], RZ, $dshift;
+    };
++]
+[+
+    our ($K1, $dtype, $vsize, $dsize);
+    return $K1 ? qq{
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+--:-:-:-:1 \@!P0 MOV F00, RZ;
+--:-:-:-:1 \@!P1 MOV F01, RZ;
+--:-:-:-:1 \@!P2 MOV F02, RZ;
+--:-:-:-:1 \@!P3 MOV F03, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>];
+--:-:1:-:1  \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>];
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;
+--:-:-:-:1      SHF.L.U64 preds, preds, 4, preds;
+--:-:-:-:1 \@!P0 MOV F10, RZ;
+--:-:-:-:1 \@!P1 MOV F11, RZ;
+--:-:-:-:1 \@!P2 MOV F12, RZ;
+--:-:-:-:1 \@!P3 MOV F13, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>];
+--:-:2:-:1  \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>];
+
+--:-:-:-:1  \@P6 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P6 R2P PR,    RZ, 0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+--:-:-:-:1 \@!P0 MOV F20, RZ;
+--:-:-:-:1 \@!P1 MOV F21, RZ;
+--:-:-:-:1 \@!P2 MOV F22, RZ;
+--:-:-:-:1 \@!P3 MOV F23, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>];
+--:-:3:-:1  \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>];
+
+--:-:-:-:1  \@P6 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P6 R2P PR,    RZ, 0x0f;
+--:-:-:-:1      SHF.L.U64 preds, preds, 4, preds;
+--:-:-:-:1 \@!P0 MOV F30, RZ;
+--:-:-:-:1 \@!P1 MOV F31, RZ;
+--:-:-:-:1 \@!P2 MOV F32, RZ;
+--:-:-:-:1 \@!P3 MOV F33, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>];
+--:-:4:-:1  \@P3 LDG.E.CI.$dtype F33, [track1F + ${dsize}x<35>];
+    } : qq{
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, k, param_K,    P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k, param_Km32, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k, param_K,    P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k, param_Km32, P6;
+
+<ORDERED>
+--:-:1:-:1  \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>];
+--:-:2:-:1  \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>];
+--:-:3:-:1  \@P2 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>];
+--:-:4:-:1  \@P3 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>];
+
+--:-:-:-:1 \@!P0 LDS.U.$vsize F0, [addr_zero];
+--:-:-:-:1 \@!P1 LDS.U.$vsize F1, [addr_zero];
+--:-:-:-:1 \@!P2 LDS.U.$vsize F2, [addr_zero];
+--:-:1:-:1 \@!P3 LDS.U.$vsize F3, [addr_zero];
+</ORDERED>
+    };
++]
+</SCHEDULE_BLOCK>
+<SCHEDULE_BLOCK>
+[+
+    our ($N1, $N2, $SN, $dshift, $vsizeI);
+    return $N1 ? qq{
+10:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P2, PT, slice0I2, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice0I3, RZ, P5;
+--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0;
+--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;
+--:-:-:-:1      IADD slice0I2, slice0I2, offsetIc0;
+--:-:-:-:1      IADD slice0I3, slice0I3, offsetIc0;
+--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I4.CC, slice0I2,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I5,    slice0I2,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I6.CC, slice0I3,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I7,    slice0I3,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];
+--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I01, [track0I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I02, [track0I4];
+--:-:5:-:1  \@P3 LDG.E.CI.$vsizeI I03, [track0I6];
+</ORDERED>
+--:-:-:-:1 \@!P0 MOV I00, RZ;
+--:-:-:-:1 \@!P1 MOV I01, RZ;
+--:-:-:-:1 \@!P2 MOV I02, RZ;
+--:-:-:-:1 \@!P3 MOV I03, RZ;
+
+20:-:-:-:1      ISETP.GE.AND P0, PT, slice1I0, RZ, P6;
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice1I1, RZ, P6;
+--:-:-:-:1      ISETP.GE.AND P2, PT, slice1I2, RZ, P6;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice1I3, RZ, P6;
+--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;
+--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;
+--:-:-:-:1      IADD slice1I2, slice1I2, offsetIc1;
+--:-:-:-:1      IADD slice1I3, slice1I3, offsetIc1;
+--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I4.CC, slice1I2,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I5,    slice1I2,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I6.CC, slice1I3,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I7,    slice1I3,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I10, [track1I0];
+--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I11, [track1I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I12, [track1I4];
+--:-:6:-:1  \@P3 LDG.E.CI.$vsizeI I13, [track1I6];
+</ORDERED>
+--:-:-:-:1 \@!P0 MOV I10, RZ;
+--:-:-:-:1 \@!P1 MOV I11, RZ;
+--:-:-:-:1 \@!P2 MOV I12, RZ;
+--:-:-:-:1 \@!P3 MOV I13, RZ;
+
+    } : $N2 ? qq{
+
+10:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;
+20:-:-:-:1      ISETP.GE.AND P2, PT, slice1I0, RZ, P6;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice1I1, RZ, P6;
+--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0;
+--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;
+--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;
+--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;
+--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];
+--:-:5:-:1  \@P1 LDG.E.CI.$vsizeI I02, [track0I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I10, [track1I0];
+--:-:6:-:1  \@P3 LDG.E.CI.$vsizeI I12, [track1I2];
+--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero];
+--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero];
+--:-:-:-:1 \@!P2 LDS.U.$vsizeI I10, [addr_zero];
+--:-:5:-:1 \@!P3 LDS.U.$vsizeI I12, [addr_zero];
+</ORDERED>
+
+    } : $SN ? qq{
+
+10:-:-:-:1      ISETP.GE.AND P5, PT, slice0I, RZ, P5;
+20:-:-:-:1      ISETP.GE.AND P6, PT, slice1I, RZ, P6;
+--:-:-:-:1      IADD3 slice0I, slice0I, offsetIc0, n;
+--:-:-:-:1      IADD3 slice1I, slice1I, offsetIc1, n;
+--:-:-:-:1      LEA      track0I0.CC, slice0I,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, slice1I,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:5:-:1  \@P5 LDG.E.CI.$vsizeI I0, [track0I];
+--:-:6:-:1  \@P6 LDG.E.CI.$vsizeI I1, [track1I];
+--:-:-:-:1 \@!P5 LDS.U.$vsizeI I0, [addr_zero];
+--:-:5:-:1 \@!P6 LDS.U.$vsizeI I1, [addr_zero];
+</ORDERED>
+
+    } : qq{
+--:-:-:-:1      IADD3 offsetIc0, offsetIc0, sliceI0, n;
+--:-:-:-:1      IADD3 offsetIc1, offsetIc1, sliceI1, n;
+--:-:-:-:1      LEA      track0I0.CC, offsetIc0, param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    offsetIc0, param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, offsetIc1, param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    offsetIc1, param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:5:-:1  \@P5 LDG.E.CI.$vsizeI I0, [track0I];
+--:-:6:-:1  \@P6 LDG.E.CI.$vsizeI I1, [track1I];
+--:-:-:-:1 \@!P5 LDS.U.$vsizeI I0, [addr_zero];
+--:-:5:-:1 \@!P6 LDS.U.$vsizeI I1, [addr_zero];
+</ORDERED>
+    };
++]
+</SCHEDULE_BLOCK>
+
+[+
+    our ($convert_in, $K1);
+    return !$convert_in ? '' : $K1 ? qq{
+01:-:-:-:1      $convert_in F00, F00;
+--:-:-:-:1      $convert_in F01, F01;
+--:-:-:-:1      $convert_in F02, F02;
+--:-:1:-:1      $convert_in F03, F03;
+
+02:-:-:-:1      $convert_in F10, F10;
+--:-:-:-:1      $convert_in F11, F11;
+--:-:-:-:1      $convert_in F12, F12;
+--:-:2:-:1      $convert_in F13, F13;
+
+04:-:-:-:1      $convert_in F20, F20;
+--:-:-:-:1      $convert_in F21, F21;
+--:-:-:-:1      $convert_in F22, F22;
+--:-:3:-:1      $convert_in F23, F23;
+
+08:-:-:-:1      $convert_in F30, F30;
+--:-:-:-:1      $convert_in F31, F31;
+--:-:-:-:1      $convert_in F32, F32;
+--:-:4:-:1      $convert_in F33, F33;
+    } : qq{
+01:-:-:-:1      $convert_in F03, F01.H1;
+--:-:-:-:1      $convert_in F02, F01.H0;
+--:-:-:-:1      $convert_in F01, F00.H1;
+--:-:1:-:1      $convert_in F00, F00.H0;
+
+02:-:-:-:1      $convert_in F13, F11.H1;
+--:-:-:-:1      $convert_in F12, F11.H0;
+--:-:-:-:1      $convert_in F11, F10.H1;
+--:-:2:-:1      $convert_in F10, F10.H0;
+
+04:-:-:-:1      $convert_in F23, F21.H1;
+--:-:-:-:1      $convert_in F22, F21.H0;
+--:-:-:-:1      $convert_in F21, F20.H1;
+--:-:3:-:1      $convert_in F20, F20.H0;
+
+08:-:-:-:1      $convert_in F33, F31.H1;
+--:-:-:-:1      $convert_in F32, F31.H0;
+--:-:-:-:1      $convert_in F31, F30.H1;
+--:-:4:-:1      $convert_in F30, F30.H0;
+    };
++]
+[+
+    our ($convert_in, $N1, $N2);
+    return !$convert_in ? '' : $N1 ? qq{
+10:-:-:-:1      $convert_in I03, I03;
+--:-:-:-:1      $convert_in I02, I02;
+--:-:-:-:1      $convert_in I01, I01;
+--:-:5:-:1      $convert_in I00, I00;
+
+20:-:-:-:1      $convert_in I13, I13;
+--:-:-:-:1      $convert_in I12, I12;
+--:-:-:-:1      $convert_in I11, I11;
+--:-:6:-:1      $convert_in I10, I10;
+    } : $N2 ? qq{
+10:-:-:-:1      $convert_in I03, I02.H1;
+--:-:-:-:1      $convert_in I02, I02.H0;
+--:-:-:-:1      $convert_in I01, I00.H1;
+--:-:5:-:1      $convert_in I00, I00.H0;
+
+20:-:-:-:1      $convert_in I13, I12.H1;
+--:-:-:-:1      $convert_in I12, I12.H0;
+--:-:-:-:1      $convert_in I11, I10.H1;
+--:-:6:-:1      $convert_in I10, I10.H0;
+    } : qq{
+10:-:-:-:1      $convert_in I03, I01.H1;
+--:-:-:-:1      $convert_in I02, I01.H0;
+--:-:-:-:1      $convert_in I01, I00.H1;
+--:-:5:-:1      $convert_in I00, I00.H0;
+
+20:-:-:-:1      $convert_in I13, I11.H1;
+--:-:-:-:1      $convert_in I12, I11.H0;
+--:-:-:-:1      $convert_in I11, I10.H1;
+--:-:6:-:1      $convert_in I10, I10.H0;
+    };
++]
+
+01:-:-:-:1      STS.128 [writeFs + 4x<0*32>], F0;
+02:-:-:-:1      STS.128 [writeFs + 4x<1*32>], F1;
+04:-:-:-:1      STS.128 [writeFs + 4x<2*32>], F2;
+08:-:-:-:1      STS.128 [writeFs + 4x<3*32>], F3;
+
+10:-:-:-:1      STS.128 [writeIs + 4x<0*32>], I0;
+20:-:-:-:1      STS.128 [writeIs + 4x<1*32>], I1;
+
+--:-:-:-:0      ISETP.LT.AND P5, PT, posCTRS, endCTRS, PT;
+--:-:5:-:1      I2F.F32.S32 posCTRSf, posCTRS;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD writeFs, writeFs, swapBuf;
+--:-:-:-:1      IADD writeIs, writeIs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;
+
+<ORDERED>
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*32 + 16>];
+</ORDERED>
+
+10:-:-:-:1      FMUL channel, posCTRSf, lutSizeRcp;
+--:-:-:-:1      FFMA channel, channel, 5.9604644775390625e-08, channel;
+--:-:5:-:1      F2I.S32.F32.TRUNC channel, channel;
+
+10:-:-:-:1      VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS;
+--:-:-:-:1      ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT;
+[+
+    our ($N1, $N2, $LN, $dshift, $slice_scale, $slice_offset, $slice_load);
+    return $LN ? q{
+
+--:-:-:-:1      SHL lutOffset0, lutOffset0, 3;
+--:-:-:-:1      XMAD.LO2C offsetIc0, channel, param_DHWN, RZ;
+--:-:-:-:1      XMAD      offsetFc0, channel, param_TRSK, RZ;
+
+--:-:-:-:1  @P0 IADD lutOffset1, lutOffset0, 8;
+--:-:-:-:1  @P0 MOV  offsetFc1, offsetFc0;
+--:-:-:-:1  @P0 MOV  offsetIc1, offsetIc0;
+--:-:-:-:1 @!P0 MOV  lutOffset1, RZ;
+--:-:-:-:1 @!P0 IADD offsetFc1, offsetFc0, param_TRSK;
+--:-:-:-:1 @!P0 IADD offsetIc1, offsetIc0, param_DHWN;
+
+--:-:-:-:1      IADD posCTRS, posCTRS, 32;
+--:-:5:-:1  @P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut];
+--:-:6:-:1  @P5 LDS.U.64 slice1IF, [lutOffset1 + addr_lut];
+
+    } : qq{
+
+--:-:-:-:1      ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale;
+--:-:-:-:1      XMAD.LO2C offsetIc0, channel, param_DHWN, RZ;
+
+--:-:-:-:1  \@P0 IADD lutOffset1, lutOffset0, $slice_offset;
+--:-:-:-:1  \@P0 MOV  offsetIc1, offsetIc0;
+--:-:-:-:1 \@!P0 MOV  lutOffset1, sb_offset;
+--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;
+
+--:-:-:-:1      IADD posCTRS, posCTRS, 32;
+--:-:5:-:1  \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4];
+--:-:6:-:1  \@P5 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4];
+    };
++]
+
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    our ($LN, $dshift);
+    return $LN ? qq{
+10:-:-:-:1      IADD3 offsetFc0, offsetFc0, sliceF0, k;
+--:-:-:-:1      LEA      track0F0.CC, offsetFc0, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0F1,    offsetFc0, param_F[1], RZ, $dshift;
+
+20:-:-:-:1      IADD3 offsetFc1, offsetFc1, sliceF1, k;
+--:-:-:-:1      LEA      track1F0.CC, offsetFc1, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1F1,    offsetFc1, param_F[1], RZ, $dshift;
+    } : qq{
+--:-:-:-:1      IADD   track0F0.CC, track0F0, partial;
+--:-:-:-:1      IADD.X track0F1,    track0F1, RZ;
+--:-:-:-:1      IADD   track1F0.CC, track1F0, partial;
+--:-:-:-:1      IADD.X track1F1,    track1F1, RZ;
+    };
++]
+<ORDERED>
+[+
+    our ($K1, $dtype, $vsize, $dsize);
+    return $K1 ? qq{
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;
+--:-:-:-:1  \@P5 SHF.R.U64 preds, preds, 4, preds;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>];
+--:-:2:-:1  \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>];
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;
+--:-:-:-:1  \@P5 SHF.L.U64 preds, preds, 4, preds;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>];
+--:-:2:-:1  \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>];
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;
+--:-:-:-:1  \@P5 SHF.R.U64 preds, preds, 4, preds;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>];
+--:-:2:-:1  \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>];
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;
+--:-:-:-:1  \@P5 SHF.L.U64 preds, preds, 4, preds;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>];
+--:-:2:-:1  \@P3 LDG.E.CI.$dtype F33, [track1F + ${dsize}x<35>];
+    } : qq{
+--:-:-:-:1      ISETP.LT.AND P0, PT, k, param_K,    P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k, param_Km32, P5;
+
+--:-:2:-:1  \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>];
+--:-:2:-:1  \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>];
+--:-:2:-:1  \@P0 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>];
+--:-:2:-:1  \@P1 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>];
+    };
++]
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    our ($N1, $N2, $SN, $dshift, $vsizeI);
+    return $N1 ? qq{
+<ORDERED>
+10:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P2, PT, slice0I2, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice0I3, RZ, P5;
+</ORDERED>
+--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0;
+--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;
+--:-:-:-:1      IADD slice0I2, slice0I2, offsetIc0;
+--:-:-:-:1      IADD slice0I3, slice0I3, offsetIc0;
+--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I4.CC, slice0I2,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I5,    slice0I2,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I6.CC, slice0I3,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I7,    slice0I3,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];
+--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I01, [track0I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I02, [track0I4];
+--:-:2:-:1  \@P3 LDG.E.CI.$vsizeI I03, [track0I6];
+</ORDERED>
+--:-:-:-:1 \@!P0 MOV I00, RZ;
+--:-:-:-:1 \@!P1 MOV I01, RZ;
+--:-:-:-:1 \@!P2 MOV I02, RZ;
+--:-:-:-:1 \@!P3 MOV I03, RZ;
+
+<ORDERED>
+20:-:-:-:1      ISETP.GE.AND P0, PT, slice1I0, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice1I1, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P2, PT, slice1I2, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice1I3, RZ, P5;
+</ORDERED>
+--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;
+--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;
+--:-:-:-:1      IADD slice1I2, slice1I2, offsetIc1;
+--:-:-:-:1      IADD slice1I3, slice1I3, offsetIc1;
+--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I4.CC, slice1I2,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I5,    slice1I2,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I6.CC, slice1I3,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I7,    slice1I3,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I10, [track1I0];
+--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I11, [track1I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I12, [track1I4];
+--:3:2:-:1  \@P3 LDG.E.CI.$vsizeI I13, [track1I6];
+</ORDERED>
+--:-:-:-:1 \@!P0 MOV I10, RZ;
+--:-:-:-:1 \@!P1 MOV I11, RZ;
+--:-:-:-:1 \@!P2 MOV I12, RZ;
+--:-:-:-:1 \@!P3 MOV I13, RZ;
+
+    } : $N2 ? qq{
+<ORDERED>
+10:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;
+20:-:-:-:1      ISETP.GE.AND P2, PT, slice1I0, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice1I1, RZ, P5;
+</ORDERED>
+--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0;
+--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;
+--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;
+--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;
+--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];
+--:-:2:-:1  \@P1 LDG.E.CI.$vsizeI I02, [track0I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I10, [track1I0];
+--:3:2:-:1  \@P3 LDG.E.CI.$vsizeI I12, [track1I2];
+--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero];
+--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero];
+--:-:-:-:1 \@!P2 LDS.U.$vsizeI I10, [addr_zero];
+--:-:-:-:1 \@!P3 LDS.U.$vsizeI I12, [addr_zero];
+</ORDERED>
+
+    } : $SN ? qq{
+
+10:-:-:-:1      ISETP.GE.AND P0, PT, slice0I, RZ, P5;
+20:-:-:-:1      ISETP.GE.AND P1, PT, slice1I, RZ, P5;
+--:-:-:-:1      IADD3 slice0I, slice0I, offsetIc0, n;
+--:-:-:-:1      IADD3 slice1I, slice1I, offsetIc1, n;
+--:-:-:-:1      LEA      track0I0.CC, slice0I,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, slice1I,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:2:-:1  \@P0 LDG.E.CI.$vsizeI I0, [track0I];
+--:3:2:-:1  \@P1 LDG.E.CI.$vsizeI I1, [track1I];
+--:-:-:-:1 \@!P0 LDS.U.$vsizeI I0, [addr_zero];
+--:-:-:-:1 \@!P1 LDS.U.$vsizeI I1, [addr_zero];
+</ORDERED>
+
+    } : qq{
+--:-:-:-:1      IADD3 offsetIc0, offsetIc0, sliceI0, n;
+--:-:-:-:1      IADD3 offsetIc1, offsetIc1, sliceI1, n;
+--:-:-:-:1      LEA      track0I0.CC, offsetIc0, param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    offsetIc0, param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, offsetIc1, param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    offsetIc1, param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:2:-:1  \@P5 LDG.E.CI.$vsizeI I0, [track0I];
+--:3:2:-:1  \@P5 LDG.E.CI.$vsizeI I1, [track1I];
+</ORDERED>
+    };
++]
+</SCHEDULE_BLOCK>
+
+LOOP:
+[+
+    # Template flags and substitution values. They are only read here; none of
+    # them is assigned in this block.
+    our ($N1, $N2, $SN, $LN, $K1, $dtype, $dshift, $dsize, $vsize, $vsizeI,
+         $convert_in, $slice_scale, $slice_offset, $slice_load);
+
+    # %insert maps a slot key of the form j<J>c<C> to SASS text. The loop at
+    # the end of this block appends the text stored for a slot right after the
+    # FFMA it emits for unroll step J (0..7), position C (0..63).
+    # The list is assigned to the hash in order, so when two selected entries
+    # share a key the later one replaces the earlier one (j5c60 and j6c60, for
+    # example, appear in the $LN list and again in the $N1 / $N2 / $SN lists).
+    my %insert = (
+        # Present in every variant: posCTRSf = float(posCTRS), and P5 / P6 are
+        # set from posCTRS < endCTRS and posCTRS < endCTRS32.
+        j0c1  => "--:-:5:-:1      I2F.F32.S32 posCTRSf, posCTRS;\n",
+        j0c3  => "--:-:-:-:1      ISETP.LT.AND P5, PT, posCTRS, endCTRS,   PT;\n",
+        j0c5  => "--:-:-:-:1      ISETP.LT.AND P6, PT, posCTRS, endCTRS32, PT;\n",
+
+        # channel = trunc(posCTRSf * lutSizeRcp); the FFMA first scales the
+        # product up by a factor of (1 + 2**-24) before the truncation.
+        j0c15 => "10:-:-:-:1  \@P5 FMUL channel, posCTRSf, lutSizeRcp;\n",
+        j0c20 => "--:-:-:-:1  \@P5 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c22 => "--:-:5:-:1  \@P5 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        # Entries used when $LN is true: 64-bit lookups into addr_lut give
+        # slice0IF / slice1IF, and both the I and the F track addresses are
+        # rebuilt with LEA from offsetIc* / offsetFc* on every pass.
+        $LN ? (
+            j0c36 => "10:-:-:-:1  \@P5 VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS;\n" .
+                     "--:-:-:-:1  \@P5 XMAD offsetIc0, channel, param_DHWN, RZ;\n" .
+                     "--:-:-:-:1  \@P5 XMAD offsetFc0, channel, param_TRSK, RZ;\n" .
+                     "--:-:-:-:1      IADD posCTRS, posCTRS, 32;\n",
+
+            j0c38 => "--:-:-:-:1  \@P5 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT;\n" .
+                     "--:-:-:-:1  \@P5 XMAD.PSL offsetIc0, channel, param_DHWN.H1, offsetIc0;\n" .
+                     "--:-:-:-:1  \@P5 SHL lutOffset0, lutOffset0, 3;\n",
+
+            j0c42 => "--:-:5:-:1  \@P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut];\n",
+
+            j0c49 => "--:-:-:-:1  \@P0 I2I.U32.U32 offsetFc1, offsetFc0;\n" .
+                     "--:-:-:-:1 \@!P0 IADD offsetFc1, offsetFc0, param_TRSK;\n",
+
+            j0c50 => "--:-:-:-:1  \@P0 I2I.U32.U32 offsetIc1, offsetIc0;\n" .
+                     "--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;\n",
+
+            j0c51 => "--:-:4:-:1 \@!P0 I2I.U32.U32 lutOffset1, RZ;\n" .
+                     "--:-:-:-:1  \@P0 IADD lutOffset1, lutOffset0, 8;\n",
+
+            j1c44 => "10:-:-:-:1  \@P5 IADD3 offsetFc0, offsetFc0, sliceF0, k;\n",
+            j1c49 => "04:-:-:-:1  \@P5 LEA      track0F0.CC, offsetFc0, param_F[0],     $dshift;\n",
+            j1c54 => "--:-:-:-:1  \@P5 LEA.HI.X track0F1,    offsetFc0, param_F[1], RZ, $dshift;\n",
+
+            j2c16 => "08:-:5:-:1  \@P5 LDS.U.64 slice1IF, [lutOffset1 + addr_lut];\n",
+
+            j3c44 => "10:-:-:-:1  \@P5 IADD3 offsetFc1, offsetFc1, sliceF1, k;\n",
+            j3c49 => "--:-:-:-:1  \@P5 LEA      track1F0.CC, offsetFc1, param_F[0],     $dshift;\n",
+            j3c54 => "--:-:-:-:1  \@P5 LEA.HI.X track1F1,    offsetFc1, param_F[1], RZ, $dshift;\n",
+
+            j5c44 => "--:-:-:-:1  \@P5 IADD3 offsetIc0, offsetIc0, sliceI0, n;\n",
+            j5c49 => "--:-:-:-:1  \@P5 LEA      track0I0.CC, offsetIc0, param_I[0],     $dshift;\n",
+            j5c54 => "--:-:-:-:1  \@P5 LEA.HI.X track0I1,    offsetIc0, param_I[1], RZ, $dshift;\n",
+            j5c60 => "20:-:2:-:1  \@P5 LDG.E.CI.$vsize I0, [track0I];\n",
+
+            j6c44 => "--:-:-:-:1  \@P5 IADD3 offsetIc1, offsetIc1, sliceI1, n;\n",
+            j6c49 => "--:-:-:-:1  \@P5 LEA      track1I0.CC, offsetIc1, param_I[0],     $dshift;\n",
+            j6c54 => "--:-:-:-:1  \@P5 LEA.HI.X track1I1,    offsetIc1, param_I[1], RZ, $dshift;\n",
+            j6c60 => "20:3:2:-:1  \@P5 LDG.E.CI.$vsize I1, [track1I];\n",
+
+        # Entries used when $LN is false: the lookup goes through addr_lut4 with
+        # $slice_scale / $slice_offset / $slice_load, and the F track pointers
+        # are simply advanced by param_K32p. No I loads are defined in this
+        # list; they come from the $N1 / $N2 / $SN lists further down.
+        ) : (
+            j0c36 => "10:-:-:-:1  \@P5 VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS;\n" .
+                     "--:-:-:-:1  \@P5 XMAD offsetIc0, channel, param_DHWN, RZ;\n" .
+                     "--:-:-:-:1      IADD posCTRS, posCTRS, 32;\n",
+
+            j0c39 => "--:-:-:-:1  \@P5 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT;\n" .
+                     "--:-:-:-:1  \@P5 XMAD.PSL offsetIc0, channel, param_DHWN.H1, offsetIc0;\n" .
+                     "--:-:-:-:1  \@P5 ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale;\n",
+
+            j0c43 => "--:-:-:-:1  \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4];\n",
+
+            j0c50 => "--:-:-:-:1  \@P0 I2I.U32.U32 offsetIc1, offsetIc0;\n" .
+                     "--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;\n",
+
+            j0c51 => "--:-:4:-:1 \@!P0 I2I.U32.U32 lutOffset1, sb_offset;\n" .
+                     "--:-:-:-:1  \@P0 IADD lutOffset1, lutOffset0, $slice_offset;\n",
+
+            j2c16 => "08:-:-:-:1  \@P5 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4];\n",
+
+            j1c49 => "04:-:-:-:1  \@P5 IADD   track0F0.CC, track0F0, param_K32p;\n",
+            j1c54 => "--:-:-:-:1  \@P5 IADD.X track0F1,    track0F1, RZ;\n",
+
+            j3c49 => "--:-:-:-:1  \@P5 IADD   track1F0.CC, track1F0, param_K32p;\n",
+            j3c54 => "--:-:-:-:1  \@P5 IADD.X track1F1,    track1F1, RZ;\n",
+        ),
+
+        # I-side address setup and loads for steps 5 and 6. $N1, $N2 and $SN are
+        # tested in that order; when all three are false nothing is added here.
+        # $N1: four slices per step, each with its own predicate P0..P3; a
+        # register is cleared from RZ when its predicate is false.
+        $N1 ? (
+
+            j5c31 => "--:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P2, PT, slice0I2, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I2, slice0I2, offsetIc0;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P3, PT, slice0I3, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I3, slice0I3, offsetIc0;\n",
+
+            j5c32 => "--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift;\n",
+            j5c37 => "--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;\n",
+            j5c42 => "--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track0I4.CC, slice0I2,   param_I[0],     $dshift;\n",
+            j5c47 => "--:-:-:-:1      LEA.HI.X track0I5,    slice0I2,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track0I6.CC, slice0I3,   param_I[0],     $dshift;\n",
+            j5c52 => "--:-:-:-:1      LEA.HI.X track0I7,    slice0I3,   param_I[1], RZ, $dshift;\n",
+
+            j5c55 => "20:-:-:-:1 \@!P0 I2I.U32.U32 I00, RZ;\n",
+            j5c57 => "--:-:-:-:1 \@!P1 I2I.U32.U32 I01, RZ;\n",
+            j5c59 => "--:-:-:-:1 \@!P2 I2I.U32.U32 I02, RZ;\n",
+            j5c61 => "--:-:-:-:1 \@!P3 I2I.U32.U32 I03, RZ;\n",
+
+            j5c56 => "--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];\n",
+            j5c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I01, [track0I2];\n",
+            j5c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I02, [track0I4];\n",
+            j5c62 => "--:-:2:-:1  \@P3 LDG.E.CI.$vsizeI I03, [track0I6];\n",
+
+            j6c31 => "--:-:-:-:1      ISETP.GE.AND P0, PT, slice1I0, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P1, PT, slice1I1, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P2, PT, slice1I2, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I2, slice1I2, offsetIc1;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P3, PT, slice1I3, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I3, slice1I3, offsetIc1;\n",
+
+            j6c32 => "--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;\n",
+            j6c37 => "--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;\n",
+            j6c42 => "--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track1I4.CC, slice1I2,   param_I[0],     $dshift;\n",
+            j6c47 => "--:-:-:-:1      LEA.HI.X track1I5,    slice1I2,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track1I6.CC, slice1I3,   param_I[0],     $dshift;\n",
+            j6c52 => "--:-:-:-:1      LEA.HI.X track1I7,    slice1I3,   param_I[1], RZ, $dshift;\n",
+
+            j6c55 => "20:-:-:-:1 \@!P0 I2I.U32.U32 I10, RZ;\n",
+            j6c57 => "--:-:-:-:1 \@!P1 I2I.U32.U32 I11, RZ;\n",
+            j6c59 => "--:-:-:-:1 \@!P2 I2I.U32.U32 I12, RZ;\n",
+            j6c61 => "--:-:-:-:1 \@!P3 I2I.U32.U32 I13, RZ;\n",
+
+            j6c56 => "--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I10, [track1I0];\n",
+            j6c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I11, [track1I2];\n",
+            j6c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I12, [track1I4];\n",
+            j6c62 => "--:3:2:-:1  \@P3 LDG.E.CI.$vsizeI I13, [track1I6];\n",
+
+        # $N2: two slices per step (predicates P0 and P1); the fallback for a
+        # false predicate is an LDS from addr_zero instead of a move from RZ.
+        ) : $N2 ? (
+
+            j5c31 => "--:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;\n",
+
+            j5c35 => "--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift;\n",
+            j5c40 => "--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;\n",
+            j5c45 => "--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;\n",
+
+            j5c46 => "--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero];\n",
+            j5c47 => "--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero];\n",
+
+            j5c60 => "20:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];\n",
+            j5c62 => "--:-:2:-:1  \@P1 LDG.E.CI.$vsizeI I02, [track0I2];\n",
+
+            j6c31 => "--:-:-:-:1      ISETP.GE.AND P0, PT, slice1I0, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P1, PT, slice1I1, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;\n",
+
+            j6c35 => "--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;\n",
+            j6c40 => "--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;\n",
+            j6c45 => "--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;\n",
+
+            j6c46 => "--:-:-:-:1 \@!P0 LDS.U.$vsizeI I10, [addr_zero];\n",
+            j6c47 => "--:-:-:-:1 \@!P1 LDS.U.$vsizeI I12, [addr_zero];\n",
+
+            j6c60 => "20:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I10, [track1I0];\n",
+            j6c62 => "--:3:2:-:1  \@P1 LDG.E.CI.$vsizeI I12, [track1I2];\n",
+
+        # $SN: one slice per step, predicated on P2, with an LDS from addr_zero
+        # as the fallback. These entries use $vsize, not $vsizeI.
+        ) : $SN ? (
+            j5c31 => "--:-:-:-:1      ISETP.GE.AND P2, PT, slice0I, RZ, P5;\n",
+            j5c45 => "--:-:-:-:1 \@!P2 LDS.U.$vsize I0, [addr_zero];\n",
+
+            j5c44 => "--:-:-:-:1  \@P5 IADD3 offsetIc0, offsetIc0, slice0I, n;\n",
+            j5c49 => "--:-:-:-:1  \@P5 LEA      track0I0.CC, offsetIc0, param_I[0],     $dshift;\n",
+            j5c54 => "--:-:-:-:1  \@P5 LEA.HI.X track0I1,    offsetIc0, param_I[1], RZ, $dshift;\n",
+            j5c60 => "20:-:2:-:1  \@P2 LDG.E.CI.$vsize I0, [track0I];\n",
+
+            j6c31 => "--:-:-:-:1      ISETP.GE.AND P2, PT, slice1I, RZ, P5;\n",
+            j6c45 => "--:-:-:-:1 \@!P2 LDS.U.$vsize I1, [addr_zero];\n",
+
+            j6c44 => "--:-:-:-:1  \@P5 IADD3 offsetIc1, offsetIc1, slice1I, n;\n",
+            j6c49 => "--:-:-:-:1  \@P5 LEA      track1I0.CC, offsetIc1, param_I[0],     $dshift;\n",
+            j6c54 => "--:-:-:-:1  \@P5 LEA.HI.X track1I1,    offsetIc1, param_I[1], RZ, $dshift;\n",
+            j6c60 => "20:3:2:-:1  \@P2 LDG.E.CI.$vsize I1, [track1I];\n",
+        ) : (),
+
+        # Present in every variant: 128-bit stores of F0..F3 and I0..I1 through
+        # writeFs / writeIs at slot c30 of steps 1..6, predicated on P6.
+        j1c30 => "20:6:-:-:1  \@P6 STS.128 [writeFs + 4x<0*32>], F0;\n",
+        j2c30 => "20:6:-:-:1  \@P6 STS.128 [writeFs + 4x<1*32>], F1;\n",
+        j3c30 => "20:6:-:-:1  \@P6 STS.128 [writeFs + 4x<2*32>], F2;\n",
+        j4c30 => "20:6:-:-:1  \@P6 STS.128 [writeFs + 4x<3*32>], F3;\n",
+        j5c30 => "20:6:-:-:1  \@P6 STS.128 [writeIs + 4x<0*32>], I0;\n",
+        j6c30 => "20:6:-:-:1  \@P6 STS.128 [writeIs + 4x<1*32>], I1;\n",
+
+        # When $convert_in is set, its value is interpolated as the mnemonic of
+        # an instruction applied to each loaded register before the store above,
+        # and the DEPBAR sits at slot c5. Otherwise only a DEPBAR at c27 is added.
+        $convert_in ? (
+            j1c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j2c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j3c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j4c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j5c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j6c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            # F registers. $K1: each register is converted in place. Otherwise
+            # the two source registers are expanded from their .H0 / .H1 halves
+            # into four, highest destination first so that a source register is
+            # not overwritten before its halves have been read.
+            $K1 ? (
+                j1c8  => "--:-:-:-:1  \@P6 $convert_in F00, F00;\n",
+                j1c10 => "--:-:-:-:1  \@P6 $convert_in F01, F01;\n",
+                j1c12 => "--:-:-:-:1  \@P6 $convert_in F02, F02;\n",
+                j1c14 => "--:-:6:-:1  \@P6 $convert_in F03, F03;\n",
+
+                j2c8  => "--:-:-:-:1  \@P6 $convert_in F10, F10;\n",
+                j2c10 => "--:-:-:-:1  \@P6 $convert_in F11, F11;\n",
+                j2c12 => "--:-:-:-:1  \@P6 $convert_in F12, F12;\n",
+                j2c14 => "--:-:6:-:1  \@P6 $convert_in F13, F13;\n",
+
+                j3c8  => "--:-:-:-:1  \@P6 $convert_in F20, F20;\n",
+                j3c10 => "--:-:-:-:1  \@P6 $convert_in F21, F21;\n",
+                j3c12 => "--:-:-:-:1  \@P6 $convert_in F22, F22;\n",
+                j3c14 => "--:-:6:-:1  \@P6 $convert_in F23, F23;\n",
+
+                j4c8  => "--:-:-:-:1  \@P6 $convert_in F30, F30;\n",
+                j4c10 => "--:-:-:-:1  \@P6 $convert_in F31, F31;\n",
+                j4c12 => "--:-:-:-:1  \@P6 $convert_in F32, F32;\n",
+                j4c14 => "--:-:6:-:1  \@P6 $convert_in F33, F33;\n",
+            ) : (
+                j1c8  => "--:-:-:-:1  \@P6 $convert_in F03, F01.H1;\n",
+                j1c10 => "--:-:-:-:1  \@P6 $convert_in F02, F01.H0;\n",
+                j1c12 => "--:-:-:-:1  \@P6 $convert_in F01, F00.H1;\n",
+                j1c14 => "--:-:6:-:1  \@P6 $convert_in F00, F00.H0;\n",
+
+                j2c8  => "--:-:-:-:1  \@P6 $convert_in F13, F11.H1;\n",
+                j2c10 => "--:-:-:-:1  \@P6 $convert_in F12, F11.H0;\n",
+                j2c12 => "--:-:-:-:1  \@P6 $convert_in F11, F10.H1;\n",
+                j2c14 => "--:-:6:-:1  \@P6 $convert_in F10, F10.H0;\n",
+
+                j3c8  => "--:-:-:-:1  \@P6 $convert_in F23, F21.H1;\n",
+                j3c10 => "--:-:-:-:1  \@P6 $convert_in F22, F21.H0;\n",
+                j3c12 => "--:-:-:-:1  \@P6 $convert_in F21, F20.H1;\n",
+                j3c14 => "--:-:6:-:1  \@P6 $convert_in F20, F20.H0;\n",
+
+                j4c8  => "--:-:-:-:1  \@P6 $convert_in F33, F31.H1;\n",
+                j4c10 => "--:-:-:-:1  \@P6 $convert_in F32, F31.H0;\n",
+                j4c12 => "--:-:-:-:1  \@P6 $convert_in F31, F30.H1;\n",
+                j4c14 => "--:-:6:-:1  \@P6 $convert_in F30, F30.H0;\n",
+            ),
+            # I registers. The source layout follows the load variant: in place
+            # for $N1, halves of registers 0 and 2 for $N2, and halves of
+            # registers 0 and 1 otherwise.
+            $N1 ? (
+                j5c8  => "--:-:-:-:1  \@P6 $convert_in I03, I03;\n",
+                j5c10 => "--:-:-:-:1  \@P6 $convert_in I02, I02;\n",
+                j5c12 => "--:-:-:-:1  \@P6 $convert_in I01, I01;\n",
+                j5c14 => "--:-:6:-:1  \@P6 $convert_in I00, I00;\n",
+
+                j6c8  => "--:-:-:-:1  \@P6 $convert_in I13, I13;\n",
+                j6c10 => "--:-:-:-:1  \@P6 $convert_in I12, I12;\n",
+                j6c12 => "--:-:-:-:1  \@P6 $convert_in I11, I11;\n",
+                j6c14 => "--:-:6:-:1  \@P6 $convert_in I10, I10;\n",
+            ) : $N2 ? (
+                j5c8  => "--:-:-:-:1  \@P6 $convert_in I03, I02.H1;\n",
+                j5c10 => "--:-:-:-:1  \@P6 $convert_in I02, I02.H0;\n",
+                j5c12 => "--:-:-:-:1  \@P6 $convert_in I01, I00.H1;\n",
+                j5c14 => "--:-:6:-:1  \@P6 $convert_in I00, I00.H0;\n",
+
+                j6c8  => "--:-:-:-:1  \@P6 $convert_in I13, I12.H1;\n",
+                j6c10 => "--:-:-:-:1  \@P6 $convert_in I12, I12.H0;\n",
+                j6c12 => "--:-:-:-:1  \@P6 $convert_in I11, I10.H1;\n",
+                j6c14 => "--:-:6:-:1  \@P6 $convert_in I10, I10.H0;\n",
+            ) : (
+                j5c8  => "--:-:-:-:1  \@P6 $convert_in I03, I01.H1;\n",
+                j5c10 => "--:-:-:-:1  \@P6 $convert_in I02, I01.H0;\n",
+                j5c12 => "--:-:-:-:1  \@P6 $convert_in I01, I00.H1;\n",
+                j5c14 => "--:-:6:-:1  \@P6 $convert_in I00, I00.H0;\n",
+
+                j6c8  => "--:-:-:-:1  \@P6 $convert_in I13, I11.H1;\n",
+                j6c10 => "--:-:-:-:1  \@P6 $convert_in I12, I11.H0;\n",
+                j6c12 => "--:-:-:-:1  \@P6 $convert_in I11, I10.H1;\n",
+                j6c14 => "--:-:6:-:1  \@P6 $convert_in I10, I10.H0;\n",
+            ),
+        ) : (
+            j1c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j2c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j3c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j4c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j5c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j6c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+        ),
+
+        # F-side global loads for steps 1..4.
+        # $K1: four element loads per step, predicated on P0..P3, which R2P
+        # fills from preds (or from RZ when P5 is false). preds is shifted
+        # right by 4 after steps 1 and 3 and left by 4 after steps 2 and 4;
+        # assuming SHF with preds as both halves acts as a rotate, steps 1/3
+        # see one 4-bit group, steps 2/4 the other, and preds ends up restored.
+        $K1 ? (
+            j1c31 => "--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;\n",
+            j1c32 => "--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;\n",
+            j1c33 => "--:-:-:-:1  \@P5 SHF.R.U64 preds, preds, 4, preds;\n",
+            j1c56 => "20:-:-:-:1  \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>];\n",
+            j1c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>];\n",
+            j1c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>];\n",
+            j1c62 => "--:-:2:-:1  \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>];\n",
+
+            j2c31 => "--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;\n",
+            j2c32 => "--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;\n",
+            j2c33 => "--:-:-:-:1  \@P5 SHF.L.U64 preds, preds, 4, preds;\n",
+            j2c56 => "20:-:-:-:1  \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>];\n",
+            j2c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>];\n",
+            j2c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>];\n",
+            j2c62 => "--:-:2:-:1  \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>];\n",
+
+            j3c31 => "--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;\n",
+            j3c32 => "--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;\n",
+            j3c33 => "--:-:-:-:1  \@P5 SHF.R.U64 preds, preds, 4, preds;\n",
+            j3c56 => "20:-:-:-:1  \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>];\n",
+            j3c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>];\n",
+            j3c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>];\n",
+            j3c62 => "--:-:2:-:1  \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>];\n",
+
+            j4c31 => "--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;\n",
+            j4c32 => "--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;\n",
+            j4c33 => "--:-:-:-:1  \@P5 SHF.L.U64 preds, preds, 4, preds;\n",
+            j4c56 => "20:-:-:-:1  \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>];\n",
+            j4c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>];\n",
+            j4c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>];\n",
+            j4c62 => "--:-:2:-:1  \@P3 LDG.E.CI.$dtype F33, [track1F + ${dsize}x<35>];\n",
+
+        # Otherwise: one vector load per step. P0 = (k < param_K) && P5 guards
+        # the loads at element offset 00, P1 = (k < param_Km32) && P5 guards
+        # the loads at element offset 32.
+        ) : (
+            j0c52 => "--:-:-:-:1      ISETP.LT.AND P0, PT, k, param_K,    P5;\n",
+            j0c53 => "--:-:-:-:1      ISETP.LT.AND P1, PT, k, param_Km32, P5;\n",
+
+            j1c60 => "20:-:2:-:1  \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>];\n",
+            j2c60 => "20:-:2:-:1  \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>];\n",
+            j3c60 => "20:-:2:-:1  \@P0 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>];\n",
+            j4c60 => "20:-:2:-:1  \@P1 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>];\n",
+        ),
+
+        # Last slot of step 6: barrier, then flip the buffers. The read
+        # pointers move by -swapBuf, the write pointers by +swapBuf, and
+        # swapBuf is negated so that the next pass flips back.
+        j6c63 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readFs,  readFs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD readIs,  readIs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD writeFs, writeFs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD writeIs, writeIs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;\n",
+
+        # Last slot of step 7: branch back to LOOP while P6 is set.
+        j7c63 => "--:-:-:Y:5  \@P6 BRA.U LOOP;\n",
+    );
+    # Build the 64-entry FFMA issue order as x/y accumulator coordinates.
+    # Each pair of an even x base and a y base expands to four cells using the
+    # offsets below; the y bases are walked in reverse for every other x base.
+    my @cell_offsets = ([0,2],[1,2],[1,0],[0,0]);
+    my @y_bases      = (0,1,4,5);
+    my @ffma_order;
+    for my $x_base (0,2,4,6)
+    {
+        for my $y_base (@y_bases)
+        {
+            for my $delta (@cell_offsets)
+            {
+                push @ffma_order, [$x_base + $delta->[0], $y_base + $delta->[1]];
+            }
+        }
+        @y_bases = reverse @y_bases;
+    }
+
+    # Emit eight unrolled steps of 64 FFMAs. Any text registered in %insert
+    # for a slot is appended directly after the FFMA of that slot.
+    my @emitted;
+    for my $j (0 .. 7)
+    {
+        my $cur_set  = $j & 1;          # register set read by this step's FFMAs
+        my $next_set = 1 - $cur_set;    # register set filled for the next step
+        my $next_row = ($j + 1) % 8;
+        my $ld_pred  = '   ';
+        $ld_pred     = '@P6' if $j == 7;    # the wrap-around loads of step 7 are predicated
+
+        # Shared-memory loads feeding the next step always occupy slots 0, 2, 4 and 6.
+        my @next_loads = (
+            [0, "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*64 + 00>];\n"],
+            [2, "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*32 + 00>];\n"],
+            [4, "--:-:-:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*64 + 32>];\n"],
+            [6, "--:-:1:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*32 + 16>];\n"],
+        );
+        for my $load (@next_loads)
+        {
+            my ($slot, $format) = @$load;
+            $insert{"j${j}c$slot"} = sprintf $format, $ld_pred, $next_set, $next_row;
+        }
+
+        for my $c (0 .. 63)
+        {
+            my ($x, $y) = @{ $ffma_order[$c] };
+            my $ins     = $insert{"j${j}c$c"} || '';
+
+            # The FFMA gets stall 0 when the text inserted after it contains one
+            # of these mnemonics, and stall 1 otherwise.
+            my $stall = 1;
+            $stall    = 0 if $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/;
+
+            # The yield flag is set only at slot 32, and only with stall 1.
+            my $yield = '-';
+            $yield    = 'Y' if $c == 32 && $stall;
+
+            # The first FFMA of every step carries wait mask 01.
+            my $wait = '--';
+            $wait    = '01' if $c == 0;
+
+            my $ctrl = "$wait:-:-:$yield:$stall";
+
+            push @emitted, sprintf "%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $cur_set,$x,  $cur_set,$y,  $x,$y,  $ins;
+        }
+    }
+    return join '', @emitted;
++]
+
+
+<SCHEDULE_BLOCK>
+// Epilogue setup: compute the output offset and the bounds predicate P4,
+// scale the cx*y0..cx*y3 accumulators by alpha and store them through writeOs.
+--:-:-:-:1      MOV alpha, param_alpha;
+
+// P4 = (param_flags == 0)
+--:-:-:-:1      ISETP.EQ.AND P4, PT, RZ, param_flags, PT;
+
+// tid_31 = tid & 31, tid_32 = tid >> 5
+--:-:-:-:1      LOP.AND tid_31, tid, 31;
+--:-:-:-:1      SHR.U32 tid_32, tid, 5;
+
+// readOs = ((tid_32 << 7) + tid_31) << 2
+--:-:-:-:1      ISCADD readOs, tid_32, tid_31, 7;
+--:-:-:-:1      SHL    readOs, readOs, 2;
+
+// P6 = (tid_31 == 0)
+--:-:-:-:1      ISETP.EQ.AND P6, PT, tid_31, RZ, PT;
+
+// k = idx_K*64 + tid_32
+// k04, k08 and k12 are k00 + 4, + 8 and + 12
+--:-:-:-:1      ISCADD  k00, idx_K, tid_32, 6;
+--:-:-:-:1      IADD    k04, k00, 4;
+--:-:-:-:1      IADD    k08, k00, 8;
+--:-:-:-:1      IADD    k12, k00, 12;
+
+[+
+    our $bsum; return $bsum ? q{
+--:-:-:-:1      XMAD      bsum_offset, idx_Q, param_gridN,   idx_N;
+--:-:-:-:1      XMAD.LO2C bsum_offset, idx_P, param_gridQN,  bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum_offset, idx_M, param_gridPQN, bsum_offset;
+    } : '';
++]
+
+[+
+    our $LN; return $LN ? q{
+// n = idx_N*32 + tid31;
+--:-:-:-:1      ISCADD N, idx_N, tid_31, 5;
+// n < N
+--:-:-:-:1      ISETP.LT.AND P4, PT, N, param_N, P4;
+
+// o = k*MPQN + m*PQN + p*QN + q*N + n
+--:-:-:-:1      XMAD      offset, idx_Q, param_N,   N;
+--:-:-:-:1      XMAD.LO2C offset, idx_P, param_QN,  offset;
+--:-:-:-:1      XMAD.LO2C offset, idx_M, param_PQN, offset;
+
+    } : q{
+
+--:-:-:-:1      SHL M, idx_M, param_shiftM;
+--:-:-:-:1      SHL P, idx_P, param_shiftP;
+--:-:-:-:1      SHL Q, idx_Q, param_shiftQ;
+--:-:-:-:1      SHL N, idx_N, param_shiftN;
+
+--:-:-:-:1      BFE.U32 super_M, tid_31, param_SuperM;
+--:-:-:-:1      BFE.U32 super_P, tid_31, param_SuperP;
+--:-:-:-:1      BFE.U32 super_Q, tid_31, param_SuperQ;
+--:-:-:-:1      LOP.AND super_N, tid_31, param_SuperN;
+
+--:-:-:-:1      IADD M, M, super_M;
+--:-:-:-:1      IADD P, P, super_P;
+--:-:-:-:1      IADD Q, Q, super_Q;
+--:-:-:-:1      IADD N, N, super_N;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, M, param_M, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, P, param_P, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, Q, param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P0, PT, N, param_N, P0;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P2;
+
+// o = k*MPQN + m*PQN + p*QN + q*N + N
+--:-:-:-:1      XMAD      offset, Q, param_N,   N;
+--:-:-:-:1      XMAD.LO2C offset, P, param_QN,  offset;
+--:-:-:-:1      XMAD.LO2C offset, M, param_PQN, offset;
+    };
++]
+// Add the k00 * param_MPQN term to the offset built above.
+--:-:-:-:1      XMAD.LO2C offset, k00, param_MPQN, offset;
+
+// MPQN4 = param_MPQN << (dshift() + 2), MPQN16 = param_MPQN << 4
+--:-:-:-:1      MOV MPQN16, param_MPQN;
+--:-:-:-:1      SHL MPQN4,  MPQN16, [+ dshift()+2 +];
+--:-:-:-:1      SHL MPQN16, MPQN16, 4;
+
+--:-:-:-:1      MOV32I one, 1.0;
+
+// shuffle_x*y* = cx*y* * alpha for rows y0..y3
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:1      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:1      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:1      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:1      FMUL shuffle_x7y3, cx7y3, alpha;
+// Two 128-bit stores per row, starting at shuffle_x0 and shuffle_x4.
+--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 00>], shuffle_x0y0;
+--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 00>], shuffle_x0y1;
+--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 16>], shuffle_x4y1;
+--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 00>], shuffle_x0y2;
+--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 16>], shuffle_x4y2;
+--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 16>], shuffle_x4y3;
+</SCHEDULE_BLOCK>
+// Barrier after the stores for rows y0..y3, then STORE_O is called twice;
+// readOs is advanced by a fixed constant between the two calls.
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_O;
+--:-:-:-:0      IADD readOs, readOs, 4x<16*128 + 4*16>;
+--:-:-:-:5      CAL STORE_O;
+
+// The first four products of row y4 are issued ahead of the barrier; the rest
+// of rows y4..y7 follow in the scheduled block below.
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha;
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:0      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+<SCHEDULE_BLOCK>
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:1      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:1      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:1      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:1      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:1      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:1      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:1      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:1      FMUL shuffle_x7y7, cx7y7, alpha;
+// Rows y4..y7 are stored at the same writeOs offsets used earlier for y0..y3.
+--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 00>], shuffle_x0y4;
+--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 16>], shuffle_x4y4;
+--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 00>], shuffle_x0y5;
+--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 16>], shuffle_x4y5;
+--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 00>], shuffle_x0y6;
+--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 16>], shuffle_x4y6;
+--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 16>], shuffle_x4y7;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+
+// Rewind readOs by the same constant, then repeat the two-call store pattern
+// for the second half before exiting.
+--:-:-:-:0      IADD readOs, readOs, -4x<16*128 + 4*16>;
+--:-:-:-:5      CAL STORE_O;
+--:-:-:-:0      IADD readOs, readOs,  4x<16*128 + 4*16>;
+--:-:-:-:5      CAL STORE_O;
+
+--:-:-:-:5      EXIT;
+
+STORE_O:
+
+--:-:-:-:2      ISETP.LT.AND P0, PT, k00, param_K, P4; // k00 < K && n < N
+--:-:-:-:2      ISETP.LT.AND P1, PT, k04, param_K, P4; // k04 < K && n < N
+--:-:-:-:2      ISETP.LT.AND P2, PT, k08, param_K, P4; // k08 < K && n < N
+--:-:-:-:1      ISETP.LT.AND P3, PT, k12, param_K, P4; // k12 < K && n < N
+[+
+    our ($beta, $brelu, $bprelu, $dshift, $dtype);
+    return $beta || $brelu || $bprelu ? qq{
+<SCHEDULE_BLOCK>
+01:-:-:-:1      LEA      Out00_0.CC, offset, param_X[0],     $dshift;
+--:-:-:-:1      LEA.HI.X Out00_1,    offset, param_X[1], RZ, $dshift;
+--:-:-:-:1      IADD     Out04_0.CC, Out00_0, MPQN4;
+--:-:-:-:1      IADD.X   Out04_1,    Out00_1, RZ;
+--:-:-:-:1      IADD     Out08_0.CC, Out04_0, MPQN4;
+--:-:-:-:1      IADD.X   Out08_1,    Out04_1, RZ;
+--:-:-:-:1      IADD     Out12_0.CC, Out08_0, MPQN4;
+--:-:-:-:1      IADD.X   Out12_1,    Out08_1, RZ;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype b00, [Out00_0];
+--:-:-:-:1 \@!P0 MOV b00, RZ;
+--:-:5:-:1  \@P1 LDG.E.CI.$dtype b04, [Out04_0];
+--:-:-:-:1 \@!P1 MOV b04, RZ;
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype b08, [Out08_0];
+--:-:-:-:1 \@!P2 MOV b08, RZ;
+--:-:6:-:1  \@P3 LDG.E.CI.$dtype b12, [Out12_0];
+--:-:-:-:1 \@!P3 MOV b12, RZ;
+</ORDERED>
+</SCHEDULE_BLOCK>
+    } : '';
++]
+[+
+    our $bias;
+    return $bias ? q{
+<SCHEDULE_BLOCK>
+20:-:-:-:1      LEA      Sum00_0.CC, k00, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum00_1,    k00, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum04_0.CC, k04, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum04_1,    k04, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum08_0.CC, k08, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum08_1,    k08, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum12_0.CC, k12, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum12_1,    k12, param_Sum[1], RZ, 2;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI b00, [Sum00_0];
+--:-:-:-:1 @!P0 MOV b00, RZ;
+--:-:5:-:1  @P1 LDG.E.CI b04, [Sum04_0];
+--:-:-:-:1 @!P1 MOV b04, RZ;
+--:-:-:-:1  @P2 LDG.E.CI b08, [Sum08_0];
+--:-:-:-:1 @!P2 MOV b08, RZ;
+--:-:6:-:1  @P3 LDG.E.CI b12, [Sum12_0];
+--:-:-:-:1 @!P3 MOV b12, RZ;
+</ORDERED>
+</SCHEDULE_BLOCK>
+    } : '';
++]
+--:-:-:-:1      LDS o00_0, [readOs + 4x< 0*128 + 0*32 + 0*16>]; // four partials (_0.._3) are read for each of o00/o04/o08/o12
+--:-:-:-:1      LDS o00_1, [readOs + 4x< 0*128 + 1*32 + 0*16>];
+--:-:-:-:1      LDS o00_2, [readOs + 4x< 0*128 + 2*32 + 0*16>];
+--:-:1:Y:1      LDS o00_3, [readOs + 4x< 0*128 + 3*32 + 0*16>];
+--:-:-:-:1      LDS o04_0, [readOs + 4x< 4*128 + 0*32 + 1*16>];
+--:-:-:-:1      LDS o04_1, [readOs + 4x< 4*128 + 1*32 + 1*16>];
+--:-:-:-:1      LDS o04_2, [readOs + 4x< 4*128 + 2*32 + 1*16>];
+--:-:2:Y:1      LDS o04_3, [readOs + 4x< 4*128 + 3*32 + 1*16>];
+--:-:-:-:1      LDS o08_0, [readOs + 4x< 8*128 + 0*32 + 2*16>];
+--:-:-:-:1      LDS o08_1, [readOs + 4x< 8*128 + 1*32 + 2*16>];
+--:-:-:-:1      LDS o08_2, [readOs + 4x< 8*128 + 2*32 + 2*16>];
+--:-:3:Y:1      LDS o08_3, [readOs + 4x< 8*128 + 3*32 + 2*16>];
+--:-:-:-:1      LDS o12_0, [readOs + 4x<12*128 + 0*32 + 3*16>];
+--:-:-:-:1      LDS o12_1, [readOs + 4x<12*128 + 1*32 + 3*16>];
+--:-:-:-:1      LDS o12_2, [readOs + 4x<12*128 + 2*32 + 3*16>];
+--:-:4:Y:1      LDS o12_3, [readOs + 4x<12*128 + 3*32 + 3*16>];
+// Tree-add the four partials of each output: two pair sums first, then the final outNN value.
+<SCHEDULE_BLOCK>
+01:-:-:-:1      FADD o00_0, o00_0, o00_1;
+--:-:-:-:1      FADD o00_2, o00_2, o00_3;
+02:-:-:-:1      FADD o04_0, o04_0, o04_1;
+--:-:-:-:1      FADD o04_2, o04_2, o04_3;
+04:-:-:-:1      FADD o08_0, o08_0, o08_1;
+--:-:-:-:1      FADD o08_2, o08_2, o08_3;
+08:-:-:-:1      FADD o12_0, o12_0, o12_1;
+--:-:-:-:1      FADD o12_2, o12_2, o12_3;
+// Combine the pair sums into out00/out04/out08/out12.
+--:-:-:-:1      FADD out00, o00_0, o00_2;
+--:-:-:-:1      FADD out04, o04_0, o04_2;
+--:-:-:-:1      FADD out08, o08_0, o08_2;
+--:-:-:-:3      FADD out12, o12_0, o12_2;
+[+ # $bias: add bNN to outNN
+    our $bias; return $bias ? q{
+10:-:-:-:1      FADD out00, out00, b00;
+--:-:-:-:1      FADD out04, out04, b04;
+20:-:-:-:1      FADD out08, out08, b08;
+--:-:-:-:1      FADD out12, out12, b12;
+    } : '';
++]
+[+ # $relu: replace outNN with the maximum of outNN and zero
+    our $relu; return $relu ? q{
+// maximum(x, 0)
+--:-:-:-:1      FMNMX out00, out00, RZ, !PT;
+--:-:-:-:1      FMNMX out04, out04, RZ, !PT;
+--:-:-:-:1      FMNMX out08, out08, RZ, !PT;
+--:-:-:-:1      FMNMX out12, out12, RZ, !PT;
+    } : '';
++]
+[+ # $prelu: outNN = max(outNN, 0) + param_beta * min(outNN, 0), with bNN and xNN as scratch
+    our $prelu; return $prelu ? q{
+// maximum(x, 0) + slope * minimum(0, x)
+--:-:-:-:1      FMNMX b00, out00, RZ, !PT;
+--:-:-:-:1      FMNMX b04, out04, RZ, !PT;
+--:-:-:-:1      FMNMX b08, out08, RZ, !PT;
+--:-:-:-:1      FMNMX b12, out12, RZ, !PT;
+
+--:-:-:-:1      FMNMX x00, out00, RZ, PT;
+--:-:-:-:1      FMNMX x04, out04, RZ, PT;
+--:-:-:-:1      FMNMX x08, out08, RZ, PT;
+--:-:-:-:1      FMNMX x12, out12, RZ, PT;
+
+--:-:-:-:1      FFMA out00, x00, param_beta, b00;
+--:-:-:-:1      FFMA out04, x04, param_beta, b04;
+--:-:-:-:1      FFMA out08, x08, param_beta, b08;
+--:-:-:-:1      FFMA out12, x12, param_beta, b12;
+    } : '';
++]
+</SCHEDULE_BLOCK>
+<SCHEDULE_BLOCK>
+[+ # Convert bNN in place with $convert_in (under P0..P3) when a conversion is defined and
+    our ($beta, $brelu, $bprelu, $convert_in); # at least one of $beta, $brelu, $bprelu (all of which read bNN) is set.
+    return $convert_in && ($beta || $brelu || $bprelu) ? qq{
+10:-:1:-:1  \@P0 $convert_in b00, b00;
+--:-:2:-:1  \@P1 $convert_in b04, b04;
+20:-:3:-:1  \@P2 $convert_in b08, b08;
+--:-:4:-:1  \@P3 $convert_in b12, b12;
+    } : '';
++]
+[+ # $beta: outNN += bNN * param_beta
+    our $beta; return $beta ? q{
+11:-:-:-:1      FFMA out00, b00, param_beta, out00;
+02:-:-:-:1      FFMA out04, b04, param_beta, out04;
+24:-:-:-:1      FFMA out08, b08, param_beta, out08;
+08:-:-:-:1      FFMA out12, b12, param_beta, out12;
+    } : '';
++]
+[+ # $brelu: zero outNN unless bNN > 0; P0..P3 are saved in preds first and restored afterwards
+    our $brelu; return $brelu ? q{
+//delta *= x > 0
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+11:-:-:-:1      FSETP.GT.AND P0, PT, b00, RZ, PT;
+02:-:-:-:1      FSETP.GT.AND P1, PT, b04, RZ, PT;
+24:-:-:-:1      FSETP.GT.AND P2, PT, b08, RZ, PT;
+08:-:-:-:1      FSETP.GT.AND P3, PT, b12, RZ, PT;
+--:-:-:-:1 @!P0 MOV out00, RZ;
+--:-:-:-:1 @!P1 MOV out04, RZ;
+--:-:-:-:1 @!P2 MOV out08, RZ;
+--:-:-:-:1 @!P3 MOV out12, RZ;
+--:-:-:Y:d      R2P PR, preds, 0x0f;
+
+    } : '';
++]
+[+ # $bprelu: outNN *= (bNN > 0 ? one : 0) + param_beta * (bNN < 0 ? one : 0); P0..P3 saved and restored via preds
+    our $bprelu; return $bprelu ? q{
+//delta *= ((x > 0) + slope * (x < 0))
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+11:-:-:-:1      FSETP.GT.AND P0, PT, b00, RZ, PT;
+02:-:-:-:1      FSETP.GT.AND P1, PT, b04, RZ, PT;
+24:-:-:-:1      FSETP.GT.AND P2, PT, b08, RZ, PT;
+08:-:-:-:1      FSETP.GT.AND P3, PT, b12, RZ, PT;
+--:-:-:-:1      SEL x00, one, RZ, P0;
+--:-:-:-:1      SEL x04, one, RZ, P1;
+--:-:-:-:1      SEL x08, one, RZ, P2;
+--:-:-:-:1      SEL x12, one, RZ, P3;
+--:-:-:-:1      FSETP.LT.AND P0, PT, b00, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P1, PT, b04, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, b08, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P3, PT, b12, RZ, PT;
+--:-:-:-:1      SEL b00, one, RZ, P0;
+--:-:-:-:1      SEL b04, one, RZ, P1;
+--:-:-:-:1      SEL b08, one, RZ, P2;
+--:-:-:-:1      SEL b12, one, RZ, P3;
+--:-:-:-:1      R2P PR, preds, 0x0f;
+--:-:-:-:1      FFMA b00, b00, param_beta, x00;
+--:-:-:-:1      FFMA b04, b04, param_beta, x04;
+--:-:-:-:1      FFMA b08, b08, param_beta, x08;
+--:-:-:-:1      FFMA b12, b12, param_beta, x12;
+--:-:-:-:1      FMUL out00, out00, b00;
+--:-:-:-:1      FMUL out04, out04, b04;
+--:-:-:-:1      FMUL out08, out08, b08;
+--:-:-:-:2      FMUL out12, out12, b12;
+    } : '';
++]
+[+ # $bsum: sumNN = outNN where the matching predicate P0..P3 is true, otherwise zero
+    our $bsum; return $bsum ? q{
+20:-:-:-:1      SEL sum00, out00, RZ, P0;
+--:-:-:-:1      SEL sum04, out04, RZ, P1;
+--:-:-:-:1      SEL sum08, out08, RZ, P2;
+--:-:-:-:1      SEL sum12, out12, RZ, P3;
+    } : '';
++]
+</SCHEDULE_BLOCK>
+[+ # Convert outNN in place with $convert_out (under P0..P3) when a conversion is defined
+    our $convert_out; return $convert_out ? qq{
+--:-:1:-:1  \@P0 $convert_out out00, out00;
+--:-:2:-:1  \@P1 $convert_out out04, out04;
+--:-:3:-:1  \@P2 $convert_out out08, out08;
+--:-:4:-:1  \@P3 $convert_out out12, out12;
+    } : '';
++]
+// Compute the four output addresses from offset and param_O (each MPQN4 past the previous), then store outNN under P0..P3.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LEA      Out00_0.CC, offset, param_O[0],     [+ dshift() +];
+--:-:-:-:1      LEA.HI.X Out00_1,    offset, param_O[1], RZ, [+ dshift() +];
+--:-:-:-:1      IADD     Out04_0.CC, Out00_0, MPQN4;
+--:-:-:-:1      IADD.X   Out04_1,    Out00_1, RZ;
+--:-:-:-:1      IADD     Out08_0.CC, Out04_0, MPQN4;
+--:-:-:-:1      IADD.X   Out08_1,    Out04_1, RZ;
+--:-:-:-:1      IADD     Out12_0.CC, Out08_0, MPQN4;
+--:-:-:-:1      IADD.X   Out12_1,    Out08_1, RZ;
+
+01:-:-:-:1  @P0 STG.E.CG.[+ dtype() +] [Out00_0], out00;
+02:-:-:-:1  @P1 STG.E.CG.[+ dtype() +] [Out04_0], out04;
+04:-:-:-:1  @P2 STG.E.CG.[+ dtype() +] [Out08_0], out08;
+08:1:-:-:1  @P3 STG.E.CG.[+ dtype() +] [Out12_0], out12;
+</SCHEDULE_BLOCK>
+// Optional $bsum epilogue: P0..P3 are recomputed as kNN < param_K && P6 and the Sum addresses are rebuilt from bsumNN.
+[+ # sumNN is then reduced across the warp by SHFL.BFLY steps of 1, 2, 4, 8 and 16 and stored to the Sum addresses under P0..P3.
+    our $bsum; return $bsum ? q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      XMAD.LO2C bsum00, k00, param_gridMPQN, bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum04, k04, param_gridMPQN, bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum08, k08, param_gridMPQN, bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum12, k12, param_gridMPQN, bsum_offset;
+--:-:-:-:1      LEA      Sum00_0.CC, bsum00, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum00_1,    bsum00, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum04_0.CC, bsum04, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum04_1,    bsum04, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum08_0.CC, bsum08, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum08_1,    bsum08, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum12_0.CC, bsum12, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum12_1,    bsum12, param_Sum[1], RZ, 2;
+--:-:-:-:1      ISETP.LT.AND P0, PT, k00, param_K, P6; // k00 < K && tid31 == 0
+--:-:-:-:1      ISETP.LT.AND P1, PT, k04, param_K, P6; // k04 < K && tid31 == 0
+--:-:-:-:1      ISETP.LT.AND P2, PT, k08, param_K, P6; // k08 < K && tid31 == 0
+--:-:-:-:1      ISETP.LT.AND P3, PT, k12, param_K, P6; // k12 < K && tid31 == 0
+<ORDERED>
+--:-:-:-:1      SHFL.BFLY PT, x00, sum00,  1, 0x1f;
+--:-:5:-:1      SHFL.BFLY PT, x04, sum04,  1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, x08, sum08,  1, 0x1f;
+--:-:6:-:1      SHFL.BFLY PT, x12, sum12,  1, 0x1f;
+10:-:-:-:1      FADD   sum00, x00, sum00;
+--:-:-:-:1      FADD   sum04, x04, sum04;
+20:-:-:-:1      FADD   sum08, x08, sum08;
+--:-:-:-:1      FADD   sum12, x12, sum12;
+--:-:-:-:1      SHFL.BFLY PT, x00, sum00,  2, 0x1f;
+--:-:5:-:1      SHFL.BFLY PT, x04, sum04,  2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, x08, sum08,  2, 0x1f;
+--:-:6:-:1      SHFL.BFLY PT, x12, sum12,  2, 0x1f;
+10:-:-:-:1      FADD   sum00, x00, sum00;
+--:-:-:-:1      FADD   sum04, x04, sum04;
+20:-:-:-:1      FADD   sum08, x08, sum08;
+--:-:-:-:1      FADD   sum12, x12, sum12;
+--:-:-:-:1      SHFL.BFLY PT, x00, sum00,  4, 0x1f;
+--:-:5:-:1      SHFL.BFLY PT, x04, sum04,  4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, x08, sum08,  4, 0x1f;
+--:-:6:-:1      SHFL.BFLY PT, x12, sum12,  4, 0x1f;
+10:-:-:-:1      FADD   sum00, x00, sum00;
+--:-:-:-:1      FADD   sum04, x04, sum04;
+20:-:-:-:1      FADD   sum08, x08, sum08;
+--:-:-:-:1      FADD   sum12, x12, sum12;
+--:-:-:-:1      SHFL.BFLY PT, x00, sum00,  8, 0x1f;
+--:-:5:-:1      SHFL.BFLY PT, x04, sum04,  8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, x08, sum08,  8, 0x1f;
+--:-:6:-:1      SHFL.BFLY PT, x12, sum12,  8, 0x1f;
+10:-:-:-:1      FADD   sum00, x00, sum00;
+--:-:-:-:1      FADD   sum04, x04, sum04;
+20:-:-:-:1      FADD   sum08, x08, sum08;
+--:-:-:-:1      FADD   sum12, x12, sum12;
+--:-:-:-:1      SHFL.BFLY PT, x00, sum00, 16, 0x1f;
+--:-:5:-:1      SHFL.BFLY PT, x04, sum04, 16, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, x08, sum08, 16, 0x1f;
+--:-:6:-:1      SHFL.BFLY PT, x12, sum12, 16, 0x1f;
+10:-:-:-:1      FADD   sum00, x00, sum00;
+--:-:-:-:1      FADD   sum04, x04, sum04;
+20:-:-:-:1      FADD   sum08, x08, sum08;
+--:-:-:-:0      FADD   sum12, x12, sum12;
+</ORDERED>
+</SCHEDULE_BLOCK>
+--:-:-:-:1  @P0 STG.E.CG [Sum00_0], sum00;
+--:-:-:-:1  @P1 STG.E.CG [Sum04_0], sum04;
+--:-:-:-:1  @P2 STG.E.CG [Sum08_0], sum08;
+--:6:-:-:1  @P3 STG.E.CG [Sum12_0], sum12;
+    } : '';
++]
+// Step k00/k04/k08/k12 by 16 and offset by MPQN16, then return.
+--:-:-:-:1      IADD k00, k00, 16;
+--:-:-:-:1      IADD k04, k04, 16;
+--:-:-:-:1      IADD k08, k08, 16;
+--:-:-:-:1      IADD k12, k12, 16;
+--:-:-:-:0      IADD offset, offset, MPQN16;
+
+--:-:-:-:5      RET;
\ No newline at end of file
diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_2x2_3x3_32x32.sass b/Kernel/Convolution/Maxwell/xconv_winograd_2x2_3x3_32x32.sass
new file mode 100644
index 0000000..a8a1ef4
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/xconv_winograd_2x2_3x3_32x32.sass
@@ -0,0 +1,1568 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+# Precision-dependent settings: $type 'h' selects the 16-bit row, any other value the 32-bit row.
+our $type;
+# Column order: dtype, dshift, dsize, vsize, convert_in, convert_out.
+my @half_cfg = ('U16', '1', '2',  '64', 'F2F.F32.F16', 'F2F.F16.F32');
+my @full_cfg = ( '32', '2', '4', '128', '',            '');
+our ($dtype, $dshift, $dsize, $vsize, $convert_in, $convert_out)
+    = $type eq 'h' ? @half_cfg : @full_cfg;
+
+# Accessors for use from inline template expressions.
+sub dtype  { $dtype  }
+sub dsize  { $dsize  }
+sub dshift { $dshift }
+-]
+// Symbolic names for the constant bank 0 words at 0x140 through 0x1f8; the code below refers to them as param_*.
+<CONSTANT_MAPPING>
+    param_S[0]         : c[0x0][0x140]
+    param_S[1]         : c[0x0][0x144]
+    param_X[0]         : c[0x0][0x148]
+    param_X[1]         : c[0x0][0x14c]
+    param_O[0]         : c[0x0][0x150]
+    param_O[1]         : c[0x0][0x154]
+    param_I[0]         : c[0x0][0x158]
+    param_I[1]         : c[0x0][0x15c]
+    param_F[0]         : c[0x0][0x160]
+    param_F[1]         : c[0x0][0x164]
+    param_alpha        : c[0x0][0x168]
+    param_beta         : c[0x0][0x16c]
+    param_flags        : c[0x0][0x170]
+    param_C            : c[0x0][0x174]
+    param_H            : c[0x0][0x178]
+    param_P            : c[0x0][0x17c]
+    param_pad_h        : c[0x0][0x180]
+    param_pad_w        : c[0x0][0x184]
+    param_HWN          : c[0x0][0x188]
+    param_WN           : c[0x0][0x18c]
+    param_PQN          : c[0x0][0x190]
+    param_QN           : c[0x0][0x194]
+    param_Qnk          : c[0x0][0x198]
+    param_nk           : c[0x0][0x19c]
+    param_n            : c[0x0][0x1a0]
+    param_k            : c[0x0][0x1a4]
+    param_magic_Qnk    : c[0x0][0x1a8]
+    param_shift_Qnk    : c[0x0][0x1ac]
+    param_magic_nk     : c[0x0][0x1b0]
+    param_shift_nk     : c[0x0][0x1b4]
+    param_magic_k      : c[0x0][0x1b8]
+    param_shift_k      : c[0x0][0x1bc]
+    param_RSK          : c[0x0][0x1c0]
+    param_4RSKp        : c[0x0][0x1c4]
+    param_4HWNp        : c[0x0][0x1c8]
+    param_gridK        : c[0x0][0x1cc]
+    param_gridP2       : c[0x0][0x1d0]
+    param_gridQ        : c[0x0][0x1d4]
+    param_gridN        : c[0x0][0x1d8]
+    param_gridQN       : c[0x0][0x1dc]
+    param_gridPQN      : c[0x0][0x1e0]
+    param_superP       : c[0x0][0x1e4]
+    param_superQ       : c[0x0][0x1e8]
+    param_superN       : c[0x0][0x1ec]
+    param_shiftP       : c[0x0][0x1f0]
+    param_shiftQ       : c[0x0][0x1f4]
+    param_shiftN       : c[0x0][0x1f8]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // czero<00-63> and cx<0-7>y<0-7> are two sets of names for the same registers 0-63.
+       0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+    // j0*/j1* operand registers; the setup code loads j0Ix*/j0Fy* from readIs/readFs.
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+    // Temporaries for the setup code; these ranges overlap the j0/j1 registers above.
+      64-79 ~ tid, idx_P, idx_Q, idx_N, idx_K, idx_n, idx_k, tid16, tid31, c, addr_zero, partialC
+     80-119 ~ tid1, idx_PQnk, idx_Qnk, idx_nk, magic_Qnk, neg_Qnk, neg_nk, neg_k, div<1-3>, idx_P2, idx_Q2, z<1-2>, negOne, super_P, super_Q
+      80-95 ~ super_N, y, x, ti, ti_sign, x<1-3>, mask_x, preds1, offsetIC
+      80-95 ~ tf, tid31_4, offsetFC
+
+    120-121 : track<0-1>
+    122-127 ~ writeS, readFs, readIs, C, preds, idx_nkpq
+
+      80-95 ~ p, q, n, tid32, tid64, tid_16, tid_1, q2, p2, to, superP, superQ, superN
+      96-99 : Out<0-1>, Sum<0-1>
+    100-121 ~ alpha, one, writeCs, readCs, k, PQN15, tid_31, out_offset, bsum_offset
+
+      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1
+
+      64-79 : m0<0-3>, m1<0-3>, m2<0-3>, m3<0-3>
+      80-95 : t0<0-1>, t1<0-1>, t2<0-1>, t3<0-1>
+
+    3,2,11,10,19,18,27,26,1,0,9,8,17,16,25,24 ~ b<00|01|10|11>, x<00|01|10|11>, sum<0|1>, s0<0-1>, s1<0-1>
+
+        // Image registers (registers assigned to avoid bank conflicts)
+         96 = i00
+         97 = i01
+         98 = i02
+         99 = i03
+        100 = i30
+        101 = i31
+        102 = i32
+        103 = i33
+        105 = i13
+        104 = i12
+        107 = i11
+        106 = i10
+        108 = i23, TI23, I23
+        109 = i22, TI22
+        110 = i21, TI21
+        111 = i20, TI20, I20
+        113 = TI00, I00, TI10, I10, I21, I01
+        112 = TI01, I11
+        115 = TI02, I12
+        114 = TI03, I03, TI11, I31
+        116 = TI30, I30, TI12, I32
+        117 = TI31
+        118 = TI32
+        119 = TI33, I33, TI13, I13, I22, I02
+    // Filter registers
+[+
+    our $FX; # true: one packed F0..F3 register range; false: individually placed filter registers
+    return $FX ? q{
+    104-119 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3>
+    } : q{
+         96 = f00, TF00, F00
+         97 = f01, TF01
+         98 = f02, TF02, F03
+         99 = f10
+        100 = f11
+        101 = f12
+        102 = f20, TF30, F30
+        103 = f21, TF31
+        104 = f22, TF32, F33
+        105 = tb3, F32
+        106 = tb0, F02
+        107 = ta2, TF22, F23
+        108 = ta0, TF20, F20
+        109 = ta1, TF21
+        110 = F01
+        111 = F31
+        112 = TF10, F10
+        113 = TF11
+        114 = TF12, F13
+        115 = tb1, F12
+        116 = tb2, F22
+        117 = F11
+        118 = F21
+    };
++]
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X; // thread index
+--:-:2:-:1      S2R idx_PQnk, SR_CTAID.X; // block index x, split below into idx_P2, idx_Q2, idx_n and idx_k
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 128, PT; // P0 = tid >= 128; those threads branch to FILTER_SETUP after this block
+
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+
+// c = (tid & 127) / 32
+--:-:-:-:1      BFE.U32 c, tid, 0x205; // 2 bits at position 5
+// addr_zero = (c << 11) + (tid31 << 4), plus 8192 when P0
+--:-:-:-:1      SHL addr_zero, tid31, 4;
+--:-:-:-:1      ISCADD addr_zero, c, addr_zero, 11;
+--:-:-:-:1  @P0 IADD addr_zero, addr_zero, 4x<512*4>;
+// Store 16 bytes of zeros at addr_zero + 0, 512, 1024 and 1536.
+--:-:-:-:1      STS.128 [addr_zero + 4x<00*4>], RZ;
+--:-:-:-:1      STS.128 [addr_zero + 4x<32*4>], RZ;
+--:-:-:-:1      STS.128 [addr_zero + 4x<64*4>], RZ;
+--:-:-:-:1      STS.128 [addr_zero + 4x<96*4>], RZ;
+// Emit 16 LDS.U.128 loads from addr_zero into czero00, czero04, ... czero60.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+
+// idx_P2 = idx_PQnk / blk_Qnk
+--:-:-:-:1      MOV  magic_Qnk, param_magic_Qnk;
+--:-:-:-:1      ISETP.NE.AND P1, PT,  magic_Qnk, 1, PT;
+02:-:-:-:1  @P1 XMAD     div1, idx_PQnk,    magic_Qnk,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, idx_PQnk,    magic_Qnk.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, idx_PQnk.H1, magic_Qnk.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, idx_PQnk.H1, magic_Qnk,    div1;
+--:-:-:-:1  @P1 IADD3.RS idx_P2, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  idx_P2, idx_P2,   param_shift_Qnk;
+--:-:-:-:1 @!P1 SHR.U32  idx_P2, idx_PQnk, param_shift_Qnk;
+
+// idx_Qnk = idx_PQnk % blk_Qnk
+--:-:-:-:1      IADD neg_Qnk, RZ, -param_Qnk;
+--:-:-:-:1      XMAD.LO2 idx_Qnk, neg_Qnk, idx_P2, idx_PQnk;
+
+// idx_Q2  = idx_Qnk / nk
+--:-:-:-:1      XMAD.LO2C idx_Q2, idx_Qnk, param_magic_nk, RZ;
+--:-:-:-:1      SHR.U32   idx_Q2, idx_Q2,  param_shift_nk;
+// idx_nk = idx_Qnk % nk
+--:-:-:-:1      IADD neg_nk, RZ, -param_nk;
+--:-:-:-:1      XMAD.S16.U16  idx_nk, neg_nk, idx_Q2, idx_Qnk;
+
+// idx_n = idx_nk / k
+--:-:-:-:1      XMAD    idx_n,  idx_nk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32 idx_n,  idx_n,  param_shift_k;
+// idx_k = idx_nk % k
+--:-:-:-:1      IADD neg_k, RZ, -param_k;
+--:-:-:-:1      XMAD.S16.U16 idx_k, neg_k, idx_n, idx_nk;
+
+// Implement a square wave block id remapping (for all but last row (if odd number of rows))
+// idx_P = idx_P2 * 2
+// idx_Q = idx_Q2
+// if idx_P2 != gridP2:
+//     idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1)
+//     idx_Q  = idx_Q2 >> 1
+--:-:-:-:1      ISETP.NE.AND P1, PT, idx_P2, param_gridP2, PT;
+--:-:-:-:1      SHL idx_P, idx_P2, 1;
+--:-:-:-:1  @P1 LOP.AND z1, idx_Q2, 1;
+--:-:-:-:1  @P1 BFE.U32 z2, idx_Q2, 0x101; // 1 bit at position 1
+--:-:-:-:1  @P1 LOP.XOR z1, z1, z2;
+--:-:-:-:1  @P1 IADD idx_P, idx_P, z1;
+--:-:-:-:1  @P1 SHR.U32 idx_Q, idx_Q2, 1;
+--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2;
+
+// Scan backwards on odd rows
+// if idx_P2 & 1:
+//     idx_Q = gridQ - idx_Q - 1
+--:-:-:-:1      LOP.AND.NZ P2, RZ, idx_P2, 1;
+--:-:-:-:1      MOV negOne, -1;
+--:-:-:-:1  @P2 IADD3 idx_Q, -idx_Q, param_gridQ, negOne;
+// Pack idx_Q (low bits), idx_P (12 bits at position 12), idx_k (4 bits at 24) and idx_n (4 bits at 28) into idx_nkpq.
+--:-:-:-:1      BFI idx_nkpq, idx_P, 0x0c0c, idx_Q;
+--:-:-:-:1      BFI idx_nkpq, idx_k, 0x0418, idx_nkpq;
+--:-:-:-:1      BFI idx_nkpq, idx_n, 0x041c, idx_nkpq;
+
+// x = grid_x << shiftX
+// y = grid_y << shiftY
+--:-:-:-:1      SHL idx_P, idx_P, param_shiftP;
+--:-:-:-:1      SHL idx_Q, idx_Q, param_shiftQ;
+
+// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp
+--:-:-:-:1      BFE.U32 super_P, tid, param_superP;
+--:-:-:-:1      BFE.U32 super_Q, tid, param_superQ;
+--:-:-:-:1      ISCADD idx_P, super_P,  idx_P, 1;
+--:-:-:-:1      ISCADD idx_Q, super_Q,  idx_Q, 1;
+
+// If this value is not a multiple of 4 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 4 then make a full 4 line fetch.
+--:-:-:-:1      MOV C, param_C;
+--:-:-:-:1      LOP.AND.Z P6, partialC, C, 3;
+--:-:-:-:1 @!P6 IADD3 C, C, 4, -partialC;
+--:-:-:-:1  @P6 MOV partialC, 4;
+// P6 = c < partialC
+--:-:-:-:1      ISETP.LT.AND P6, PT, c, partialC, PT;
+
+[+
+    our $FX; return $FX ? '' : q{
+// writeS = c*512 + tid & 31
+--:-:-:-:1      ISCADD writeS, c, tid31, 9;
+--:-:-:-:1      ISCADD writeS, writeS, 4x<512*4*2>, 2;
+    }
++]
+
+// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readFs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid16,  tid,   -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,  1;
+
+--:-:-:-:1      BFE.U32  readIs, tid,    0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs, readIs, tid16;
+--:-:-:-:1      SHL      readIs, readIs, 4;
+
+--:-:-:-:1      LOP.AND  tid1,   tid,    1;
+--:-:-:-:1      LOP.AND  readFs, tid,    8;
+--:-:-:-:1      SHR.U32  readFs, readFs, 2;
+--:-:-:-:1      LOP3.LUT readFs, readFs, tid16, tid1, 0xfe;
+--:-:-:-:1      ISCADD   readFs, readFs, 4x<512*4>, 4;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P0 BRA.U FILTER_SETUP;
+
+--:-:1:-:2      S2R idx_N, SR_CTAID.Z;
+// Setup for threads that did not branch to FILTER_SETUP: build the input address (track) and bounds predicates, then issue the first 4x4 set of loads.
+
+<SCHEDULE_BLOCK>
+
+// writeS = c*512 + tid & 31
+[+
+    our $FX;
+    return $FX ? q{
+--:-:-:-:1      ISCADD writeS, c, tid31, 9;
+--:-:-:-:1      ISCADD writeS, writeS, 4x<512*4*2>, 2;
+    } : '';
++]
+
+--:-:-:-:1      LOP.AND super_N, tid, param_superN;
+
+01:-:-:-:1      XMAD idx_N, idx_N, param_n, idx_n;
+--:-:-:-:1      SHL  idx_N, idx_N, param_shiftN;
+--:-:-:-:1      IADD idx_N, idx_N, super_N;
+
+// n < N
+--:-:-:-:1      ISETP.LT.AND P5, PT, idx_N, 1x<$N>, PT;
+
+// Subtract off the padding
+--:-:-:-:1      IADD y, idx_P, -param_pad_h;
+--:-:-:-:1      IADD x, idx_Q, -param_pad_w;
+
+// a0 = n + x*N + y*XN + c*YXN
+--:-:-:-:1      XMAD.S16.U16      ti, x,  1x<$N>,    idx_N;
+--:-:-:-:1      XMAD.S16.U16.LO2C ti, y,  param_WN,  ti;
+--:-:-:-:1      XMAD.S16.U16.LO2C ti, c,  param_HWN, ti;
+--:-:-:-:1      ISET.LT.AND ti_sign, ti, RZ, PT;
+--:-:-:-:1      LEA    track0.CC, ti,      param_I[0], [+ dshift() +];
+--:-:-:-:1      IADD.X track1,    ti_sign, param_I[1];
+
+--:-:-:-:1      IADD x1, x, 1;
+--:-:-:-:1      IADD x2, x, 2;
+--:-:-:-:1      IADD x3, x, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, x,  1x<$W>, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, 1x<$W>, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, 1x<$W>, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, 1x<$W>, PT;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x0f;
+// x1..x3 are reused for y+1..y+3; each row predicate also requires P5 (n < N).
+--:-:-:-:1      IADD x1, y, 1;
+--:-:-:-:1      IADD x2, y, 2;
+--:-:-:-:1      IADD x3, y, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, y,  param_H, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_H, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_H, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_H, P5;
+--:-:-:-:1      ISETP.GE.AND P0, PT, y,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+// preds holds one 4-bit copy of mask_x per row whose predicate passed (rows 0..3 at bits 0, 4, 8, 12).
+--:-:-:-:1      SEL preds, mask_x, RZ, P0;
+--:-:-:-:1  @P1 BFI preds, mask_x, 0x404, preds;
+--:-:-:-:1  @P2 BFI preds, mask_x, 0x408, preds;
+--:-:-:-:1  @P3 BFI preds, mask_x, 0x40c, preds;
+
+// For partial C on first load
+--:-:-:-:1      SEL preds1, preds, RZ, P6;
+
+// offsetIC = partialC*YXN
+--:-:-:-:1      XMAD.LO2C offsetIC, partialC, param_HWN, RZ;
+// Each R2P sets P0..P3 from the low 4 bits of preds1; the shifts in between rotate preds1 so rows are used in the order 0, 3, 1, 2.
+--:-:-:-:1      R2P PR, preds1, 0x0f;
+--:-:-:-:1      SHF.R.U64 preds1, preds1, 12, preds1;
+
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>];
+--:-:-:-:1 @!P0 MOV i00, RZ;
+--:-:-:-:1 @!P1 MOV i01, RZ;
+--:-:-:-:1 @!P2 MOV i02, RZ;
+--:-:-:-:1 @!P3 MOV i03, RZ;
+
+--:-:-:-:1      R2P PR, preds1, 0x0f;
+--:-:-:-:1      SHF.L.U64 preds1, preds1, 8, preds1;
+
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>];
+--:-:-:-:1 @!P0 MOV i30, RZ;
+--:-:-:-:1 @!P1 MOV i31, RZ;
+--:-:-:-:1 @!P2 MOV i32, RZ;
+--:-:-:-:1 @!P3 MOV i33, RZ;
+
+--:-:-:-:1      R2P PR, preds1, 0x0f;
+--:-:-:-:1      SHF.R.U64 preds1, preds1, 4, preds1;
+
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>];
+--:-:-:-:1 @!P0 MOV i10, RZ;
+--:-:-:-:1 @!P1 MOV i11, RZ;
+--:-:-:-:1 @!P2 MOV i12, RZ;
+--:-:-:-:1 @!P3 MOV i13, RZ;
+
+--:-:-:-:1      R2P PR, preds1, 0x0f;
+
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() +]x<2*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>];
+--:6:2:-:1  @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>];
+--:-:-:-:1 @!P0 MOV i20, RZ;
+--:-:-:-:1 @!P1 MOV i21, RZ;
+--:-:-:-:1 @!P2 MOV i22, RZ;
+--:-:-:-:1 @!P3 MOV i23, RZ;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*512 + 16>];
+--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*512 + 16>];
+// Advance track by offsetIC elements (partialC * HWN) before entering the loop.
+20:-:-:-:6      LEA      track0.CC, offsetIC, track0,     [+ dshift() +];
+--:-:-:-:0      LEA.HI.X track1,    offsetIC, track1, RZ, [+ dshift() +];
+
+--:-:-:-:5      BRA.U IMAGE_LOOP;
+
+
+
+FILTER_SETUP:
+// Reached by threads with tid >= 128 (P0): build the filter address (track) from param_F and issue the first filter loads.
+--:-:1:-:2      S2R idx_K, SR_CTAID.Y;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+// $FX selects the filter path: four vector loads into F0..F3, or nine scalar loads f00..f22 guarded by idx_K < K and P6.
+[+
+    our ($dtype, $dshift, $FX, $K, $vsize, $dsize);
+    return $FX ? qq{
+
+// writeS = (c*512 + (tid & 31)*4)*4
+--:-:-:-:1      SHL writeS, tid31, 4;
+--:-:-:-:1      ISCADD writeS, c, writeS, 11;
+--:-:-:-:1      LOP.XOR writeS, writeS, 4x<512*4*2>;
+
+// (kBlks,C,4,4,32)
+// offset = idx_K*C*512 + c*512 + tid31*4;
+--:-:-:-:1      SHL    tid31_4,  tid31, 2;
+--:-:-:-:1      XMAD   tf, idx_K, param_C, c;
+--:-:-:-:1      ISCADD tf, tf, tid31_4, 9;
+--:-:-:-:1      LEA      track0.CC, tf, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1,    tf, param_F[1], RZ, $dshift;
+
+// offsetFC = partialC*512
+--:-:-:-:1      SHL  offsetFC, partialC, 9;
+
+--:-:-:-:1 \@!P6 LDS.U.$vsize F0, [addr_zero];
+--:-:-:-:1 \@!P6 LDS.U.$vsize F1, [addr_zero];
+--:-:-:-:1 \@!P6 LDS.U.$vsize F2, [addr_zero];
+--:-:-:-:1 \@!P6 LDS.U.$vsize F3, [addr_zero];
+<ORDERED>
+--:-:2:-:1  \@P6 LDG.E.CG.$vsize F0, [track + 4x<00 * $dsize>];
+--:-:3:-:1  \@P6 LDG.E.CG.$vsize F1, [track + 4x<32 * $dsize>];
+--:-:4:-:1  \@P6 LDG.E.CG.$vsize F2, [track + 4x<64 * $dsize>];
+--:6:5:-:1  \@P6 LDG.E.CG.$vsize F3, [track + 4x<96 * $dsize>];
+</ORDERED>
+
+    } : qq{
+// k = idx_K*32 + tid & 31
+--:-:-:-:1      ISCADD  idx_K, idx_K, tid31,  5;
+--:-:-:-:1      ISETP.LT.AND P0, PT, idx_K, 1x<$K>, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, idx_K, 1x<$K>, PT;
+
+// offsetFC = partialC * RSK
+--:-:-:-:1      XMAD.LO2C offsetFC, partialC, param_RSK, RZ;
+
+// a0 = k + c*RSK
+--:-:-:-:1      XMAD.LO2C tf, c, param_RSK, idx_K;
+
+--:-:-:-:1      LEA      track0.CC, tf, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1,    tf, param_F[1], RZ, $dshift;
+
+--:-:-:-:1 \@!P0 MOV f00, RZ;
+--:-:-:-:1 \@!P0 MOV f01, RZ;
+--:-:-:-:1 \@!P0 MOV f02, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];
+--:-:-:-:1 \@!P0 MOV f20, RZ;
+--:-:-:-:1 \@!P0 MOV f21, RZ;
+--:-:-:-:1 \@!P0 MOV f22, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>];
+--:-:-:-:1 \@!P0 MOV f10, RZ;
+--:-:-:-:1 \@!P0 MOV f11, RZ;
+--:-:-:-:1 \@!P0 MOV f12, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];
+--:6:2:-:1  \@P0 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];
+    };
++]
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*512 + 16>];
+--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*512 + 16>];
+// Advance track by offsetFC elements before entering the loop.
+20:-:-:-:6      LEA      track0.CC, offsetFC, track0,     [+ dshift() +];
+--:-:-:-:0      LEA.HI.X track1,    offsetFC, track1, RZ, [+ dshift() +];
+
+--:-:-:-:5      BRA.U FILTER_LOOP;
+
+
+IMAGE_LOOP:
+--:-:-:-:1      ISETP.GT.AND P6, PT, C, 4, PT;
+[+
+    our ($dtype, $dsize, $convert_in, $W, $N);
+    my %insert = (
+
+        j0c1  => "--:-:-:-:1      ISETP.GT.AND P5, PT, C, RZ, PT;\n" .
+                 "--:-:-:-:1      IADD C, C, -4;\n",
+
+
+        j0c14 => "--:-:-:-:1      R2P PR, preds, 0x0f;\n",
+        j0c16 => "--:-:-:-:1  \@P6 SHF.R.U64 preds, preds, 12, preds;\n",
+
+        $convert_in ? (
+            j0c3  => "02:-:-:-:1      $convert_in i00, i00;\n",
+            j0c5  => "--:-:-:-:1      $convert_in i01, i01;\n",
+            j0c7  => "--:-:-:-:1      $convert_in i02, i02;\n",
+            j0c9  => "--:-:-:-:0 \@!P6 MOV preds, RZ;\n" .
+                     "--:-:-:-:1      $convert_in i03, i03;\n",
+
+            j0c11 => "--:-:-:-:1      $convert_in i20, i20;\n",
+            j0c13 => "--:-:-:-:1      $convert_in i21, i21;\n",
+            j0c15 => "--:-:-:-:1      $convert_in i22, i22;\n",
+            j0c17 => "--:-:2:-:1      $convert_in i23, i23;\n",
+
+            j0c19 => "--:-:-:-:1      $convert_in i10, i10;\n",
+            j0c21 => "--:-:-:-:1      $convert_in i11, i11;\n",
+            j0c23 => "--:-:-:-:1      $convert_in i12, i12;\n",
+            j0c25 => "--:-:-:-:1      $convert_in i13, i13;\n",
+
+            j0c27 => "--:-:-:-:1      $convert_in i30, i30;\n",
+            j0c29 => "--:-:-:-:1      $convert_in i31, i31;\n",
+            j0c31 => "--:-:-:-:1      $convert_in i32, i32;\n",
+            j0c33 => "--:-:3:-:1      $convert_in i33, i33;\n",
+        ) : (
+            j0c9  => "--:-:-:-:1 \@!P6 MOV preds, RZ;\n",
+        ),
+
+        j0c32 => "02:-:-:-:1  \@P5 FADD TI00, i00, -i20;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI01, i01, -i21;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI02, i02, -i22;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI03, i03, -i23;\n",
+
+        j0c35 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i00, [track + ${dsize}x<0*$W*$N + 0*$N>];\n",
+        j0c37 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i01, [track + ${dsize}x<0*$W*$N + 1*$N>];\n",
+        j0c39 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i02, [track + ${dsize}x<0*$W*$N + 2*$N>];\n",
+        j0c41 => "--:-:-:-:1  \@P3 LDG.E.CI.$dtype i03, [track + ${dsize}x<0*$W*$N + 3*$N>];\n",
+        j0c43 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i00, RZ;\n",
+        j0c45 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i01, RZ;\n",
+        j0c47 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i02, RZ;\n",
+        j0c49 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i03, RZ;\n" .
+                 "--:-:-:-:1      R2P PR, preds, 0x0f;\n",
+
+        j0c50 => "--:-:-:-:1  \@P6 SHF.L.U64 preds, preds, 8, preds;\n",
+
+        j0c55 => "04:-:-:-:1  \@P5 FADD TI30, i10, -i30;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI31, i11, -i31;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI32, i12, -i32;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI33, i13, -i33;\n",
+
+        j0c57 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i30, [track + ${dsize}x<3*$W*$N + 0*$N>];\n",
+        j0c59 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i31, [track + ${dsize}x<3*$W*$N + 1*$N>];\n",
+        j0c61 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i32, [track + ${dsize}x<3*$W*$N + 2*$N>];\n",
+        j0c63 => "--:-:-:-:1  \@P3 LDG.E.CI.$dtype i33, [track + ${dsize}x<3*$W*$N + 3*$N>];\n",
+        j1c1  => "--:-:-:-:1 \@!P0 I2F.F32.U32 i30, RZ;\n",
+        j1c3  => "--:-:-:-:1 \@!P1 I2F.F32.U32 i31, RZ;\n",
+        j1c5  => "--:-:-:-:1 \@!P2 I2F.F32.U32 i32, RZ;\n",
+        j1c7  => "--:-:-:-:1 \@!P3 I2F.F32.U32 i33, RZ;\n" .
+                 "--:-:-:-:1      R2P PR, preds, 0x0f;\n" .
+                 "--:-:-:-:1  \@P5 FADD I00, TI00, -TI02;\n" .
+                 "--:-:-:-:1  \@P5 FADD I03, TI01, -TI03;\n" .
+                 "--:-:-:-:1  \@P5 FADD I30, TI30, -TI32;\n" .
+                 "--:-:-:-:1  \@P5 FADD I33, TI31, -TI33;\n" .
+                 "--:-:-:-:1  \@P6 SHF.R.U64 preds, preds, 4, preds;\n",
+
+        j1c9  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 0)>], I00;\n",
+        j1c11 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 3)>], I03;\n",
+        j1c13 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 0)>], I30;\n",
+        j1c15 => "--:3:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 3)>], I33;\n",
+
+
+        j1c29 => "04:-:-:-:1  \@P5 FADD TI10,  i10, i20;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI20, -i10, i20;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI11,  i11, i21;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI21, -i11, i21;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI12,  i12, i22;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI22, -i12, i22;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI13,  i13, i23;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI23, -i13, i23;\n",
+
+        j1c30 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i10, [track + ${dsize}x<1*$W*$N + 0*$N>];\n",
+        j1c32 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i11, [track + ${dsize}x<1*$W*$N + 1*$N>];\n",
+        j1c34 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i12, [track + ${dsize}x<1*$W*$N + 2*$N>];\n",
+        j1c36 => "--:-:-:-:1  \@P3 LDG.E.CI.$dtype i13, [track + ${dsize}x<1*$W*$N + 3*$N>];\n",
+        j1c38 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i10, RZ;\n",
+        j1c40 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i11, RZ;\n",
+        j1c42 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i12, RZ;\n",
+        j1c44 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i13, RZ;\n" .
+                 "--:-:-:-:1      R2P PR, preds, 0x0f;\n" .
+                 "--:-:-:-:1  \@P5 FADD I10, TI10, -TI12;\n" .
+                 "--:-:-:-:1  \@P5 FADD I20, TI20, -TI22;\n" .
+                 "--:-:-:-:1  \@P5 FADD I13, TI11, -TI13;\n" .
+                 "--:-:-:-:1  \@P5 FADD I23, TI21, -TI23;\n" .
+                 "--:-:-:-:1  \@P6 SHF.L.U64 preds, preds, 8, preds;\n",
+
+        j1c46 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 0)>], I10;\n",
+        j1c48 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 0)>], I20;\n",
+        j1c50 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 3)>], I13;\n",
+        j1c52 => "--:3:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 3)>], I23;\n",
+
+
+        j2c8  => "04:-:-:-:1  \@P5 FADD I21,  TI21, TI22;\n" .
+                 "--:-:-:-:1  \@P5 FADD I22, -TI21, TI22;\n",
+
+        j2c11 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 1)>], I21;\n",
+        j2c13 => "--:3:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 2)>], I22;\n",
+
+        j2c15 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i20, [track + ${dsize}x<2*$W*$N + 0*$N>];\n",
+        j2c17 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i21, [track + ${dsize}x<2*$W*$N + 1*$N>];\n",
+        j2c19 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i22, [track + ${dsize}x<2*$W*$N + 2*$N>];\n",
+        j2c21 => "--:6:2:-:1  \@P3 LDG.E.CI.$dtype i23, [track + ${dsize}x<2*$W*$N + 3*$N>];\n",
+        j2c23 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i20, RZ;\n",
+        j2c25 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i21, RZ;\n",
+        j2c27 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i22, RZ;\n",
+        j2c29 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i23, RZ;\n",
+
+        j2c30 => "04:-:-:-:1  \@P5 FADD I01,  TI01, TI02;\n" .
+                 "--:-:-:-:1  \@P5 FADD I02, -TI01, TI02;\n" .
+                 "--:-:-:-:1  \@P5 FADD I11,  TI11, TI12;\n" .
+                 "--:-:-:-:1  \@P5 FADD I12, -TI11, TI12;\n" .
+                 "--:-:-:-:1  \@P5 FADD I31,  TI31, TI32;\n" .
+                 "--:-:-:-:1  \@P5 FADD I32, -TI31, TI32;\n",
+
+        j2c31 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 1)>], I01;\n",
+        j2c33 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 2)>], I02;\n",
+        j2c35 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 1)>], I11;\n",
+        j2c37 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 2)>], I12;\n",
+        j2c39 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 1)>], I31;\n",
+        j2c41 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 2)>], I32;\n",
+
+        j2c62 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P5 LOP.XOR readIs, readIs, 4x<512*4*2>;\n" .
+                 "--:-:-:-:1  \@P5 LOP.XOR readFs, readFs, 4x<512*4*2>;\n" .
+                 "--:-:-:-:1  \@P5 LOP.XOR writeS, writeS, 4x<512*4*2>;\n",
+
+        j3c57 => "20:-:-:-:1  \@P6 IADD   track0.CC, track0, param_4HWNp;\n",
+        j3c62 => "--:-:-:-:1  \@P6 IADD.X track1,    track1, RZ;\n",
+
+        j3c63 => "--:-:-:Y:5  \@P5 BRA.U IMAGE_LOOP;\n" .
+                 "--:-:-:Y:5      BRA.U END_LOOP;\n",
+    );
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out;
+    foreach my $j (0 .. 3)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 4;
+        my $rsPred   = $j == 3 ? '@P5' : '   ';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+FILTER_LOOP:
+--:-:-:-:1      ISETP.GT.AND P0, PT, C, RZ, PT;
+[+
+    our ($dtype, $convert_in, $FX, $vsize, $dsize, $K); # package variables this block reads
+    my %insert = ( # key "jJcC" => text emitted right after FFMA C of unroll step J
+
+        j0c1  => "--:-:-:-:1      ISETP.GT.AND P1, PT, C, 4, PT;\n" .
+                 "--:-:-:-:1      IADD C, C, -4;\n",
+
+        $FX ? ( # entries used only when $FX is true
+            $convert_in ? ( # conversions added only when $convert_in is set
+                j1c8  => "02:-:-:-:1  \@P0 $convert_in F03, F01.H1;\n",
+                j1c12 => "--:-:-:-:1  \@P0 $convert_in F02, F01.H0;\n",
+                j1c16 => "--:-:-:-:1  \@P0 $convert_in F01, F00.H1;\n",
+                j1c20 => "--:-:2:-:1  \@P0 $convert_in F00, F00.H0;\n",
+
+                j1c26 => "04:-:-:-:1  \@P0 $convert_in F13, F11.H1;\n",
+                j1c30 => "--:-:-:-:1  \@P0 $convert_in F12, F11.H0;\n",
+                j1c34 => "--:-:-:-:1  \@P0 $convert_in F11, F10.H1;\n",
+                j1c38 => "--:-:3:-:1  \@P0 $convert_in F10, F10.H0;\n",
+
+                j2c8  => "08:-:-:-:1  \@P0 $convert_in F23, F21.H1;\n",
+                j2c12 => "--:-:-:-:1  \@P0 $convert_in F22, F21.H0;\n",
+                j2c16 => "--:-:-:-:1  \@P0 $convert_in F21, F20.H1;\n",
+                j2c20 => "--:-:4:-:1  \@P0 $convert_in F20, F20.H0;\n",
+
+                j2c26 => "10:-:-:-:1  \@P0 $convert_in F33, F31.H1;\n",
+                j2c30 => "--:-:-:-:1  \@P0 $convert_in F32, F31.H0;\n",
+                j2c34 => "--:-:-:-:1  \@P0 $convert_in F31, F30.H1;\n",
+                j2c38 => "--:6:5:-:1  \@P0 $convert_in F30, F30.H0;\n",
+            ) : (),
+
+            j1c22 => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 00*4>], F0;\n",
+            j1c24 => "02:-:2:-:1  \@P1 LDG.E.CG.$vsize F0, [track0 + 4x<00 * $dsize>];\n",
+
+            j1c40 => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 32*4>], F1;\n",
+            j1c42 => "04:-:3:-:1  \@P1 LDG.E.CG.$vsize F1, [track0 + 4x<32 * $dsize>];\n",
+
+            j2c22 => "08:4:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 64*4>], F2;\n",
+            j2c24 => "08:-:4:-:1  \@P1 LDG.E.CG.$vsize F2, [track0 + 4x<64 * $dsize>];\n",
+
+            j2c40 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 96*4>], F3;\n",
+            j2c42 => "10:6:5:-:1  \@P1 LDG.E.CG.$vsize F3, [track0 + 4x<96 * $dsize>];\n",
+
+            j3c57 => "20:-:-:-:1  \@P1 IADD   track0.CC, track0, 4x<32*16 * $dsize>;\n",
+            j3c62 => "--:-:-:-:1  \@P1 IADD.X track1,    track1, RZ;\n",
+
+        ) : ( # entries used only when $FX is false
+            $convert_in ? ( # conversions added only when $convert_in is set
+                j0c5  => "02:-:-:-:1      $convert_in f00, f00;\n",
+                j0c7  => "--:-:-:-:1      $convert_in f01, f01;\n",
+                j0c9  => "--:-:-:-:1      $convert_in f02, f02;\n",
+
+                j0c11 => "--:-:-:-:1      $convert_in f20, f20;\n",
+                j0c13 => "--:-:-:-:1      $convert_in f21, f21;\n",
+                j0c15 => "--:-:2:-:1      $convert_in f22, f22;\n",
+
+                j0c17 => "--:-:-:-:1      $convert_in f10, f10;\n",
+                j0c19 => "--:-:-:-:1      $convert_in f11, f11;\n",
+                j0c21 => "--:-:4:-:1      $convert_in f12, f12;\n",
+            ) : (),
+
+            j0c33 => "02:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 0)>], F00;\n",
+            j0c35 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 3)>], F03;\n",
+            j0c37 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 0)>], F30;\n",
+            j0c39 => "--:3:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 3)>], F33;\n",
+
+            j0c40 => "--:-:-:-:1  \@P0 FADD tb0, TF00, TF02;\n" .
+                     "--:-:-:-:1  \@P0 FADD tb3, TF30, TF32;\n" .
+                     "--:-:-:-:1  \@P0 FADD ta0, f00,  f20;\n" .
+                     "--:-:-:-:1  \@P0 FADD ta1, f01,  f21;\n" .
+                     "--:-:-:-:1  \@P0 FADD ta2, f02,  f22;\n",
+
+            j0c41 => "--:-:-:-:1  \@P0 FMUL tb0, tb0, 0.5;\n" .
+                     "--:-:-:-:1  \@P0 FMUL tb3, tb3, 0.5;\n" .
+                     "--:-:-:-:1  \@P0 FMUL ta0, ta0, 0.5;\n" .
+                     "--:-:-:-:1  \@P0 FMUL ta1, ta1, 0.5;\n" .
+                     "--:-:-:-:1  \@P0 FMUL ta2, ta2, 0.5;\n",
+
+            j0c42 => "--:-:-:-:1  \@P0 FFMA F01, TF01,  0.5, tb0;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F02, TF01, -0.5, tb0;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F31, TF31,  0.5, tb3;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F32, TF31, -0.5, tb3;\n",
+
+            j0c45 => "04:-:-:-:1  \@P1 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];\n",
+            j0c47 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];\n",
+            j0c49 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];\n",
+
+            j0c51 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];\n",
+            j0c53 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];\n",
+            j0c55 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>];\n",
+
+            j1c8  => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 1)>], F01;\n",
+            j1c10 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 2)>], F02;\n",
+            j1c12 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 1)>], F31;\n",
+            j1c14 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 2)>], F32;\n",
+
+            j1c15 => "08:-:-:-:1  \@P0 FFMA TF10, f10,  0.5, ta0;\n" .
+                     "--:-:-:-:1  \@P0 FFMA TF20, f10, -0.5, ta0;\n" .
+                     "--:-:-:-:1  \@P0 FFMA TF11, f11,  0.5, ta1;\n" .
+                     "--:-:-:-:1  \@P0 FFMA TF21, f11, -0.5, ta1;\n" .
+                     "--:-:-:-:1  \@P0 FFMA TF12, f12,  0.5, ta2;\n" .
+                     "--:-:-:-:1  \@P0 FFMA TF22, f12, -0.5, ta2;\n",
+
+            j1c16 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];\n",
+            j1c18 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];\n",
+            j1c20 => "--:6:2:-:1  \@P1 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];\n",
+
+            j1c22 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 0)>], F10;\n",
+            j1c24 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 0)>], F20;\n",
+            j1c26 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 3)>], F13;\n",
+            j1c28 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 3)>], F23;\n",
+
+            j1c29 => "--:-:-:-:1  \@P0 FADD tb1, TF10, TF12;\n" .
+                     "--:-:-:-:1  \@P0 FADD tb2, TF20, TF22;\n",
+
+            j1c34 => "--:-:-:-:1  \@P0 FMUL tb1, tb1, 0.5;\n" .
+                     "--:-:-:-:1  \@P0 FMUL tb2, tb2, 0.5;\n",
+
+            j1c39 => "--:-:-:-:1  \@P0 FFMA F11, TF11,  0.5, tb1;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F12, TF11, -0.5, tb1;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F21, TF21,  0.5, tb2;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F22, TF21, -0.5, tb2;\n",
+
+            j2c8  => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 1)>], F11;\n",
+            j2c10 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 2)>], F12;\n",
+            j2c12 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 1)>], F21;\n",
+            j2c14 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 2)>], F22;\n",
+
+
+            j3c57 => "20:-:-:-:1  \@P1 IADD   track0.CC, track0, param_4RSKp;\n",
+            j3c62 => "--:-:-:-:1  \@P1 IADD.X track1,    track1, RZ;\n",
+        ),
+
+        j2c62 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<512*4*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<512*4*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<512*4*2>;\n",
+
+        j3c63 => "--:-:-:Y:5  \@P0 BRA.U FILTER_LOOP;\n",
+    );
+    my @cOrder; # 64 [x,y] accumulator coordinates, in the order the FFMAs are emitted
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]); # [dx,dy] offsets applied around each base (x,y)
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y; # walk the y bases in the opposite direction for the next x
+    }
+    my $out; # assembly text built up here and returned at the end
+    foreach my $j (0 .. 3)
+    {
+        my $odd      = $j & 1; # register set (j0* or j1*) read by this step's FFMAs
+        my $nOdd     = !$odd + 0; # the other register set, filled by this step's LDS inserts
+        my $rsOffset = ($j + 1) % 4; # slice index used in the LDS addresses
+        my $rsPred   = $j == 3 ? '@P0' : '   '; # only the last step's loads are predicated on P0
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || ''; # text placed right after this FFMA, if any
+
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1; # stall 0 only when the FIRST inserted line is one of these ops (same test as the IMAGE_LOOP generator)
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-'; # yield flag only on FFMA 32, and only when it keeps its stall
+
+            my $wait   = $c == 0 ? '01' : '--'; # first FFMA of a step waits on barrier 1, which the c6 LDS insert sets
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+END_LOOP:
+--:-:1:-:1      S2R tid,   SR_TID.X;
+--:-:2:-:1      S2R idx_N, SR_CTAID.Z;
+--:-:3:-:1      S2R idx_K, SR_CTAID.Y;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      LOP.AND tid_31, tid, 31;
+
+--:-:-:-:1      BFE idx_n, idx_nkpq, 0x041c;
+--:-:-:-:1      BFE idx_k, idx_nkpq, 0x0418;
+--:-:-:-:1      BFE idx_P, idx_nkpq, 0x0c0c;
+--:-:-:-:1      BFE idx_Q, idx_nkpq, 0x0c00;
+
+02:-:-:-:1      XMAD idx_N, idx_N, param_n, idx_n;
+04:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+[+
+    our $bsum; return $bsum ? q{
+--:-:-:-:1      XMAD      bsum_offset, idx_Q, param_gridN,   idx_N;
+--:-:-:-:1      XMAD.LO2C bsum_offset, idx_P, param_gridQN,  bsum_offset;
+    } : '';
++]
+
+// p = idx_P << param_shiftP
+// q = idx_Q << param_shiftQ
+--:-:-:-:1      SHL p, idx_P, param_shiftP;
+--:-:-:-:1      SHL q, idx_Q, param_shiftQ;
+
+// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp
+--:-:-:-:1      BFE.U32 superP, tid, param_superP;
+--:-:-:-:1      BFE.U32 superQ, tid, param_superQ;
+--:-:-:-:1      ISCADD p, superP,  p, 1;
+--:-:-:-:1      ISCADD q, superQ,  q, 1;
+
+
+--:-:-:-:1      LOP.AND superN, tid, param_superN;
+--:-:-:-:1      SHL  n, idx_N, param_shiftN;
+--:-:-:-:1      IADD n, n, superN;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV32I one, 1.0;
+
+// readFs = ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid_1,  tid,    1;
+--:-:-:-:1      LOP.AND  readFs, tid,    8;
+--:-:-:-:1      SHR.U32  readFs, readFs, 2;
+--:-:-:-:1      LOP.OR   readFs, readFs,  tid_1;
+//--:-:-:-:1      SHL      readFs, readFs, 3;
+
+// readIs = ((((tid & -16) >> 1) | ((tid >> 1) & 3)) + (readFs << 2)) << 4
+--:-:-:-:1      LOP.AND  tid_16, tid,   -16;
+--:-:-:-:1      SHR.U32  tid_16, tid_16, 1;
+--:-:-:-:1      BFE.U32  readIs, tid,    0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs, readIs, tid_16;
+--:-:-:-:1      ISCADD   readIs, readFs, readIs, 2;
+--:-:-:-:1      SHL      readIs, readIs, 4;
+
+// writeCs = (readFs << 12) + readIs
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 12;
+
+// readCs = ((tid32 << 9) + tid_31 + (tid64 << 4)) << 2, with tid32 = tid >> 5 and tid64 = tid >> 6
+--:-:-:-:1      SHR.U32 tid32, tid,  5;
+--:-:-:-:1      SHR.U32 tid64, tid,  6;
+--:-:-:-:1      ISCADD  readCs, tid32, tid_31, 9;
+--:-:-:-:1      ISCADD  readCs, tid64, readCs, 4;
+--:-:-:-:1      SHL     readCs, readCs, 2;
+
+// k = idx_K*32 + (tid32 << 1)
+--:-:-:-:1      SHL tid32, tid32, 1;
+--:-:-:-:1      ISCADD  k, idx_K, tid32, 5;
+
+// Out00 = k*PQN + p*QN + q*N + n
+// Out01 = Out00 + N
+// Out10 = Out00 + QN
+// Out11 = Out01 + QN
+--:-:-:-:1      XMAD      out_offset, q, 1x<$N>,    n;
+--:-:-:-:1      XMAD.LO2C out_offset, p, param_QN,  out_offset;
+--:-:-:-:1      XMAD.LO2C out_offset, k, param_PQN, out_offset;
+
+
+--:-:-:-:1      MOV  PQN15, param_PQN;
+--:-:-:-:1      SHL  PQN15, PQN15, 4;
+--:-:-:-:1      IADD PQN15, PQN15, -param_PQN;
+
+--:-:-:-:1      IADD q2, q, 1;
+--:-:-:-:1      IADD p2, p, 1;
+
+
+--:-:-:-:1      ISETP.EQ.AND P6, PT, RZ, param_flags, PT; // P6 = (param_flags == 0): ! no-op
+--:-:-:-:1      ISETP.LT.AND P6, PT, n,  1x<$N>,  P6; // P6 = P6 && n < N
+--:-:-:-:1      ISETP.LT.AND P2, PT, p,  param_P, PT; // p0 < P (P6 is combined in by the PSETPs below)
+--:-:-:-:1      ISETP.LT.AND P3, PT, q,  1x<$Q>,  PT; // q0 < Q (P6 is combined in by the PSETPs below)
+--:-:-:-:1      ISETP.LT.AND P4, PT, p2, param_P, PT; // p1 < P (P6 is combined in by the PSETPs below)
+--:-:-:-:1      ISETP.LT.AND P5, PT, q2, 1x<$Q>,  PT; // q1 < Q (P6 is combined in by the PSETPs below)
+
+--:-:-:-:1      PSETP.AND.AND P0, PT, P2, P3, P6; // p0 && q0 && P6
+--:-:-:-:1      PSETP.AND.AND P1, PT, P2, P5, P6; // p0 && q1 && P6
+--:-:-:-:1      PSETP.AND.AND P2, PT, P4, P3, P6; // p1 && q0 && P6
+--:-:-:-:1      PSETP.AND.AND P3, PT, P4, P5, P6; // p1 && q1 && P6
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f; // keep P0-P3 in preds for the R2P in OUTPUT_TRANSFORM
+
+--:-:-:-:1      ISETP.EQ.AND P6, PT, tid_31, RZ, PT; // tid31 == 0
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha; // pass 1 of 4: the y0 and y2 accumulators, scaled by alpha
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y2, alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, cx3y2, alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y2, alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, cx7y2, alpha;
+
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 1;
+--:-:-:-:0      IADD out_offset, out_offset, param_PQN;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y1, alpha; // pass 2 of 4: the y1 and y3 accumulators
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y1, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y1, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 15; // k steps by 1, 15, 1 between the four passes
+--:-:-:-:0      IADD out_offset, out_offset, PQN15; // PQN15 was computed in END_LOOP as 16 * param_PQN - param_PQN
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y4, alpha; // pass 3 of 4: the y4 and y6 accumulators
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y4, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y4, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y6, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 1;
+--:-:-:-:0      IADD out_offset, out_offset, param_PQN;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y5, alpha; // pass 4 of 4: the y5 and y7 accumulators
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y5, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y5, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+
+--:-:-:-:5      EXIT;
+
+OUTPUT_TRANSFORM:
+
+
+<SCHEDULE_BLOCK>
+11:-:-:-:1      ISETP.LT.AND P4, PT, k, 1x<$K>, PT; // k < K
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f; // k < K: reload P0-P3 from the bits saved in preds
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f; // otherwise clear P0-P3
+[+
+    our ($beta, $brelu, $bprelu, $dsize, $dshift, $dtype, $Q, $N); # package variables read here and interpolated into the qq text
+    return $beta || $brelu || $bprelu ? qq{
+--:-:-:-:1      LEA      Out0.CC, out_offset, param_X[0],     $dshift;
+--:-:-:-:1      LEA.HI.X Out1,    out_offset, param_X[1], RZ, $dshift;
+
+--:-:-:-:1 \@!P0 MOV b00, RZ;
+--:-:-:-:1 \@!P1 MOV b01, RZ;
+--:-:-:-:1 \@!P2 MOV b10, RZ;
+--:-:-:-:1 \@!P3 MOV b11, RZ;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype b00, [Out + ${dsize}x<0*$Q*$N + 0*$N>];
+--:-:5:-:1  \@P1 LDG.E.CI.$dtype b01, [Out + ${dsize}x<0*$Q*$N + 1*$N>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype b10, [Out + ${dsize}x<1*$Q*$N + 0*$N>];
+--:-:6:-:1  \@P3 LDG.E.CI.$dtype b11, [Out + ${dsize}x<1*$Q*$N + 1*$N>];
+</ORDERED>
+    } : '';
++]
+[+
+    our $bias; return $bias ? q{
+// sum = S + k
+20:-:-:-:1      LEA      Sum0.CC, k, param_S[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    k, param_S[1], RZ, 2;
+
+--:-:-:-:1 @!P4 MOV b00, RZ;
+--:-:5:-:1  @P4 LDG.E.CI b00, [Sum];
+    } : '';
++]
+</SCHEDULE_BLOCK>
+
+--:-:-:-:1      LDS m00, [readCs + 4x< 0*32>];
+--:-:-:-:1      LDS m01, [readCs + 4x< 1*32>];
+--:-:-:-:1      LDS m02, [readCs + 4x< 2*32>];
+--:-:1:Y:1      LDS m03, [readCs + 4x< 3*32>];
+
+--:-:-:-:1      LDS m10, [readCs + 4x< 4*32>];
+--:-:-:-:1      LDS m11, [readCs + 4x< 5*32>];
+--:-:-:-:1      LDS m12, [readCs + 4x< 6*32>];
+--:-:2:Y:1      LDS m13, [readCs + 4x< 7*32>];
+
+--:-:-:-:1      LDS m20, [readCs + 4x< 8*32>];
+--:-:-:-:1      LDS m21, [readCs + 4x< 9*32>];
+--:-:-:-:1      LDS m22, [readCs + 4x<10*32>];
+--:-:3:Y:1      LDS m23, [readCs + 4x<11*32>];
+
+--:-:-:-:1      LDS m30, [readCs + 4x<12*32>];
+--:-:-:-:1      LDS m31, [readCs + 4x<13*32>];
+--:-:-:-:1      LDS m32, [readCs + 4x<14*32>];
+--:-:4:Y:1      LDS m33, [readCs + 4x<15*32>];
+
+<SCHEDULE_BLOCK>
+// t00 = m00+m01+m02;
+// t01 = m01-m02-m03;
+01:-:-:-:1      FADD t00, m00,  m01;
+--:-:-:-:1      FADD t00, t00,  m02;
+--:-:-:-:1      FADD t01, m01, -m02;
+--:-:-:-:1      FADD t01, t01, -m03;
+// t10 = m10+m11+m12;
+// t11 = m11-m12-m13;
+02:-:-:-:1      FADD t10, m10,  m11;
+--:-:-:-:1      FADD t10, t10,  m12;
+--:-:-:-:1      FADD t11, m11, -m12;
+--:-:-:-:1      FADD t11, t11, -m13;
+// t20 = m20+m21+m22;
+// t21 = m21-m22-m23;
+04:-:-:-:1      FADD t20, m20,  m21;
+--:-:-:-:1      FADD t20, t20,  m22;
+--:-:-:-:1      FADD t21, m21, -m22;
+--:-:-:-:1      FADD t21, t21, -m23;
+// t30 = m30+m31+m32;
+// t31 = m31-m32-m33;
+08:-:-:-:1      FADD t30, m30,  m31;
+--:-:-:-:1      FADD t30, t30,  m32;
+--:-:-:-:1      FADD t31, m31, -m32;
+--:-:-:-:1      FADD t31, t31, -m33;
+// s00 = t00+t10+t20;
+// s01 = t01+t11+t21;
+--:-:-:-:1      FADD s00, t00,  t10;
+--:-:-:-:1      FADD s00, s00,  t20;
+--:-:-:-:1      FADD s01, t01,  t11;
+--:-:-:-:1      FADD s01, s01,  t21;
+// s10 = t10-t20-t30;
+// s11 = t11-t21-t31;
+--:-:-:-:1      FADD s10, t10, -t20;
+--:-:-:-:1      FADD s10, s10, -t30;
+--:-:-:-:1      FADD s11, t11, -t21;
+--:-:-:-:3      FADD s11, s11, -t31;
+
+[+
+    our $bias; return $bias ? q{
+10:-:-:-:1  @P0 FADD s00, s00, b00;
+--:-:-:-:1  @P1 FADD s01, s01, b00;
+--:-:-:-:1  @P2 FADD s10, s10, b00;
+--:-:-:-:1  @P3 FADD s11, s11, b00;
+    } : '';
++]
+[+
+    our $relu; return $relu ? q{
+// maximum(x, 0) + slope * minimum(0, x)
+--:-:-:-:1  @P0 FMNMX s00, s00, RZ, !PT;
+--:-:-:-:1  @P1 FMNMX s01, s01, RZ, !PT;
+--:-:-:-:1  @P2 FMNMX s10, s10, RZ, !PT;
+--:-:-:-:1  @P3 FMNMX s11, s11, RZ, !PT;
+    } : '';
++]
+[+
+    our $prelu; return $prelu ? q{
+// maximum(x, 0) + slope * minimum(0, x)
+--:-:-:-:1  @P0 FMNMX b00, s00, RZ, !PT;
+--:-:-:-:1  @P1 FMNMX b01, s01, RZ, !PT;
+--:-:-:-:1  @P2 FMNMX b10, s10, RZ, !PT;
+--:-:-:-:1  @P3 FMNMX b11, s11, RZ, !PT;
+
+--:-:-:-:1  @P0 FMNMX x00, s00, RZ, PT;
+--:-:-:-:1  @P1 FMNMX x01, s01, RZ, PT;
+--:-:-:-:1  @P2 FMNMX x10, s10, RZ, PT;
+--:-:-:-:1  @P3 FMNMX x11, s11, RZ, PT;
+
+--:-:-:-:1  @P0 FFMA s00, x00, param_beta, b00;
+--:-:-:-:1  @P1 FFMA s01, x01, param_beta, b01;
+--:-:-:-:1  @P2 FFMA s10, x10, param_beta, b10;
+--:-:-:-:1  @P3 FFMA s11, x11, param_beta, b11;
+    } : '';
++]
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    our ($beta, $brelu, $bprelu, $convert_in); # text is emitted only when $convert_in and one of the three b-value users are set
+    return $convert_in && ($beta || $brelu || $bprelu) ? qq{
+10:-:1:-:1  \@P0 $convert_in b00, b00;
+--:-:2:-:1  \@P1 $convert_in b01, b01;
+20:-:3:-:1  \@P2 $convert_in b10, b10;
+--:-:4:-:1  \@P3 $convert_in b11, b11;
+    } : '';
++]
+[+
+    our $beta; return $beta ? q{
+11:-:-:-:1  @P0 FFMA s00, b00, param_beta, s00;
+02:-:-:-:1  @P1 FFMA s01, b01, param_beta, s01;
+24:-:-:-:1  @P2 FFMA s10, b10, param_beta, s10;
+08:-:-:-:1  @P3 FFMA s11, b11, param_beta, s11;
+    } : '';
++]
+[+
+    our $brelu; return $brelu ? q{
+//delta *= x > 0
+11:-:-:-:1      FSETP.GT.AND P0, PT, b00, RZ, PT;
+02:-:-:-:1      FSETP.GT.AND P1, PT, b01, RZ, PT;
+24:-:-:-:1      FSETP.GT.AND P2, PT, b10, RZ, PT;
+08:-:-:-:1      FSETP.GT.AND P3, PT, b11, RZ, PT;
+--:-:-:-:1 @!P0 MOV s00, RZ;
+--:-:-:-:1 @!P1 MOV s01, RZ;
+--:-:-:-:1 @!P2 MOV s10, RZ;
+--:-:-:-:1 @!P3 MOV s11, RZ;
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:5 @!P4 R2P PR, RZ, 0x0f;
+    } : '';
++]
+[+
+    our $bprelu; return $bprelu ? q{
+//delta *= ((x > 0) + slope * (x < 0))
+11:-:-:-:1      FSETP.GT.AND P0, PT, b00, RZ, PT;
+02:-:-:-:1      FSETP.GT.AND P1, PT, b01, RZ, PT;
+24:-:-:-:1      FSETP.GT.AND P2, PT, b10, RZ, PT;
+08:-:-:-:1      FSETP.GT.AND P3, PT, b11, RZ, PT;
+--:-:-:-:1      SEL x00, one, RZ, P0;
+--:-:-:-:1      SEL x01, one, RZ, P1;
+--:-:-:-:1      SEL x10, one, RZ, P2;
+--:-:-:-:1      SEL x11, one, RZ, P3;
+--:-:-:-:1      FSETP.LT.AND P0, PT, b00, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P1, PT, b01, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, b10, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P3, PT, b11, RZ, PT;
+--:-:-:-:1      SEL b00, one, RZ, P0;
+--:-:-:-:1      SEL b01, one, RZ, P1;
+--:-:-:-:1      SEL b10, one, RZ, P2;
+--:-:-:-:1      SEL b11, one, RZ, P3;
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+--:-:-:-:1      FFMA b00, b00, param_beta, x00;
+--:-:-:-:1      FFMA b01, b01, param_beta, x01;
+--:-:-:-:1      FFMA b10, b10, param_beta, x10;
+--:-:-:-:1      FFMA b11, b11, param_beta, x11;
+--:-:-:-:1      FMUL s00, s00, b00;
+--:-:-:-:1      FMUL s01, s01, b01;
+--:-:-:-:1      FMUL s10, s10, b10;
+--:-:-:-:1      FMUL s11, s11, b11;
+    } : '';
++]
+[+
+    our $bsum; return $bsum ? q{
+--:-:-:-:1      MOV sum0, RZ;
+--:-:-:-:1  @P0 FADD sum0, s00, sum0;
+--:-:-:-:1  @P1 FADD sum0, s01, sum0;
+--:-:-:-:1  @P2 FADD sum0, s10, sum0;
+--:-:-:-:1  @P3 FADD sum0, s11, sum0;
+    } : '';
++]
+</SCHEDULE_BLOCK>
+
+[+
+    our $convert_out; # when set, names the conversion instruction applied to s00..s11 before the store
+    return $convert_out ? qq{
+--:-:1:-:1      $convert_out s00, s00;
+--:-:2:-:1      $convert_out s01, s01;
+--:-:3:-:1      $convert_out s10, s10;
+--:-:4:-:1      $convert_out s11, s11;
+    } : '';
++]
+
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LEA      Out0.CC, out_offset, param_O[0],     [+ dshift() +];
+--:-:-:-:1      LEA.HI.X Out1,    out_offset, param_O[1], RZ, [+ dshift() +];
+
+// Stores are predicated on P0-P3, which the R2P at the top of OUTPUT_TRANSFORM set from preds only when k < K
+01:-:-:-:1  @P0 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<0*$Q*$N + 0*$N>], s00;
+02:-:-:-:1  @P1 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<0*$Q*$N + 1*$N>], s01;
+04:-:-:-:1  @P2 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<1*$Q*$N + 0*$N>], s10;
+08:1:-:-:1  @P3 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<1*$Q*$N + 1*$N>], s11;
+</SCHEDULE_BLOCK>
+
+[+
+    our $bsum; # when set, the schedule block below is emitted - otherwise this template expands to nothing
+    return $bsum ? q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      XMAD.LO2C b00, k, param_gridPQN, bsum_offset;
+
+--:-:-:-:1      LEA      Sum0.CC, b00, param_S[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    b00, param_S[1], RZ, 2;
+
+--:-:-:-:1      PSETP.AND.AND P5, PT, P4, P6, PT; // k < K && tid31 == 0
+
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  1, 0x1f;
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  2, 0x1f;
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  4, 0x1f;
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  8, 0x1f;
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 16, 0x1f;
+10:-:-:-:2      FADD sum0, sum1, sum0;
+
+--:5:-:-:1  @P5 STG.E.CG [Sum], sum0;
+</SCHEDULE_BLOCK>
+    } : ''; # emitted code: five SHFL.BFLY + FADD steps (lane offsets 1, 2, 4, 8, 16) fold sum0, then one store to Sum under P5
++]
+
+--:-:-:-:5      RET;
+
+
+
+//     T0 = np.empty((4,4))
+//     T1 = np.empty((4,4))
+//
+//     for O, I in ((T0, I), (T1, T0.T)):
+//
+//         O[0,:] = I[0,:] - I[2,:]
+//         O[1,:] = I[1,:] + I[2,:]
+//         O[2,:] = I[2,:] - I[1,:]
+//         O[3,:] = I[1,:] - I[3,:]
+//
+//     Iw[:] = T1.T
+//
+// 0  = i00
+// 1  = i01
+// 2  = i02
+// 3  = i03
+// 4  = i30
+// 5  = i31
+// 6  = i32
+// 7  = i33
+// 8  = i13
+// 9  = i12
+// 10 = i11
+// 11 = i10
+// 12 = i23, TI23, I23
+// 13 = i22, TI22
+// 14 = i21, TI21
+// 15 = i20, TI20, I20
+// 16 = TI00, I00, TI10, I10, I21, I01
+// 17 = TI01, I11
+// 18 = TI02, I12
+// 19 = TI03, I03, TI11, I31
+// 20 = TI30, I30, TI12, I32
+// 21 = TI31
+// 22 = TI32
+// 23 = TI33, I33, TI13, I13, I22, I02
+//
+//
+// TI00 = i00 - i20
+// TI01 = i01 - i21
+// TI02 = i02 - i22
+// TI03 = i03 - i23
+// # load 0
+//
+// TI30 = i10 - i30
+// TI31 = i11 - i31
+// TI32 = i12 - i32
+// TI33 = i13 - i33
+// # load 3
+//
+// I00 = TI00 - TI02
+// I03 = TI01 - TI03
+// I30 = TI30 - TI32
+// I33 = TI31 - TI33
+// # store 0
+//
+// # wait 0
+// TI10 = i10 + i20
+// TI11 = i11 + i21
+// TI12 = i12 + i22
+// TI13 = i13 + i23
+//
+// TI20 = i20 - i10
+// TI21 = i21 - i11
+// TI22 = i22 - i12
+// TI23 = i23 - i13
+//
+// #load 1
+//
+// I10 = TI10 - TI12
+// I20 = TI20 - TI22
+// I13 = TI11 - TI13
+// I23 = TI21 - TI23
+// # store 1
+//
+// # wait 1
+// I21 = TI21 + TI22
+// I22 = TI22 - TI21
+// # store 2
+//
+// # load 2
+//
+// # wait 2
+// I01 = TI01 + TI02
+// I02 = TI02 - TI01
+// I11 = TI11 + TI12
+// I12 = TI12 - TI11
+// I31 = TI31 + TI32
+// I32 = TI32 - TI31
+// #store 3
+
+
+
+//     T0 = np.empty((4,3))
+//     T1 = np.empty((4,4))
+//
+//     for O, I in ((T0, F), (T1, T0.T)):
+//
+//         t0 = (I[0,:] + I[2,:])*0.5
+//
+//         O[0,:] = I[0,:]
+//         O[1,:] = t0 + I[1,:]*0.5
+//         O[2,:] = t0 - I[1,:]*0.5
+//         O[3,:] = I[2,:]
+//
+//     Fw[:] = T1.T
+//
+// 0  = f00, TF00, F00
+// 1  = f01, TF01
+// 2  = f02, TF02, F03
+// 3  = f10
+// 4  = f11
+// 5  = f12
+// 6  = f20, TF30, F30
+// 7  = f21, TF31
+// 8  = f22, TF32, F33
+// 9  = tb3, F32
+// 10 = tb0, F02
+// 11 = ta2, TF22, F23
+// 12 = ta0, TF20, F20
+// 13 = ta1, TF21
+// 14 = F01
+// 15 = F31
+// 16 = TF10, F10
+// 17 = TF11
+// 18 = TF12, F13
+// 19 = tb1, F12
+// 20 = tb2, F22
+// 21 = F11
+// 22 = F21
+// 23 =
+//
+//
+// TF00 = f00
+// TF01 = f01
+// TF02 = f02
+// TF30 = f20
+// TF31 = f21
+// TF32 = f22
+//
+// F00 = TF00
+// F03 = TF02
+// F30 = TF30
+// F33 = TF32
+//
+// # store 0
+//
+// tb0 = TF00 + TF02
+// tb3 = TF30 + TF32
+// ta0 = f00 + f20
+// ta1 = f01 + f21
+// ta2 = f02 + f22
+//
+// tb0 = tb0 * 0.5
+// tb3 = tb3 * 0.5
+// ta0 = ta0 * 0.5
+// ta1 = ta1 * 0.5
+// ta2 = ta2 * 0.5
+//
+// F01 = tb0 + TF01*0.5
+// F02 = tb0 - TF01*0.5
+// F31 = tb3 + TF31*0.5
+// F32 = tb3 - TF31*0.5
+//
+// # wait 0
+// # load 0, 2
+// # store 1
+//
+// TF10 = ta0 + f10*0.5
+// TF20 = ta0 - f10*0.5
+// TF11 = ta1 + f11*0.5
+// TF21 = ta1 - f11*0.5
+// TF12 = ta2 + f12*0.5
+// TF22 = ta2 - f12*0.5
+//
+// # load 1
+//
+// F10 = TF10
+// F20 = TF20
+// F13 = TF12
+// F23 = TF22
+//
+// # store 2
+//
+// tb1 = TF10 + TF12
+// tb2 = TF20 + TF22
+// tb1 = tb1 * 0.5
+// tb2 = tb2 * 0.5
+//
+// F11 = tb1 + TF11*0.5
+// F12 = tb1 - TF11*0.5
+// F21 = tb2 + TF21*0.5
+// F22 = tb2 - TF21*0.5
+//
+// # store 3
\ No newline at end of file
diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_2x2_5x5_32x32.sass b/Kernel/Convolution/Maxwell/xconv_winograd_2x2_5x5_32x32.sass
new file mode 100644
index 0000000..0fcb767
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/xconv_winograd_2x2_5x5_32x32.sass
@@ -0,0 +1,1589 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+# Per-precision settings. $type is only declared here (never assigned), so it must be set before this block runs.
+our $type;
+our ($dtype, $convert_in, $convert_out, $dshift, $dsize, $vsize) =
+    $type eq 'h'
+        ? ('U16', 'F2F.F32.F16', 'F2F.F16.F32', '1', '2',  '64')   # 'h': 16-bit variants
+        : ( '32',            '',            '', '2', '4', '128');  # anything else: 32-bit variants
+# Accessors called from the inline template expansions later in this file.
+sub dtype  { $dtype  }
+sub dsize  { $dsize  }
+sub dshift { $dshift }
+sub vsize  { $vsize  }
+-]
+
+<CONSTANT_MAPPING>
+
+    addr_zero   : 4x<32*36*2*4 + 64 + 0>
+    addr_idx_Y  : 4x<32*36*2*4 + 64 + 4>
+    addr_idx_X  : 4x<32*36*2*4 + 64 + 5>
+    addr_idx_K  : 4x<32*36*2*4 + 64 + 6>
+
+    param_O[0]           : c[0x0][0x140]
+    param_O[1]           : c[0x0][0x144]
+    param_I[0]           : c[0x0][0x148]
+    param_I[1]           : c[0x0][0x14c]
+    param_F[0]           : c[0x0][0x150]
+    param_F[1]           : c[0x0][0x154]
+    param_alpha          : c[0x0][0x158]
+    param_flags          : c[0x0][0x15c]
+    param_C              : c[0x0][0x160]
+    param_K              : c[0x0][0x164]
+    param_N              : c[0x0][0x168]
+    param_H              : c[0x0][0x16c]
+    param_W              : c[0x0][0x170]
+    param_HWN            : c[0x0][0x174]
+    param_WN             : c[0x0][0x178]
+    param_Y2             : c[0x0][0x17c]
+    param_GX             : c[0x0][0x180]
+    param_Xk             : c[0x0][0x184]
+    param_k              : c[0x0][0x188]
+    param_magic_Xk       : c[0x0][0x18c]
+    param_shift_Xk       : c[0x0][0x190]
+    param_magic_k        : c[0x0][0x194]
+    param_shift_k        : c[0x0][0x198]
+    param_P              : c[0x0][0x19c]
+    param_Q              : c[0x0][0x1a0]
+    param_QN             : c[0x0][0x1a4]
+    param_PQN            : c[0x0][0x1a8]
+    param_PQNp           : c[0x0][0x1ac]
+    param_PQN15p         : c[0x0][0x1b0]
+    param_shiftY         : c[0x0][0x1b4]
+    param_shiftX         : c[0x0][0x1b8]
+    param_shiftN         : c[0x0][0x1bc]
+    param_superY         : c[0x0][0x1c0]
+    param_superX         : c[0x0][0x1c4]
+    param_superN         : c[0x0][0x1c8]
+    param_SuperY         : c[0x0][0x1cc]
+    param_SuperX         : c[0x0][0x1d0]
+    param_SuperN         : c[0x0][0x1d4]
+    param_pad_x          : c[0x0][0x1d8]
+    param_pad_y          : c[0x0][0x1dc]
+    param_HWN2p          : c[0x0][0x1e0]
+    param_C_1152         : c[0x0][0x1e4]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+       0-63 : czero<00-63>
+
+      // Image Transform
+      52 = i00, TI00, I00
+      53 = i10, TI50, I50
+      54 = i01, TI01, I05
+      55 = i11, TI51, I55
+      56 = TI10, I10
+      57 = TI20, I20
+      58 = TI30, I30
+      59 = TI40, I40
+      60 = TI41, I45
+      61 = TI31, I35
+      62 = TI21, I25
+      63 = TI11, I15
+      64-67 : I0<1-4>
+      68-71 : I5<1-4>
+      72-75 : I1<1-4>
+      76-79 : I2<1-4>
+      80-83 : I3<1-4>
+      84-87 : I4<1-4>
+
+      // Filter Transform
+      52-87 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3>, F4<0-3>, F5<0-3>, F6<0-3>, F7<0-3>, F8<0-3>
+
+      // Load Loop Registers
+     3, 2,11,10 : clx<0-3>y0
+     7, 6,15,14 : clx<0-3>y1
+     1, 0, 9, 8 : clx<0-3>y2
+     5, 4,13,12 : clx<0-3>y3
+    19,18,27,26 : clx<0-3>y4
+    23,22,31,30 : clx<0-3>y5
+    17,16,25,24 : clx<0-3>y6
+    21,20,29,28 : clx<0-3>y7
+
+      32-43 : jl0Ix<0-3>, jl0Fy<0-7>
+      44-51 : jl1Ix<0-3>, jl1Fy<4-7>
+      36-39 : jl1Fy<0-3>
+
+      32-51 ~ partialC, c, idx_K, idx_Y, idx_X, idx_N, tid31, gx, gy, offset, nn, x1, x2, y1, mask_x
+      52-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, idx_Y2, idx_X2, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, super_x, super_y
+         87 = tid
+
+     // Compute Loop Registers
+     3, 2,11,10,19,18,27,26 : ccx<0-7>y0
+     7, 6,15,14,23,22,31,30 : ccx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2
+     5, 4,13,12,21,20,29,28 : ccx<0-7>y3
+    35,34,43,42,51,50,59,58 : ccx<0-7>y4
+    39,38,47,46,55,54,63,62 : ccx<0-7>y5
+    33,32,41,40,49,48,57,56 : ccx<0-7>y6
+    37,36,45,44,53,52,61,60 : ccx<0-7>y7
+
+      64-79 : jc0Ix<0-7>, jc0Fy<0-7>
+      80-91 : jc1Ix<4-7>, jc1Fy<0-7>
+      64-67 : jc1Ix<0-3>
+
+      64-86 ~ tid16, tid_1, tid128
+
+      // Shared Registers
+      88-89 : track<0-1>
+      92-95 ~ C, swapBuf, readFs, readIs
+      90-91 ~ writeS, preds
+
+      // Load Loop Finish
+      32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1
+      48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16
+
+      // Compute Loop Finish
+      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1
+      64-87 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, four, z<1-5>, mask_q, offsetO, sign
+      90-95 ~ writeCs, readCs, k, pred30, pred36, tid31_4
+      88-89 : Out<0-1>
+
+      3, 2,11,10,19,18 : m<0-5>0
+     27, 1,26, 0, 9, 8 : m<0-5>1
+     16,17,24,25,64,65 : m<0-5>2
+     66,67,68,69,70,71 : m<0-5>3
+     72,73,74,75,76,77 : m<0-5>4
+     78,79,80,81,82,83 : m<0-5>5
+
+      3, 2,11,10,19,18 : w<0-5>0
+     27, 1,26, 0, 9, 8 : w<0-5>1
+     16,17,24,25,64,65 : w<0-5>2
+     66,67,68,69,70,71 : w<0-5>3
+     72,73,74,75,76,77 : w<0-5>4
+     78,79,80,81,82,83 : w<0-5>5
+
+      3, 2,11,10,19,18 : s<0-5>0
+     27, 1,26, 0, 9, 8 : s<0-5>1
+     16,17,24,25,64,65 : s<0-5>2
+     66,67,68,69,70,71 : s<0-5>3
+     72,73,74,75,76,77 : s<0-5>4
+     78,79,80,81,82,83 : s<0-5>5
+
+           85,84,86,87 : t<0-3>0
+           85,87,84,86 : t<0-3>1
+           85,84,87,86 : t<0-3>2
+           85,84,87,86 : t<0-3>3
+           85,84,87,86 : t<0-3>4
+           85,84,87,86 : t<0-3>5
+           85,84,87,86 : r0<0-3>
+           85,84,87,86 : r1<0-3>
+           85,87,86,84 : r2<0-3>
+           84,85,86,87 : r3<0-3>
+           85,84,87,86 : r4<0-3>
+           84,85,87,86 : r5<0-3>
+
+</REGISTER_MAPPING>
+
+--:-:-:-:0      MOV C,   param_C; // loop counter, decremented by 2 per loop iteration
+--:-:1:-:1      S2R tid, SR_TID.X;
+--:-:-:-:1      MOV swapBuf, 4x<32*36*2*2>; // buffer-swap offset: added to the shared-memory pointers, then negated on every swap
+01:-:-:-:0      ISETP.GE.AND P0, PT, tid, 128, PT; // P0 = (tid >= 128)
+--:-:-:-:1      STS.128 [addr_zero], RZ; // 16 bytes of zero at addr_zero, read back later to clear the czero registers
+--:-:-:Y:c      LOP.AND partialC, C, 1; // partialC = C & 1
+--:-:-:-:0      IADD C, C, partialC; // round C up to an even value
+--:-:-:-:5  @P0 BRA.U COMPUTE_SETUP; // tid >= 128 goes to COMPUTE_SETUP, the other threads fall through to LOAD_SETUP
+
+##############################################################
+LOAD_SETUP:
+// Block indices: CTAID.X -> idx_YXk, CTAID.Y -> idx_K, CTAID.Z -> idx_N.
+--:-:1:-:1      S2R idx_YXk, SR_CTAID.X;
+--:-:2:-:1      S2R idx_K,   SR_CTAID.Y;
+--:-:3:-:1      S2R idx_N,   SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+// Clear registers 0-31 (the clx load-loop accumulators): 8 LDS.U.128 loads of the zero block into czero00, czero04, ..., czero28.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +]
+
+--:-:-:-:1      ISETP.EQ.AND P0, PT, tid, RZ, PT; // P0 = (tid == 0)
+--:-:-:-:1      ISETP.GE.AND P1, PT, tid, 64, PT; // P1 = (tid >= 64)
+
+// idx_Y2   = idx_YXk / blk_Xk   (multiply by magic_Xk then shift, or a plain shift when magic_Xk == 1)
+--:-:-:-:1      MOV  magic_Xk, param_magic_Xk;
+--:-:-:-:1      IADD negXk, RZ, -param_Xk;
+--:-:-:-:1      ISETP.NE.AND P3, PT, magic_Xk, 1, PT;
+01:-:-:-:1  @P3 XMAD     div1, idx_YXk,    magic_Xk,    RZ;
+--:-:-:-:1  @P3 XMAD     div2, idx_YXk,    magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD     div3, idx_YXk.H1, magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk,    div1;
+--:-:-:-:1  @P3 IADD3.RS idx_Y2, div1, div2, div3;
+--:-:-:-:1  @P3 SHR.U32  idx_Y2, idx_Y2,  param_shift_Xk;
+--:-:-:-:1 @!P3 SHR.U32  idx_Y2, idx_YXk, param_shift_Xk;
+
+// idx_Xk  = idx_YXk % blk_Xk
+--:-:-:-:1      XMAD.LO2 idx_Xk, negXk, idx_Y2, idx_YXk;
+
+// idx_X2   = idx_Xk / blk_k
+// idx_k   = idx_Xk % blk_k
+--:-:-:-:1      XMAD    idx_X2,  idx_Xk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32 idx_X2,  idx_X2, param_shift_k;
+--:-:-:-:1      XMAD    idx_k,   idx_X2, param_k, RZ;
+--:-:-:-:1      IADD    idx_k,  -idx_k,  idx_Xk;
+
+// idx_K = idx_K * blk_k + idx_k
+02:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+//--:-:-:-:1      MOV idx_X, idx_X2;
+//--:-:-:-:1      MOV idx_Y, idx_Y2;
+
+// gx = x2
+// gy = y2 * 2
+--:-:-:-:1      MOV idx_X, idx_X2;
+--:-:-:-:1      SHL idx_Y, idx_Y2, 1;
+
+// Implement a square wave block id remapping (for all but last row (if odd number of rows))
+// if y2 != Y2:
+//     gy += (gx&1) ^ ((gx&2)>>1)
+//     gx /= 2
+--:-:-:-:1      ISETP.NE.AND P4, PT, idx_Y2, param_Y2, PT;
+--:-:-:-:1  @P4 LOP.AND x1, idx_X, 1;
+--:-:-:-:1  @P4 BFE.U32 x2, idx_X, 0x101; // 1 bit at position 1
+--:-:-:-:1  @P4 LOP.XOR x1, x1, x2;
+--:-:-:-:1  @P4 IADD idx_Y, idx_Y, x1;
+--:-:-:-:1  @P4 SHR.U32 idx_X, idx_X, 1;
+
+// Scan backwards on odd rows
+// if y2 & 1:
+//     gx = gridX - gx - 1
+--:-:-:-:1      LOP.AND.NZ P5, RZ, idx_Y2, 1;
+--:-:-:-:1  @P5 IADD idx_X, -idx_X,  param_GX;
+--:-:-:-:1  @P5 IADD idx_X,  idx_X, -1;
+// Thread 0 (P0) writes idx_Y, idx_X and idx_K to their shared-memory slots.
+--:-:-:-:1  @P0 STS [addr_idx_Y], idx_Y;
+--:-:-:-:1  @P0 STS [addr_idx_X], idx_X;
+--:-:-:-:1  @P0 STS [addr_idx_K], idx_K;
+
+// x = gx << shiftX
+// y = gy << shiftY
+--:-:-:-:1      SHL gx, idx_X, param_shiftX;
+--:-:-:-:1      SHL gy, idx_Y, param_shiftY;
+
+// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp
+--:-:-:-:1      BFE.U32 super_x, tid, param_superX;
+--:-:-:-:1      BFE.U32 super_y, tid, param_superY;
+--:-:-:-:1      ISCADD gx, super_x,  gx, 1;
+--:-:-:-:1      ISCADD gy, super_y,  gy, 1;
+// tid32_2 = (tid & -32) >> 2
+--:-:-:-:1      LOP.AND  tid32_2,  tid,   -32;
+--:-:-:-:1      SHR.U32  tid32_2,  tid32_2, 2;
+
+// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7)
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid32_2;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      IADD3   readFs, readFs, tid1, tid32_2;
+--:-:-:-:1      ISCADD  readFs, readFs, 4x<32*36*2>, 4;
+
+// c = (tid & 32) >> 5
+--:-:-:-:1      BFE.U32 c, tid, 0x105; // 1 bit at position 5
+
+// P6 = (c == 1) && (c == partialC), i.e. c == partialC == 1
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, 1, PT;
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, partialC, P6;
+
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+</SCHEDULE_BLOCK>
+
+04:-:-:-:5  @P1 BRA.U FILTER_SETUP; // tid >= 64 goes to FILTER_SETUP, tid < 64 falls through to IMAGE_SETUP
+
+##############################################################
+IMAGE_SETUP:
+
+<SCHEDULE_BLOCK>
+
+// writeS = (c*32*36 + tid31) * 4
+--:-:-:-:1      XMAD writeS, c, 1152, tid31;
+--:-:-:-:1      SHL  writeS, writeS, 2;
+// Store zeros at writeS + 128*k bytes for k = 0..3.
+--:-:-:-:1      STS [writeS + 4x<32*0>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*1>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*2>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*3>], RZ;
+
+// n = (idx_N << shiftN) + (tid & superN)
+--:-:-:-:1      SHL idx_N, idx_N, param_shiftN;
+--:-:-:-:1      LOP.AND nn, tid,  param_superN;
+--:-:-:-:1      IADD    nn, nn, idx_N;
+
+// P4 = n < N
+--:-:-:-:1      ISETP.LT.AND P4, PT, nn, param_N, PT;
+
+// offset = c*HWN + gy*WN + gx*N + n
+--:-:-:-:1      XMAD.S16.U16      offset, gx, param_N,   nn;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, gy, param_WN,  offset;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, c,  param_HWN, offset;
+
+--:-:-:-:1      LEA      track0.CC, offset, param_I[0],     [+ dshift() +]; // track = param_I + (offset << dshift), low word
+--:-:-:-:1      LEA.HI.X track1,    offset, param_I[1], RZ, [+ dshift() +]; // high word of the same address
+// P0 = 0 <= gx < W and P1 = 0 <= gx+1 < W, packed into bits 0-1 of mask_x.
+--:-:-:-:1      IADD x1, gx, 1;
+--:-:-:-:1      ISETP.LT.AND P0, PT, gx, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_W, PT;
+--:-:-:-:1      ISETP.GE.AND P0, PT, gx, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x3;
+// P2 = P4 && 0 <= gy < H and P3 = P4 && 0 <= gy+1 < H.
+--:-:-:-:1      IADD y1, gy, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, gy, param_H, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y1, param_H, P4;
+--:-:-:-:1      ISETP.GE.AND P2, PT, gy, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, y1, RZ, P3;
+// preds bits 0-1 = mask_x when P2 (else 0), and when P3 mask_x is also inserted as 2 bits at position 2.
+--:-:-:-:1      SEL preds, mask_x, RZ, P2;
+--:-:-:-:1  @P3 BFI preds, mask_x, 0x202, preds;
+
+// partialC = partialC * HWN, shifted left by dshift (subtracted from track after the barrier below).
+--:-:-:-:1      XMAD partialC, partialC, param_HWN, RZ;
+--:-:-:-:1      SHL  partialC, partialC, [+ dshift() +];
+// P0..P3 = preds bits 0..3, or all cleared for threads with P6 set (those threads load nothing).
+--:-:-:-:1 @!P6 R2P PR, preds, 0xf;
+--:-:-:-:1  @P6 R2P PR,    RZ, 0xf;
+// For each of i00, i10, i01, i11: zero the register when its predicate is off, otherwise load it through track.
+<ORDERED>
+--:-:-:-:1 @!P0 MOV i00, RZ;
+--:-:2:-:1  @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>];
+--:-:-:-:1 @!P2 MOV i10, RZ;
+--:-:3:-:1  @P2 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>];
+--:-:-:-:1 @!P1 MOV i01, RZ;
+--:-:4:-:1  @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>];
+--:-:-:-:1 @!P3 MOV i11, RZ;
+--:6:5:-:1  @P3 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>];
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+20:-:-:-:0      IADD   track0.CC, track0, -partialC; // track -= partialC (low word), wait mask 20 = barrier 6 set by the last LDG above
+
+--:-:-:-:1      LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:1:-:1      LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];
+
+--:-:-:-:1      IADD   writeS,    writeS,  swapBuf; // next stores go to the other buffer
+--:-:-:-:2      IADD   swapBuf,       RZ, -swapBuf; // flip the sign for the next swap
+--:-:-:-:0      IADD.X track1,    track1, -RZ; // high word of track -= partialC (consumes the carry from track0.CC)
+
+--:-:-:-:5      BRA.U IMAGE_LOOP;
+
+##############################################################
+FILTER_SETUP:
+
+<SCHEDULE_BLOCK>
+
+// writeS = (c*32*36 + (tid & 31)*4 + 32*36*2)*4
+--:-:-:-:1      ISCADD writeS, tid31, 4x<32*36*2>, 4;
+--:-:-:-:1      XMAD   writeS, c, 4x<32*36>, writeS;
+
+--:-:-:-:1      STS.128 [writeS], RZ; // zero the 16 bytes at writeS
+
+// offset = c*32*36 + tid31*4
+--:-:-:-:1      SHL tid31, tid31, 2;
+--:-:-:-:1      XMAD offset, c, 1x<32*36>, tid31;
+
+// (kBlks,C,6,6,32)  - presumably the filter data layout, confirm against the host code
+// offset += idx_K * param_C_1152 (presumably C*32*36), counted in elements - the LEA shift below applies the item size
+--:-:-:-:1      XMAD.LO2C offset, idx_K, param_C_1152, offset;
+--:-:-:-:1      LEA      track0.CC, offset, param_F[0],     [+ dshift() +];
+--:-:-:-:1      LEA.HI.X track1,    offset, param_F[1], RZ, [+ dshift() +];
+
+--:-:-:-:1      XMAD partialC,  partialC, 1x<32*36 * $dsize>, RZ; // partialC *= 32*36*dsize
+// F0..F8 are loaded through track when P6 is clear, otherwise filled from the zero block at addr_zero.
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F0, [track + 4x<0*32 * $dsize>];
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F1, [track + 4x<1*32 * $dsize>];
+--:-:2:-:1 @!P6 LDG.E.[+ vsize() +] F2, [track + 4x<2*32 * $dsize>];
+
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F0, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F1, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F2, [addr_zero];
+
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F3, [track + 4x<3*32 * $dsize>];
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F4, [track + 4x<4*32 * $dsize>];
+--:-:3:-:1 @!P6 LDG.E.[+ vsize() +] F5, [track + 4x<5*32 * $dsize>];
+
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F3, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F4, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F5, [addr_zero];
+
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F6, [track + 4x<6*32 * $dsize>];
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F7, [track + 4x<7*32 * $dsize>];
+--:6:4:-:1 @!P6 LDG.E.[+ vsize() +] F8, [track + 4x<8*32 * $dsize>];
+
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F6, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F7, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F8, [addr_zero];
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+20:-:-:-:0      IADD   track0.CC, track0, -partialC; // track -= partialC (low word), wait mask 20 = barrier 6 set by the F8 load above
+
+--:-:-:-:1      LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:1:-:1      LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];
+
+--:-:-:-:1      IADD   writeS,    writeS,  swapBuf; // next stores go to the other buffer
+--:-:-:-:2      IADD   swapBuf,       RZ, -swapBuf; // flip the sign for the next swap
+--:-:-:-:0      IADD.X track1,    track1, -RZ; // high word of track -= partialC (consumes the carry from track0.CC)
+
+--:-:-:-:5      BRA.U FILTER_LOOP;
+
+##############################################################
+
+COMPUTE_SETUP:
+
+<SCHEDULE_BLOCK>
+// Clear registers 0-63 (the ccx accumulators): 16 LDS.U.128 loads of the zero block into czero00, czero04, ..., czero60.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+--:-:-:-:1      IADD tid128, tid, -128; // tid128 = tid - 128, used as "tid" in the formulas below
+
+// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readFs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid16,  tid128, -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,   1;
+
+--:-:-:-:1      BFE.U32  readIs, tid128, 0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs, readIs, tid16;
+--:-:-:-:1      ISCADD   readIs, readIs, 4x<32*4 + 32*36*2*2>, 4; // readIs = readIs*16 + 4*(32*4 + 32*36*2*2)
+
+--:-:-:-:1      LOP.AND  tid_1,  tid128, 1;
+--:-:-:-:1      LOP.AND  readFs, tid128, 8;
+--:-:-:-:1      SHR.U32  readFs, readFs, 2;
+--:-:-:-:1      IADD3    readFs, readFs, tid16, tid_1;
+--:-:-:-:0      ISCADD   readFs, readFs, 4x<32*4 + 32*36*2*3>, 4; // readFs = readFs*16 + 4*(32*4 + 32*36*2*3)
+</SCHEDULE_BLOCK>
+// Presumably pairs with the BAR.SYNC 0 at the end of IMAGE_SETUP and FILTER_SETUP - confirm.
+--:-:-:-:5      BAR.SYNC 0;
+
+// Let Load loop run once to transform initial load and store to shared.
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>];
+--:-:1:-:2      LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>];
+
+COMPUTE_LOOP:
+[+
+    my %insert = (
+        # Extra instruction text appended right after FFMA number c of pass j, keyed "jNcM".
+        j0c33 => "--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;\n" .
+                 "--:-:-:-:1      IADD C, C, -2;\n", # P0 = (C > 2), then C -= 2
+
+        j0c62 => "02:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD swapBuf, RZ,    -swapBuf;\n", # barrier, then move both read pointers to the other buffer
+
+        j1c63 => "--:-:-:Y:5  \@P0 BRA.U COMPUTE_LOOP;\n" .
+                 "--:-:-:Y:5      BRA.U COMPUTE_FINISH;\n", # loop again while P0, otherwise go to COMPUTE_FINISH
+    );
+    my @cOrder; # FFMA issue order as 64 index pairs, each x and y in 0..7
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]); # four offsets added to every base pair
+    my @y = (0,1,4,5); # base y values, reversed after each x step so the walk alternates direction
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        my $odd    = $j; # operand register set read by this pass (jc0 or jc1)
+        my $nOdd   = 1 - $j; # the other set, loaded during this pass
+        my $rsPred = $j == 1 ? '@P0' : '   '; # loads issued in pass 1 are predicated on P0
+        my $bar    = $j == 0 ? '2' : '-'; # pass 0 puts 2 in the read-barrier field of the c31 load (waited on by the 02 mask of j0c62)
+        # Loads of the other operand set, spread over FFMA slots 0, 2, 4 and 31 of this pass.
+        $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd;
+
+        $insert{"j${j}c31"} = sprintf "--:%s:1:-:1  %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd;
+
+        # Emit the 64 FFMAs of this pass in cOrder, each followed by its insert (if any).
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Stall 0 when the first inserted line contains one of the listed mnemonics, otherwise stall 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            #$stall = '4' if $stall && $c % 2 == 0 && $j == 0 && $c > 16;
+            # Yield flag on every fifth FFMA, but only when the stall is 1.
+            my $yield  = ($c % 5 == 0) && $stall ? 'Y' : '-';
+            # The first FFMA of each pass carries wait mask 01.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+
+IMAGE_LOOP:
+--:-:-:-:1      ISETP.GT.AND P6, PT, C,  2, PT; // P6 = (C > 2)
+[+
+    our ($dtype, $dsize, $convert_in, $W, $N); # package globals, interpolated into the LDG strings below
+    my %insert = (
+        # Extra instruction text appended right after FFMA number c of pass j, keyed "jNcM".
+        j0c0  => "--:-:-:-:1      ISETP.GT.AND P5, PT, C, RZ, PT;\n" .
+                 "--:-:-:-:1      IADD C, C, -2;\n", # P5 = (C > 0), then C -= 2
+
+        $convert_in ? ( # only when $convert_in is set: F2F.F32.F16 in place on the four loaded values
+            j0c1  => "02:-:2:-:1      F2F.F32.F16 i00, i00;\n",
+            j0c2  => "04:-:3:-:1      F2F.F32.F16 i10, i10;\n",
+            j0c3  => "08:-:4:-:1      F2F.F32.F16 i01, i01;\n",
+            j0c4  => "10:-:5:-:1      F2F.F32.F16 i11, i11;\n",
+        ) : (),
+
+        j0c5  => "02:-:-:-:1      STS [writeS + 4x<32*(0*6 + 0)>], I00;\n",
+        j0c6  => "04:-:-:-:1      STS [writeS + 4x<32*(5*6 + 0)>], I50;\n",
+
+        j0c7  => "--:-:-:-:1      FFMA TI10, i10,  0.75, i00;\n" .
+                 "--:-:-:-:1      FFMA TI20, i10, -0.75, i00;\n" .
+                 "--:-:-:-:1      FFMA TI30, i10,  1.50, i00;\n" .
+                 "--:-:-:-:1      FFMA TI40, i10, -1.50, i00;\n" .
+                 "--:-:-:-:1      IADD track0.CC, track0, param_HWN2p;\n" .
+                 "--:-:-:-:1 @!P6 MOV preds, RZ;\n",
+
+        j0c8  => "08:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*6 + 5)>], I05;\n",
+        j0c9  => "10:6:-:-:1  \@P5 STS [writeS + 4x<32*(5*6 + 5)>], I55;\n",
+
+        j0c10 => "--:-:-:-:0      FFMA TI11, i11,  0.75, i01;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 0)>], I10;\n" .
+                 "--:-:-:-:0      FFMA TI21, i11, -0.75, i01;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 0)>], I20;\n" .
+                 "--:-:-:-:0      FFMA TI31, i11,  1.50, i01;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 0)>], I30;\n" .
+                 "--:-:-:-:0      FFMA TI41, i11, -1.50, i01;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 0)>], I40;\n" .
+                 "--:-:-:-:1      R2P PR, preds, 0xf;\n" .
+                 "--:-:-:-:1      IADD.X track1, track1, RZ;\n",
+
+        j0c11 => "--:-:-:-:1      LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n",
+        j0c13 => "--:-:-:-:1      LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n",
+        j0c19 => "--:-:1:-:1      LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n",
+
+        j0c14 => "--:-:-:-:0      FFMA I01, TI01,  0.75, TI00;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 5)>], I15;\n" .
+                 "--:-:-:-:0      FFMA I02, TI01, -0.75, TI00;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 5)>], I25;\n" .
+                 "--:-:-:-:0      FFMA I03, TI01,  1.50, TI00;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 5)>], I35;\n" .
+                 "--:-:-:-:0      FFMA I04, TI01, -1.50, TI00;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 5)>], I45;\n",
+
+        j0c15 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*6 + 1)>], I01;\n",
+        j0c16 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*6 + 2)>], I02;\n",
+        j0c17 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*6 + 3)>], I03;\n",
+        j0c18 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*6 + 4)>], I04;\n",
+
+        j0c20 => "--:-:-:-:1      FFMA I51, TI51,  0.75, TI50;\n" .
+                 "--:-:-:-:1      FFMA I52, TI51, -0.75, TI50;\n" .
+                 "--:-:-:-:1      FFMA I53, TI51,  1.50, TI50;\n" .
+                 "--:-:-:-:1      FFMA I54, TI51, -1.50, TI50;\n",
+
+        j0c21 => "20:-:2:-:1  \@P0 LDG.E.CI.$dtype i00, [track + ${dsize}x<0*$W*$N + 0*$N>];\n",
+        j0c22 => "--:-:3:-:1  \@P2 LDG.E.CI.$dtype i10, [track + ${dsize}x<1*$W*$N + 0*$N>];\n",
+        j0c23 => "--:-:4:-:1  \@P1 LDG.E.CI.$dtype i01, [track + ${dsize}x<0*$W*$N + 1*$N>];\n",
+        j0c24 => "--:-:5:-:1  \@P3 LDG.E.CI.$dtype i11, [track + ${dsize}x<1*$W*$N + 1*$N>];\n",
+
+        j0c25 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(5*6 + 1)>], I51;\n",
+        j0c26 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(5*6 + 2)>], I52;\n",
+        j0c27 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(5*6 + 3)>], I53;\n",
+        j0c28 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(5*6 + 4)>], I54;\n",
+
+        j0c29 => "--:-:-:-:1      FFMA I11, TI11,  0.75, TI10;\n" .
+                 "--:-:-:-:1      FFMA I12, TI11, -0.75, TI10;\n" .
+                 "--:-:-:-:1      FFMA I13, TI11,  1.50, TI10;\n" .
+                 "--:-:-:-:1      FFMA I14, TI11, -1.50, TI10;\n",
+
+        j0c30 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 1)>], I11;\n",
+        j0c31 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 2)>], I12;\n",
+        j1c0  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 3)>], I13;\n",
+        j1c1  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 4)>], I14;\n",
+
+        j1c2  => "--:-:-:-:1      FFMA I21, TI21,  0.75, TI20;\n" .
+                 "--:-:-:-:1      FFMA I22, TI21, -0.75, TI20;\n" .
+                 "--:-:-:-:1      FFMA I23, TI21,  1.50, TI20;\n" .
+                 "--:-:-:-:1      FFMA I24, TI21, -1.50, TI20;\n",
+
+        j1c3  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 1)>], I21;\n",
+        j1c4  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 2)>], I22;\n",
+        j1c5  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 3)>], I23;\n",
+        j1c6  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 4)>], I24;\n",
+
+        j1c7  => "--:-:-:-:1      FFMA I31, TI31,  0.75, TI30;\n" .
+                 "--:-:-:-:1      FFMA I32, TI31, -0.75, TI30;\n" .
+                 "--:-:-:-:1      FFMA I33, TI31,  1.50, TI30;\n" .
+                 "--:-:-:-:1      FFMA I34, TI31, -1.50, TI30;\n",
+
+        j1c8  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 1)>], I31;\n",
+        j1c9  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 2)>], I32;\n",
+        j1c10 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 3)>], I33;\n",
+        j1c11 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 4)>], I34;\n",
+
+        j1c12 => "--:-:-:-:1      FFMA I41, TI41,  0.75, TI40;\n" .
+                 "--:-:-:-:1      FFMA I42, TI41, -0.75, TI40;\n" .
+                 "--:-:-:-:1      FFMA I43, TI41,  1.50, TI40;\n" .
+                 "--:-:-:-:1      FFMA I44, TI41, -1.50, TI40;\n",
+
+        j1c13 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 1)>], I41;\n",
+        j1c14 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 2)>], I42;\n",
+        j1c15 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 3)>], I43;\n",
+        j1c16 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 4)>], I44;\n",
+
+        j1c17 => "--:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P5 IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P5 IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P5 IADD writeS, writeS,  swapBuf;\n" .
+                 "--:-:-:-:1  \@P5 IADD swapBuf, RZ,    -swapBuf;\n", # barrier, then (under P5) swap the read and write buffers
+
+        j1c18 => "--:-:-:-:1  \@P5 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n",
+        j1c20 => "--:-:-:-:1  \@P5 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n",
+        j1c22 => "--:-:1:-:1  \@P5 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n",
+
+        j1c31 => "--:-:-:Y:5  \@P5 BRA.U IMAGE_LOOP;\n" .
+                 "--:-:-:Y:5      BRA.U LOAD_FINISH;", # loop again while P5, otherwise go to LOAD_FINISH
+
+    );
+    # FFMA issue order: 8 base pairs times 4 swirl offsets = 32 index pairs, x in 0..3 and y in 0..7.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
+    {
+        my ($x, $y) = @$xy;
+        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # The first FFMA of each pass carries wait mask 01.
+            my $wait   = $c == 0 ? "01" : '--';
+            # Stall 0 when the first inserted line contains one of the listed mnemonics, otherwise stall 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:-:$stall";
+            # One FFMA into clx accumulator (x, y) from operand set j, followed by its insert (if any).
+            $out .= sprintf "%s      FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+
+FILTER_LOOP:
+// Loop header. P0 = (C > RZ) and P1 = (C > 2) are both evaluated before C is
+// decremented by 2 below. track0 advances by 32*36*2 * $dsize with carry-out
+// (.CC); the matching IADD.X on track1 is emitted by the generated loop body.
+--:-:-:-:1      ISETP.GT.AND P0, PT, C, RZ, PT;
+20:-:-:-:1      IADD track0.CC, track0, 1x<32*36*2 * $dsize>;
+--:-:-:-:1      ISETP.GT.AND P1, PT, C, 2, PT;
+--:-:-:-:1      IADD C, C, -2;
+[+
+    # Generates the FILTER_LOOP body: 2 j-steps x 32 FFMA instructions, with
+    # the memory / convert / barrier / branch instructions from %insert
+    # spliced in after specific FFMAs.
+    our ($vsize, $dsize, $convert_in);
+
+    # Key "j<J>c<C>": the text is emitted right after FFMA number C of step J.
+    my %insert = (
+
+        j0c3 => "--:-:-:-:1      IADD.X track1, track1, RZ;\n",
+
+        j0c0  => "--:-:-:-:1      LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n",
+        j0c2  => "--:-:-:-:1      LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n",
+        j0c18 => "--:-:1:-:1      LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n",
+
+        j1c12 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n",
+        j1c14 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n",
+        j1c16 => "--:-:1:-:1  \@P0 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n",
+
+        # Two alternative schedules: with F2F.F32.F16 conversion of the loaded
+        # data ($convert_in true) or plain store / load only.
+        $convert_in ? (
+
+            j0c1  => "02:-:-:-:1      F2F.F32.F16 F03, F01.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F02, F01.H0;\n",
+            j0c4  => "--:-:-:-:1      F2F.F32.F16 F01, F00.H1;\n" .
+                     "--:-:2:-:1      F2F.F32.F16 F00, F00.H0;\n",
+
+            j0c5  => "--:-:-:-:1      F2F.F32.F16 F13, F11.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F12, F11.H0;\n",
+            j0c6  => "--:-:-:-:1      F2F.F32.F16 F11, F10.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 F10, F10.H0;\n",
+
+            j0c7  => "--:-:-:-:1      F2F.F32.F16 F23, F21.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F22, F21.H0;\n",
+            j0c8  => "--:-:-:-:1      F2F.F32.F16 F21, F20.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 F20, F20.H0;\n",
+
+            j0c9  => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*32*4>], F0;\n",
+            j0c10 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], F1;\n",
+            j0c11 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], F2;\n",
+
+            j0c13 => "02:-:-:-:1  \@P1 LDG.E.$vsize F0, [track + 4x<0*32 * $dsize>];\n",
+            j0c14 => "10:-:-:-:1  \@P1 LDG.E.$vsize F1, [track + 4x<1*32 * $dsize>];\n",
+            j0c15 => "20:-:2:-:1  \@P1 LDG.E.$vsize F2, [track + 4x<2*32 * $dsize>];\n",
+
+            j0c16 => "04:-:-:-:1      F2F.F32.F16 F33, F31.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F32, F31.H0;\n",
+            j0c17 => "--:-:-:-:1      F2F.F32.F16 F31, F30.H1;\n" .
+                     "--:-:3:-:1      F2F.F32.F16 F30, F30.H0;\n",
+
+            j0c19 => "--:-:-:-:1      F2F.F32.F16 F43, F41.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F42, F41.H0;\n",
+            j0c20 => "--:-:-:-:1      F2F.F32.F16 F41, F40.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 F40, F40.H0;\n",
+
+            j0c21 => "--:-:-:-:1      F2F.F32.F16 F53, F51.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F52, F51.H0;\n",
+            j0c22 => "--:-:-:-:1      F2F.F32.F16 F51, F50.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 F50, F50.H0;\n",
+
+            j0c23 => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], F3;\n",
+            j0c24 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], F4;\n",
+            j0c25 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], F5;\n",
+
+            j0c27 => "04:-:-:-:1  \@P1 LDG.E.$vsize F3, [track + 4x<3*32 * $dsize>];\n",
+            j0c28 => "10:-:-:-:1  \@P1 LDG.E.$vsize F4, [track + 4x<4*32 * $dsize>];\n",
+            j0c29 => "20:-:3:-:1  \@P1 LDG.E.$vsize F5, [track + 4x<5*32 * $dsize>];\n",
+
+            j0c30 => "08:-:-:-:1      F2F.F32.F16 F63, F61.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F62, F61.H0;\n",
+            j0c31 => "--:-:-:-:1      F2F.F32.F16 F61, F60.H1;\n" .
+                     "--:-:4:-:1      F2F.F32.F16 F60, F60.H0;\n",
+
+            j1c0  => "--:-:-:-:1      F2F.F32.F16 F73, F71.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F72, F71.H0;\n",
+            j1c1  => "--:-:-:-:1      F2F.F32.F16 F71, F70.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 F70, F70.H0;\n",
+
+            j1c2  => "--:-:-:-:1      F2F.F32.F16 F83, F81.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F82, F81.H0;\n",
+            j1c3  => "--:-:-:-:1      F2F.F32.F16 F81, F80.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 F80, F80.H0;\n",
+
+            j1c4  => "08:4:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], F6;\n",
+            j1c5  => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], F7;\n",
+            j1c6  => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], F8;\n",
+
+            j1c8  => "08:-:-:-:1  \@P1 LDG.E.$vsize F6, [track + 4x<6*32 * $dsize>];\n",
+            j1c9  => "10:-:-:-:1  \@P1 LDG.E.$vsize F7, [track + 4x<7*32 * $dsize>];\n",
+            j1c10 => "20:6:4:-:1  \@P1 LDG.E.$vsize F8, [track + 4x<8*32 * $dsize>];\n",
+
+        ) : (
+
+            # NOTE(review): this first store carries no @P0 guard, unlike every
+            # other STS.128 in both branches -- confirm that is intentional.
+            j0c6  => "02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], F0;\n",
+            j0c8  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], F1;\n",
+            j0c10 => "--:2:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], F2;\n",
+
+            j0c12 => "02:-:-:-:1  \@P1 LDG.E.$vsize F0, [track + 4x<0*32 * $dsize>];\n",
+            j0c14 => "--:-:-:-:1  \@P1 LDG.E.$vsize F1, [track + 4x<1*32 * $dsize>];\n",
+            j0c16 => "--:-:2:-:1  \@P1 LDG.E.$vsize F2, [track + 4x<2*32 * $dsize>];\n",
+
+            j0c20 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], F3;\n",
+            j0c22 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], F4;\n",
+            j0c24 => "--:3:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], F5;\n",
+
+            j0c26 => "04:-:-:-:1  \@P1 LDG.E.$vsize F3, [track + 4x<3*32 * $dsize>];\n",
+            j0c28 => "--:-:-:-:1  \@P1 LDG.E.$vsize F4, [track + 4x<4*32 * $dsize>];\n",
+            j0c30 => "--:-:3:-:1  \@P1 LDG.E.$vsize F5, [track + 4x<5*32 * $dsize>];\n",
+
+            j1c0  => "08:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], F6;\n",
+            j1c2  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], F7;\n",
+            j1c4  => "--:4:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], F8;\n",
+
+            j1c6  => "08:-:-:-:1  \@P1 LDG.E.$vsize F6, [track + 4x<6*32 * $dsize>];\n",
+            j1c8  => "--:-:-:-:1  \@P1 LDG.E.$vsize F7, [track + 4x<7*32 * $dsize>];\n",
+            j1c10 => "--:6:4:-:1  \@P1 LDG.E.$vsize F8, [track + 4x<8*32 * $dsize>];\n",
+        ),
+
+        j1c11 => "--:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeS, writeS,  swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,    -swapBuf;\n",
+
+        j1c31 => "--:-:-:Y:5  \@P0 BRA.U FILTER_LOOP;\n",
+    );
+
+    # Visit order of the 32 accumulators: each of the eight base coordinates
+    # is expanded by the four swirl offsets into an [x, y] pair.
+    my @swirl  = ([0,2],[1,2],[1,0],[0,0]);
+    my @bases  = ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]);
+    my @cOrder = map {
+        my ($bx, $by) = @$_;
+        map { [$bx + $_->[0], $by + $_->[1]] } @swirl;
+    } @bases;
+
+    # An insert whose first line contains one of these mnemonics gives the
+    # FFMA in front of it a stall count of 0 instead of 1.
+    my $zero_stall = qr/^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/;
+
+    my $asm = '';
+    for my $j (0 .. 1)
+    {
+        for my $c (0 .. 31)
+        {
+            my ($x, $y) = @{ $cOrder[$c] };
+            my $extra   = $insert{"j${j}c$c"} || '';
+
+            # Only the first FFMA of each j-step gets the 01 first field.
+            my $wait  = $c ? '--' : "01";
+            my $stall = $extra =~ $zero_stall ? 0 : 1;
+
+            $asm .= sprintf "%s      FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s",
+                "$wait:-:-:-:$stall", $x, $y, $j, $x, $j, $y, $x, $y, $extra;
+        }
+    }
+    return $asm;
++]
+
+LOAD_FINISH:
+// Epilogue reached from the loader loop: derives this thread's shared-memory
+// store address (write16Cs) from SR_TID.X, then in four rounds scales eight
+// clx*y* accumulators by alpha16 (= param_alpha) and stores them with two
+// STS.128 instructions per round, separated by BAR.SYNC 0, before EXIT.
+
+//--:-:-:-:5      EXIT;
+
+
+--:-:1:-:2      S2R Tid, SR_TID.X;
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha16, param_alpha;
+
+// Tid32_2 = (tid & -32) >> 2
+01:-:-:-:1      LOP.AND  Tid32_2,  Tid,    -32;
+--:-:-:-:1      SHR.U32  Tid32_2,  Tid32_2, 2;
+
+// readFs = ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND Tid1,   Tid,    1;
+01:-:-:-:1      LOP.AND readFs, Tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      IADD    readFs, readFs, Tid1;
+
+// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readFs << 2)
+--:-:-:-:1      BFE.U32 readIs, Tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, Tid32_2;
+--:-:-:-:1      ISCADD  readIs, readFs, readIs, 2;
+
+// Scale the indices: readIs <<= 4, readFs <<= 3
+--:-:-:-:1      SHL readIs, readIs, 4;
+--:-:-:-:1      SHL readFs, readFs, 3;
+
+// write16Cs = readFs * 32*36 + readIs;
+--:-:-:-:1      XMAD write16Cs, readFs, 1x<32*36>, readIs;
+</SCHEDULE_BLOCK>
+
+// Round 1 of 4: accumulator columns y0 and y2.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y2, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y2, alpha16;
+--:-:-:-:4      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+// Round 2 of 4: accumulator columns y1 and y3.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y3, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y3, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+// Round 3 of 4: accumulator columns y4 and y6.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y6, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y6, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+// Round 4 of 4: accumulator columns y5 and y7.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y7, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y7, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      EXIT;
+
+
+COMPUTE_FINISH:
+// Epilogue setup: computes the shared-memory store address (writeCs) for
+// every thread, then -- unless P6 (tid_128 >= 256) branches to SKIP0 -- the
+// read address (readCs), the output pointer Out0/Out1 and the bounds masks
+// pred30 / pred36 that OUTPUT_TRANSFORM consumes.
+
+//--:-:-:-:5      EXIT;
+
+
+--:-:1:-:2      S2R tid_128, SR_TID.X;
+<SCHEDULE_BLOCK>
+
+// "tid" in the formulas below is tid_128, i.e. SR_TID.X - 128.
+01:-:-:-:1      IADD tid_128, tid_128, -128;
+
+--:-:-:-:1      ISETP.GE.AND P6, PT, tid_128, 256, PT;
+
+// readFs = ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  Tid_1,   tid_128, 1;
+--:-:-:-:1      LOP.AND  readFs2, tid_128, 8;
+--:-:-:-:1      SHR.U32  readFs2, readFs2, 2;
+--:-:-:-:1      IADD     readFs2, readFs2, Tid_1;
+
+// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readFs << 2)
+--:-:-:-:1      LOP.AND  tid_16,   tid_128, -16;
+--:-:-:-:1      SHR.U32  tid_16,   tid_16,   1;
+--:-:-:-:1      BFE.U32  readIs2,  tid_128,  0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs2,  readIs2,  tid_16;
+--:-:-:-:1      ISCADD   readIs2,  readFs2, readIs2, 2;
+
+// readIs = (readIs << 4) + 4*32*4;  readFs <<= 3
+--:-:-:-:1      ISCADD   readIs2, readIs2, 4x<32*4>, 4;
+--:-:-:-:1      SHL      readFs2, readFs2, 3;
+
+// writeCs = readFs * 32*36 + readIs;
+--:-:-:-:0      XMAD writeCs, readFs2, 1x<32*36>, readIs2;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U SKIP0;
+
+--:-:2:-:1      LDS idxX, [addr_idx_X];
+--:-:3:-:1      LDS idxY, [addr_idx_Y];
+--:-:1:-:1      S2R idxN,  SR_CTAID.Z;
+--:-:4:-:1      LDS idxK, [addr_idx_K];
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      LOP.AND tid_31, tid_128, 31;
+--:-:-:-:1      SHR.U32 tid_32, tid_128,  5;
+--:-:-:-:1      SHR.U32 tid_64, tid_128,  6;
+
+
+// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16
+--:-:-:-:1      XMAD   readCs, tid_32, 1x<32*36>, tid_31;
+--:-:-:-:1      ISCADD readCs, tid_64, readCs, 4;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// Superblock offset
+// idxY <<= shiftY
+// idxX <<= shiftX
+// idxN <<= shiftN
+04:-:-:-:1      SHL idxY, idxY, param_shiftY;
+02:-:-:-:1      SHL idxX, idxX, param_shiftX;
+01:-:-:-:1      SHL idxN, idxN, param_shiftN;
+
+// Get this threads offset within the superblock
+--:-:-:-:1      BFE.U32 p, tid_31, param_SuperY;
+--:-:-:-:1      BFE.U32 q, tid_31, param_SuperX;
+--:-:-:-:1      LOP.AND n, tid_31, param_SuperN;
+
+// q = (q << 1) + idxX;  p = (p << 1) + idxY
+--:-:-:-:1      ISCADD q, q, idxX, 1;
+--:-:-:-:1      ISCADD p, p, idxY, 1;
+
+// q += param_pad_x - 4;  p += param_pad_y - 4
+--:-:-:-:1      MOV four, -4;
+--:-:-:-:1      IADD3 q, q, param_pad_x, four;
+--:-:-:-:1      IADD3 p, p, param_pad_y, four;
+
+[+
+    # Selects how n and the predicate P6 are formed. For $type 'h':
+    # n = (n << 1) + idxN, P6 = (tid_31 < 16), and tid31_4 = tid_31 << 2 is
+    # also produced. Otherwise: n = n + idxN and P6 = (n < param_N).
+    # $N is declared here but not referenced inside this block.
+    our ($type, $N);
+    if ($type eq 'h')
+    {
+        return q{
+--:-:-:-:1      SHL tid31_4, tid_31, 2;
+
+--:-:-:-:1      ISCADD n, n, idxN, 1;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tid_31, 16, PT;
+        }
+    }
+    else {
+        return q{
+--:-:-:-:1      IADD n, n, idxN;
+--:-:-:-:1      ISETP.LT.AND P6, PT, n, param_N, PT;
+        };
+    }
++]
+
+// k = idxK*32 + tid_32<<1
+--:-:-:-:1      SHL tid_32, tid_32,   1;
+08:-:-:-:1      ISCADD k, idxK, tid_32, 5;
+
+// Out = k*PQN + p*QN + q*N + n
+--:-:-:-:1      XMAD.S16.U16      offsetO, q, param_N,    n;
+--:-:-:-:1      XMAD.S16.U16.LO2C offsetO, p, param_QN,   offsetO;
+--:-:-:-:1      XMAD.U16.U16.LO2C offsetO, k, param_PQN,  offsetO;
+--:-:-:-:1      ISET.LT.AND sign, offsetO, RZ, PT;
+
+--:-:-:-:1      LEA    Out0.CC, offsetO, param_O[0], [+ dshift() +];
+--:-:-:-:1      IADD.X Out1,    sign,    param_O[1];
+
+--:-:-:-:1      ISETP.EQ.AND P5, PT, RZ, param_flags, PT; // ! no-op
+
+// Column bounds: Pi = (0 <= q + i < param_Q) for i = 0 .. 5, each also ANDed
+// with P5 from the param_flags test above; the six bits are saved in mask_q.
+--:-:-:-:1      IADD z1, q, 1;
+--:-:-:-:1      IADD z2, q, 2;
+--:-:-:-:1      IADD z3, q, 3;
+--:-:-:-:1      IADD z4, q, 4;
+--:-:-:-:1      IADD z5, q, 5;
+--:-:-:-:1      ISETP.LT.AND P0, PT, q,  param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, z4, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P5, PT, z5, param_Q, P5;
+--:-:-:-:1      ISETP.GE.AND P0, PT, q,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, z4, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, z5, RZ, P5;
+--:-:-:-:1      P2R mask_q, PR, RZ, 0x3f;
+
+// Row bounds: Pi = (0 <= p + i < param_P) for i = 0 .. 5, each ANDed with P6.
+--:-:-:-:1      IADD z1, p, 1;
+--:-:-:-:1      IADD z2, p, 2;
+--:-:-:-:1      IADD z3, p, 3;
+--:-:-:-:1      IADD z4, p, 4;
+--:-:-:-:1      IADD z5, p, 5;
+--:-:-:-:1      ISETP.LT.AND P0, PT, p,  param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P4, PT, z4, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, z5, param_P, P6;
+--:-:-:-:1      ISETP.GE.AND P0, PT, p,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, z4, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, z5, RZ, P5;
+
+// Combine: pred30 receives mask_q for each row 0 .. 4 whose predicate is set
+// (BFI operands 0x606, 0x60c, 0x612, 0x618 -- presumably 6-bit fields at bit
+// 6, 12, 18 and 24, by analogy with the BFE encodings commented above);
+// pred36 receives mask_q for row 5.
+--:-:-:-:1      SEL pred30, mask_q, RZ, P0;
+--:-:-:-:1  @P1 BFI pred30, mask_q, 0x606, pred30;
+--:-:-:-:1  @P2 BFI pred30, mask_q, 0x60c, pred30;
+--:-:-:-:1  @P3 BFI pred30, mask_q, 0x612, pred30;
+--:-:-:-:1  @P4 BFI pred30, mask_q, 0x618, pred30;
+--:-:-:-:1      SEL pred36, mask_q, RZ, P5;
+
+// P6 was reused above; recompute the tid_128 >= 256 test that guards the
+// SKIP branches.
+--:-:-:-:1      ISETP.GE.AND P6, PT, tid_128, 256, PT;
+
+</SCHEDULE_BLOCK>
+
+SKIP0:
+// Pass 1 of 4: scale accumulator columns y0 and y2 (x0 .. x7) by param_alpha,
+// store them with four STS.128 at writeCs, synchronise, then call
+// OUTPUT_TRANSFORM unless P6 is set.
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, ccx7y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, ccx3y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, ccx7y2, param_alpha;
+
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P6 BRA.U SKIP1;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP1:
+// Pass 2 of 4: k += 1 and Out0/Out1 advance by param_PQNp (carry added by the
+// IADD.X below); columns y1 and y3 are scaled by param_alpha and stored at
+// writeCs, then OUTPUT_TRANSFORM is called unless P6 is set.
+
+--:-:-:-:0      IADD k, k, 1;
+--:-:-:-:5      BAR.SYNC 0;
+01:-:-:-:1      IADD Out0.CC, Out0, param_PQNp;
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y1, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y1, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y3, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y3, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y3, param_alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y3, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y3, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y3, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y3, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y3, param_alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:0      IADD.X Out1, Out1, RZ;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P6 BRA.U SKIP2;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP2:
+// Pass 3 of 4: k += 15 and Out0/Out1 advance by param_PQN15p (carry added by
+// the IADD.X below); columns y4 and y6 are scaled by param_alpha and stored
+// at writeCs, then OUTPUT_TRANSFORM is called unless P6 is set.
+
+--:-:-:-:0      IADD k, k, 15;
+--:-:-:-:5      BAR.SYNC 0;
+01:-:-:-:1      IADD Out0.CC, Out0, param_PQN15p;
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y4, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y4, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y6, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y6, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y6, param_alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y6, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y6, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y6, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y6, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y6, param_alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:0      IADD.X Out1, Out1, RZ;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P6 BRA.U SKIP3;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP3:
+// Pass 4 of 4: k += 1 and Out0/Out1 advance by param_PQNp (carry added by the
+// IADD.X below); columns y5 and y7 are scaled by param_alpha and stored at
+// writeCs, then OUTPUT_TRANSFORM is called unless P6 is set.
+
+--:-:-:-:0      IADD k, k, 1;
+--:-:-:-:5      BAR.SYNC 0;
+01:-:-:-:1      IADD Out0.CC, Out0, param_PQNp;
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y5, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y5, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y7, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y7, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y7, param_alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y7, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y7, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y7, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y7, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y7, param_alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:0      IADD.X Out1, Out1, RZ;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P6 BRA.U SKIP4;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP4:
+// All four store / OUTPUT_TRANSFORM passes are done; terminate the thread.
+
+--:-:-:-:5      EXIT;
+
+OUTPUT_TRANSFORM:
+// Target of the CAL instructions in the SKIP0 .. SKIP3 passes. If k is not
+// below param_K both bounds masks are cleared. The generator then emits the
+// 18 LDS loads of m<row><col> for columns 0 .. 2 from readCs.
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.LT.AND P0, PT, k, param_K, PT;
+--:-:-:-:1 @!P0 MOV pred30, RZ;
+--:-:-:-:1 @!P0 MOV pred36, RZ;
+[+
+    # One LDS per (column, row); every load of column N carries N + 1 in its
+    # third control field.
+    my @loads;
+    for my $col (0 .. 2)
+    {
+        my $bar = $col + 1;
+        push @loads,
+            map { "--:-:$bar:-:1      LDS m$_$col, [readCs + 4x<($_*6+$col)*32>];\n" } 0 .. 5;
+    }
+    return join '', @loads;
++]
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+// t0 = I[1,:] + I[2,:]
+// t1 = I[1,:] - I[2,:]
+// t2 = I[3,:] + I[4,:]
+// t3 = I[3,:] - I[4,:]
+// O[2,:] = t0 * -2.25   + t2 * -0.5625  + I[0,:] * -2.8125
+// O[1,:] = t1 * -1.6875 + t3 * -0.84375 + I[5,:] *  1.265625
+// O[3,:] = t1 *  0.75   + t3 *  1.5     + I[5,:] * -2.8125
+// O[4,:] = I[0,:] + t0  + t2
+// O[0,:] = I[0,:] * 1.265625
+// O[5,:] = I[5,:]
+[+
+    # Emits the formulas above for columns 0 .. 2 (m<row><col> in,
+    # w<row><col> out), then the LDS loads of m<row><col> for columns 3 .. 5.
+    my @chunks;
+    for my $col (0 .. 2)
+    {
+        # First control field of the leading FADD: 1 << $col as two hex digits.
+        my $wait = sprintf "%02x", 1 << $col;
+        push @chunks, qq{
+$wait:-:-:-:1      FADD t0$col, m1$col,  m2$col;
+--:-:-:-:1      FADD t1$col, m1$col, -m2$col;
+--:-:-:-:1      FADD t2$col, m3$col,  m4$col;
+--:-:-:-:1      FADD t3$col, m3$col, -m4$col;
+--:-:-:-:1      FMUL w2$col, m0$col, -2.8125;
+--:-:-:-:1      FFMA w2$col, t0$col, -2.25,    w2$col;
+--:-:-:-:1      FFMA w2$col, t2$col, -0.5625,  w2$col;
+--:-:-:-:1      FMUL w1$col, m5$col,  1.265625;
+--:-:-:-:1      FFMA w1$col, t1$col, -1.6875,  w1$col;
+--:-:-:-:1      FFMA w1$col, t3$col, -0.84375, w1$col;
+--:-:-:-:1      FMUL w3$col, m5$col, -2.8125;
+--:-:-:-:1      FFMA w3$col, t1$col,  0.75,    w3$col;
+--:-:-:-:1      FFMA w3$col, t3$col,  1.5,     w3$col;
+--:-:-:-:1      FADD w4$col, m0$col,  t0$col;
+--:-:-:-:1      FADD w4$col, w4$col,  t2$col;
+--:-:-:-:1      FMUL w0$col, m0$col,  1.265625;
+        };
+    }
+    for my $col (3 .. 5)
+    {
+        my $bar = $col + 1;
+        for my $row (0 .. 5)
+        {
+            push @chunks, "--:-:$bar:-:1      LDS m$row$col, [readCs + 4x<($row*6+$col)*32>];\n";
+        }
+    }
+    return join '', @chunks;
++]
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    # Same per-column transform as for columns 0 .. 2, here for columns
+    # 3 .. 5; the leading FADD's first control field is 1 << column in hex.
+    return join '', map {
+        my $col  = $_;
+        my $wait = sprintf "%02x", 1 << $col;
+        qq{
+$wait:-:-:-:1      FADD t0$col, m1$col,  m2$col;
+--:-:-:-:1      FADD t1$col, m1$col, -m2$col;
+--:-:-:-:1      FADD t2$col, m3$col,  m4$col;
+--:-:-:-:1      FADD t3$col, m3$col, -m4$col;
+--:-:-:-:1      FMUL w2$col, m0$col, -2.8125;
+--:-:-:-:1      FFMA w2$col, t0$col, -2.25,    w2$col;
+--:-:-:-:1      FFMA w2$col, t2$col, -0.5625,  w2$col;
+--:-:-:-:1      FMUL w1$col, m5$col,  1.265625;
+--:-:-:-:1      FFMA w1$col, t1$col, -1.6875,  w1$col;
+--:-:-:-:1      FFMA w1$col, t3$col, -0.84375, w1$col;
+--:-:-:-:1      FMUL w3$col, m5$col, -2.8125;
+--:-:-:-:1      FFMA w3$col, t1$col,  0.75,    w3$col;
+--:-:-:-:1      FFMA w3$col, t3$col,  1.5,     w3$col;
+--:-:-:-:1      FADD w4$col, m0$col,  t0$col;
+--:-:-:-:1      FADD w4$col, w4$col,  t2$col;
+--:-:-:-:1      FMUL w0$col, m0$col,  1.265625;
+        };
+    } 3 .. 5;
++]
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    # Second pass of the transform: the same formulas applied along the other
+    # index, turning w<row>0 .. w<row>5 into s<row>0 .. s<row>4 for rows 0 .. 5.
+    my @passes;
+    for my $row (0 .. 5)
+    {
+        push @passes, qq{
+--:-:-:-:1      FADD r${row}0, w${row}1,  w${row}2;
+--:-:-:-:1      FADD r${row}1, w${row}1, -w${row}2;
+--:-:-:-:1      FADD r${row}2, w${row}3,  w${row}4;
+--:-:-:-:1      FADD r${row}3, w${row}3, -w${row}4;
+--:-:-:-:1      FMUL s${row}2, w${row}0, -2.8125;
+--:-:-:-:1      FFMA s${row}2, r${row}0, -2.25,    s${row}2;
+--:-:-:-:1      FFMA s${row}2, r${row}2, -0.5625,  s${row}2;
+--:-:-:-:1      FMUL s${row}1, w${row}5,  1.265625;
+--:-:-:-:1      FFMA s${row}1, r${row}1, -1.6875,  s${row}1;
+--:-:-:-:1      FFMA s${row}1, r${row}3, -0.84375, s${row}1;
+--:-:-:-:1      FMUL s${row}3, w${row}5, -2.8125;
+--:-:-:-:1      FFMA s${row}3, r${row}1,  0.75,    s${row}3;
+--:-:-:-:1      FFMA s${row}3, r${row}3,  1.5,     s${row}3;
+--:-:-:-:1      FADD s${row}4, w${row}0,  r${row}0;
+--:-:-:-:1      FADD s${row}4, s${row}4,  r${row}2;
+--:-:-:-:1      FMUL s${row}0, w${row}0,  1.265625;
+        };
+    }
+    return join '', @passes;
++]
+[+
+    our $type;
+
+    # Type 'h': convert the results with F2F.F16.F32 and write them back
+    # through readCs with STS.U16 (readCs is rebased around the stores).
+    if ($type eq 'h')
+    {
+        return q{
+
+--:-:-:-:1      IADD readCs, readCs, -tid31_4;
+--:-:-:-:1      SHR.U32 tid31_4, tid31_4, 1;
+--:-:-:-:1      IADD readCs, readCs, tid31_4;
+
+<ORDERED>
+--:-:-:-:1      F2F.F16.F32 s05, s05;
+--:-:-:-:1      F2F.F16.F32 s00, s00;
+--:-:-:-:1      F2F.F16.F32 s02, s02;
+--:-:-:-:1      F2F.F16.F32 s01, s01;
+--:-:-:-:1      F2F.F16.F32 s03, s03;
+--:-:1:-:1      F2F.F16.F32 s04, s04;
+
+--:-:-:-:1      F2F.F16.F32 s15, s15;
+--:-:-:-:1      F2F.F16.F32 s10, s10;
+--:-:-:-:1      F2F.F16.F32 s12, s12;
+--:-:-:-:1      F2F.F16.F32 s11, s11;
+--:-:-:-:1      F2F.F16.F32 s13, s13;
+--:-:2:-:1      F2F.F16.F32 s14, s14;
+
+01:-:-:-:1      STS.U16 [readCs + 4x<(0*6+0)*32>], s00;
+--:-:-:-:1      STS.U16 [readCs + 4x<(0*6+1)*32>], s01;
+--:-:-:-:1      STS.U16 [readCs + 4x<(0*6+2)*32>], s02;
+--:-:-:-:1      STS.U16 [readCs + 4x<(0*6+3)*32>], s03;
+--:-:-:-:1      STS.U16 [readCs + 4x<(0*6+4)*32>], s04;
+--:-:-:-:1      STS.U16 [readCs + 4x<(0*6+5)*32>], s05;
+
+--:-:-:-:1      F2F.F16.F32 s25, s25;
+--:-:-:-:1      F2F.F16.F32 s20, s20;
+--:-:-:-:1      F2F.F16.F32 s22, s22;
+--:-:-:-:1      F2F.F16.F32 s21, s21;
+--:-:-:-:1      F2F.F16.F32 s23, s23;
+--:-:3:-:1      F2F.F16.F32 s24, s24;
+
+02:-:-:-:1      STS.U16 [readCs + 4x<(1*6+0)*32>], s10;
+--:-:-:-:1      STS.U16 [readCs + 4x<(1*6+1)*32>], s11;
+--:-:-:-:1      STS.U16 [readCs + 4x<(1*6+2)*32>], s12;
+--:-:-:-:1      STS.U16 [readCs + 4x<(1*6+3)*32>], s13;
+--:-:-:-:1      STS.U16 [readCs + 4x<(1*6+4)*32>], s14;
+--:-:-:-:1      STS.U16 [readCs + 4x<(1*6+5)*32>], s15;
+
+--:-:-:-:1      F2F.F16.F32 s35, s35;
+--:-:-:-:1      F2F.F16.F32 s30, s30;
+--:-:-:-:1      F2F.F16.F32 s32, s32;
+--:-:-:-:1      F2F.F16.F32 s31, s31;
+--:-:-:-:1      F2F.F16.F32 s33, s33;
+--:-:4:-:1      F2F.F16.F32 s34, s34;
+
+04:-:-:-:1      STS.U16 [readCs + 4x<(2*6+0)*32>], s20;
+--:-:-:-:1      STS.U16 [readCs + 4x<(2*6+1)*32>], s21;
+--:-:-:-:1      STS.U16 [readCs + 4x<(2*6+2)*32>], s22;
+--:-:-:-:1      STS.U16 [readCs + 4x<(2*6+3)*32>], s23;
+--:-:-:-:1      STS.U16 [readCs + 4x<(2*6+4)*32>], s24;
+--:-:-:-:1      STS.U16 [readCs + 4x<(2*6+5)*32>], s25;
+
+--:-:-:-:1      F2F.F16.F32 s45, s45;
+--:-:-:-:1      F2F.F16.F32 s40, s40;
+--:-:-:-:1      F2F.F16.F32 s42, s42;
+--:-:-:-:1      F2F.F16.F32 s41, s41;
+--:-:-:-:1      F2F.F16.F32 s43, s43;
+--:-:5:-:1      F2F.F16.F32 s44, s44;
+
+08:-:-:-:1      STS.U16 [readCs + 4x<(3*6+0)*32>], s30;
+--:-:-:-:1      STS.U16 [readCs + 4x<(3*6+1)*32>], s31;
+--:-:-:-:1      STS.U16 [readCs + 4x<(3*6+2)*32>], s32;
+--:-:-:-:1      STS.U16 [readCs + 4x<(3*6+3)*32>], s33;
+--:-:-:-:1      STS.U16 [readCs + 4x<(3*6+4)*32>], s34;
+--:-:-:-:1      STS.U16 [readCs + 4x<(3*6+5)*32>], s35;
+
+--:-:-:-:1      F2F.F16.F32 s55, s55;
+--:-:-:-:1      F2F.F16.F32 s50, s50;
+--:-:-:-:1      F2F.F16.F32 s52, s52;
+--:-:-:-:1      F2F.F16.F32 s51, s51;
+--:-:-:-:1      F2F.F16.F32 s53, s53;
+--:-:6:-:1      F2F.F16.F32 s54, s54;
+
+10:-:-:-:1      STS.U16 [readCs + 4x<(4*6+0)*32>], s40;
+--:-:-:-:1      STS.U16 [readCs + 4x<(4*6+1)*32>], s41;
+--:-:-:-:1      STS.U16 [readCs + 4x<(4*6+2)*32>], s42;
+--:-:-:-:1      STS.U16 [readCs + 4x<(4*6+3)*32>], s43;
+--:-:-:-:1      STS.U16 [readCs + 4x<(4*6+4)*32>], s44;
+--:-:-:-:1      STS.U16 [readCs + 4x<(4*6+5)*32>], s45;
+
+20:-:-:-:1      STS.U16 [readCs + 4x<(5*6+0)*32>], s50;
+--:-:-:-:1      STS.U16 [readCs + 4x<(5*6+1)*32>], s51;
+--:-:-:-:1      STS.U16 [readCs + 4x<(5*6+2)*32>], s52;
+--:-:-:-:1      STS.U16 [readCs + 4x<(5*6+3)*32>], s53;
+--:-:-:-:1      STS.U16 [readCs + 4x<(5*6+4)*32>], s54;
+--:1:-:-:2      STS.U16 [readCs + 4x<(5*6+5)*32>], s55; // FORCE
+</ORDERED>
+
+01:-:-:-:1      IADD readCs, readCs, -tid31_4;
+--:-:-:-:1      SHL tid31_4, tid31_4, 1;
+--:-:-:-:4      IADD readCs, readCs, tid31_4;
+
+    };
+    }
+
+    # Any other type: predicated RED.E.ADD.F32 of every s<row><col> to Out.
+    # P0 .. P5 are reloaded from pred30 (rotated 6 bits per row) for rows
+    # 0 .. 4 and from pred36 for row 5.
+    return q{
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 0*$N>], s00;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 1*$N>], s01;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 2*$N>], s02;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 3*$N>], s03;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 4*$N>], s04;
+--:-:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 5*$N>], s05;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 0*$N>], s10;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 1*$N>], s11;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 2*$N>], s12;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 3*$N>], s13;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 4*$N>], s14;
+--:-:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 5*$N>], s15;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 0*$N>], s20;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 1*$N>], s21;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 2*$N>], s22;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 3*$N>], s23;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 4*$N>], s24;
+--:-:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 5*$N>], s25;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 0*$N>], s30;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 1*$N>], s31;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 2*$N>], s32;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 3*$N>], s33;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 4*$N>], s34;
+--:-:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 5*$N>], s35;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.L.U64 pred30, pred30, 24, pred30;
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 0*$N>], s40;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 1*$N>], s41;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 2*$N>], s42;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 3*$N>], s43;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 4*$N>], s44;
+--:-:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 5*$N>], s45;
+--:-:-:-:1      R2P PR, pred36, 0x3f;
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 0*$N>], s50;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 1*$N>], s51;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 2*$N>], s52;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 3*$N>], s53;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 4*$N>], s54;
+--:1:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 5*$N>], s55;
+    };
++]
+</SCHEDULE_BLOCK>
+
+[+
+    our $type;
+    return $type eq 'h' ? q{
+--:-:-:-:1      LDS.U.32 s00, [readCs + 4x<(0*6+0)*32>];
+--:-:-:-:1      LDS.U.32 s01, [readCs + 4x<(0*6+1)*32>];
+--:-:-:-:1      LDS.U.32 s02, [readCs + 4x<(0*6+2)*32>];
+--:-:-:-:1      LDS.U.32 s03, [readCs + 4x<(0*6+3)*32>];
+--:-:-:-:1      LDS.U.32 s04, [readCs + 4x<(0*6+4)*32>];
+--:-:1:-:1      LDS.U.32 s05, [readCs + 4x<(0*6+5)*32>];
+
+--:-:-:-:1      LDS.U.32 s10, [readCs + 4x<(1*6+0)*32>];
+--:-:-:-:1      LDS.U.32 s11, [readCs + 4x<(1*6+1)*32>];
+--:-:-:-:1      LDS.U.32 s12, [readCs + 4x<(1*6+2)*32>];
+--:-:-:-:1      LDS.U.32 s13, [readCs + 4x<(1*6+3)*32>];
+--:-:-:-:1      LDS.U.32 s14, [readCs + 4x<(1*6+4)*32>];
+--:-:2:-:1      LDS.U.32 s15, [readCs + 4x<(1*6+5)*32>];
+
+--:-:-:-:1      LDS.U.32 s20, [readCs + 4x<(2*6+0)*32>];
+--:-:-:-:1      LDS.U.32 s21, [readCs + 4x<(2*6+1)*32>];
+--:-:-:-:1      LDS.U.32 s22, [readCs + 4x<(2*6+2)*32>];
+--:-:-:-:1      LDS.U.32 s23, [readCs + 4x<(2*6+3)*32>];
+--:-:-:-:1      LDS.U.32 s24, [readCs + 4x<(2*6+4)*32>];
+--:-:3:-:1      LDS.U.32 s25, [readCs + 4x<(2*6+5)*32>];
+
+<SCHEDULE_BLOCK>
+<ORDERED>
+01:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 0*$N>], s00;
+--:-:-:-:1      LDS.U.32 s30, [readCs + 4x<(3*6+0)*32>];
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 1*$N>], s01;
+--:-:-:-:1      LDS.U.32 s31, [readCs + 4x<(3*6+1)*32>];
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 2*$N>], s02;
+--:-:-:-:1      LDS.U.32 s32, [readCs + 4x<(3*6+2)*32>];
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 3*$N>], s03;
+--:-:-:-:1      LDS.U.32 s33, [readCs + 4x<(3*6+3)*32>];
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 4*$N>], s04;
+--:-:-:-:1      LDS.U.32 s34, [readCs + 4x<(3*6+4)*32>];
+--:-:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 5*$N>], s05;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:4:-:1      LDS.U.32 s35, [readCs + 4x<(3*6+5)*32>];
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1      LDS.U.32 s40, [readCs + 4x<(4*6+0)*32>];
+02:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 0*$N>], s10;
+--:-:-:-:1      LDS.U.32 s41, [readCs + 4x<(4*6+1)*32>];
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 1*$N>], s11;
+--:-:-:-:1      LDS.U.32 s42, [readCs + 4x<(4*6+2)*32>];
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 2*$N>], s12;
+--:-:-:-:1      LDS.U.32 s43, [readCs + 4x<(4*6+3)*32>];
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 3*$N>], s13;
+--:-:-:-:1      LDS.U.32 s44, [readCs + 4x<(4*6+4)*32>];
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 4*$N>], s14;
+--:-:5:-:1      LDS.U.32 s45, [readCs + 4x<(4*6+5)*32>];
+--:-:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 5*$N>], s15;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      LDS.U.32 s50, [readCs + 4x<(5*6+0)*32>];
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1      LDS.U.32 s51, [readCs + 4x<(5*6+1)*32>];
+04:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 0*$N>], s20;
+--:-:-:-:1      LDS.U.32 s52, [readCs + 4x<(5*6+2)*32>];
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 1*$N>], s21;
+--:-:-:-:1      LDS.U.32 s53, [readCs + 4x<(5*6+3)*32>];
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 2*$N>], s22;
+--:-:-:-:1      LDS.U.32 s54, [readCs + 4x<(5*6+4)*32>];
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 3*$N>], s23;
+--:-:6:-:1      LDS.U.32 s55, [readCs + 4x<(5*6+5)*32>];
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 4*$N>], s24;
+--:-:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 5*$N>], s25;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+
+08:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 0*$N>], s30;
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 1*$N>], s31;
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 2*$N>], s32;
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 3*$N>], s33;
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 4*$N>], s34;
+--:-:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 5*$N>], s35;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.L.U64 pred30, pred30, 24, pred30;
+10:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 0*$N>], s40;
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 1*$N>], s41;
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 2*$N>], s42;
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 3*$N>], s43;
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 4*$N>], s44;
+--:-:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 5*$N>], s45;
+--:-:-:-:1      R2P PR, pred36, 0x3f;
+20:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 0*$N>], s50;
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 1*$N>], s51;
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 2*$N>], s52;
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 3*$N>], s53;
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 4*$N>], s54;
+--:1:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 5*$N>], s55;
+</ORDERED>
+</SCHEDULE_BLOCK>
+    } : '';
++]
+
+--:-:-:-:5      RET;
+
+// RED.E.ADD.F16x2.FTZ.RN
\ No newline at end of file
diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_3x3_2x2_32x32.sass b/Kernel/Convolution/Maxwell/xconv_winograd_3x3_2x2_32x32.sass
new file mode 100644
index 0000000..fe1dc07
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/xconv_winograd_3x3_2x2_32x32.sass
@@ -0,0 +1,1814 @@
+
+# Copyright 2015 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+our ($type, $IX, $D);  # template switches; declared here but not assigned in this block
+our $determ = $D;  # alias of $D, consulted by output_op() below
+our $dtype        = $type eq 'h' ?        '.U16' : '';  # opcode suffix appended to the LDG.E.CI loads
+our $convert_in   = $type eq 'h' ? 'F2F.F32.F16' : '';  # conversion opcode for loaded values; empty disables the convert blocks
+our $convert_out  = $type eq 'h' ? 'F2F.F16.F32' : '';  # reverse conversion opcode; presumably for the output path - confirm
+our $vec_size     = $type eq 'h' ?          '64' : '128';  # width suffix of the vector loads (LDS.U / LDG.E.CG)
+our $dtype_shift  = $type eq 'h' ?           '1' : '2';  # log2 of $dtype_size, used as a shift amount in address math
+our $dtype_size   = $type eq 'h' ?           '2' : '4';  # element size multiplier used in load offsets (presumably bytes)
+sub dtype       { return $dtype;       }  # accessor for $dtype
+sub dtype_shift { return $dtype_shift; }  # accessor for $dtype_shift
+sub output_op   { return $determ ? 'STG.E.CG' : 'RED.E.ADD.F32.FTZ.RN'; }  # plain store when $determ is true, atomic add otherwise
+-]
+
+<CONSTANT_MAPPING>
+
+    addr_zero  : 4x<(512*4 + 32)*4 + 0>
+    addr_blk_K : 4x<(512*4 + 32)*4 + 4>
+    addr_blk_C : 4x<(512*4 + 32)*4 + 5>
+    addr_blk_P : 4x<(512*4 + 32)*4 + 6>
+    addr_blk_Q : 4x<(512*4 + 32)*4 + 7>
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_Y            : c[0x0][0x15c]
+    param_X            : c[0x0][0x160]
+    param_P            : c[0x0][0x164]
+    param_Q            : c[0x0][0x168]
+    param_C            : c[0x0][0x16c]
+    param_K            : c[0x0][0x170]
+    param_N            : c[0x0][0x174]
+    param_pad_y        : c[0x0][0x178]
+    param_pad_x        : c[0x0][0x17c]
+    param_GY           : c[0x0][0x180]
+    param_GX           : c[0x0][0x184]
+    param_GYS          : c[0x0][0x188]
+    param_GXS          : c[0x0][0x18c]
+    param_shiftYI      : c[0x0][0x190]
+    param_shiftXI      : c[0x0][0x194]
+    param_superYI      : c[0x0][0x198]
+    param_superXI      : c[0x0][0x19c]
+    param_superNI      : c[0x0][0x1a0]
+    param_shiftY       : c[0x0][0x1a4]
+    param_shiftX       : c[0x0][0x1a8]
+    param_superY       : c[0x0][0x1ac]
+    param_superX       : c[0x0][0x1b0]
+    param_superN       : c[0x0][0x1b4]
+    param_loopXI       : c[0x0][0x1b8]
+    param_loopX        : c[0x0][0x1bc]
+    param_loopN        : c[0x0][0x1c0]
+    param_strideY      : c[0x0][0x1c4]
+    param_strideX      : c[0x0][0x1c8]
+    param_XN           : c[0x0][0x1cc]
+    param_YXN          : c[0x0][0x1d0]
+    param_QN           : c[0x0][0x1d4]
+    param_PQN          : c[0x0][0x1d8]
+    param_SK           : c[0x0][0x1dc]
+    param_RSK          : c[0x0][0x1e0]
+    param_Np           : c[0x0][0x1e4]
+    param_XNp          : c[0x0][0x1e8]
+    param_2XNp         : c[0x0][0x1ec]
+    param_QNp          : c[0x0][0x1f0]
+    param_CPQkc        : c[0x0][0x1f4]
+    param_PQkc         : c[0x0][0x1f8]
+    param_Qkc          : c[0x0][0x1fc]
+    param_kc           : c[0x0][0x200]
+    param_c            : c[0x0][0x204]
+    param_k            : c[0x0][0x208]
+    param_magic_CPQkc  : c[0x0][0x20c]
+    param_shift_CPQkc  : c[0x0][0x210]
+    param_magic_PQkc   : c[0x0][0x214]
+    param_shift_PQkc   : c[0x0][0x218]
+    param_magic_Qkc    : c[0x0][0x21c]
+    param_shift_Qkc    : c[0x0][0x220]
+    param_magic_kc     : c[0x0][0x224]
+    param_shift_kc     : c[0x0][0x228]
+    param_magic_c      : c[0x0][0x22c]
+    param_shift_c      : c[0x0][0x230]
+    param_CRSK         : c[0x0][0x234]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+       0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+
+      64-79 : j0Ex<0-7>, j0Iy<0-7>
+      80-95 : j1Ex<0-7>, j1Iy<0-7>
+
+      64-79 ~ blk_KCPQkc, blk_CPQkc, blk_PQkc, blk_Qkc, blk_kc, blk_k, blk_c, blk_K, blk_C, blk_P, magic_CPQkc, magic_PQkc, magic_Qkc
+      84-95 ~ div1, div2, div3, tidX, tidY, tid16, tid1, neg_CPQkc, neg_PQkc, neg_Qkc, neg_kc, neg_c
+
+      80-82 : init, tid, blk_Q
+         83 = blkC, blkK
+      84-95 ~ x, x<1-3>, y, super_x, super_y, tid_X, c, offsign, mask_x, mask_y
+      84-95 ~ nloop, N
+         81 = off_sign
+         64 = swapBuf
+
+     96-103 : track0<0-1>, track1<0-1>, track2<0-1>, track3<0-1>
+
+    120-127 ~ writeS, readEs, readIs, pred_bits, gys, gxs, n, offset
+
+       0-31 : m0<0-3>, m1<0-3>, m2<0-3>, m3<0-3>, t0<0-3>, t1<0-3>, t2<0-3>
+      64-72 : f0<0-2>, f1<0-2>, f2<0-2>
+      76-79 : blkKCPQ<0-3>
+      76-79 : K_blk, C_blk, P_blk, Q_blk
+      84-95 ~ CRSK, xmad_determ, PQ_blk
+     96-109 ~ alpha, writeCs, readCs, cc, RSK8, tid_1, tid_16, tid_31, tid_32, kk, trackF, K1, SK1
+    110-115 : F00_<0-1>, F01_<0-1>, F02_<0-1>,
+    116-121 : F10_<0-1>, F11_<0-1>, F12_<0-1>,
+    122-127 : F20_<0-1>, F21_<0-1>, F22_<0-1>
+[+
+    our $IX;
+    return $IX ? q{
+      96-99 : trackI<0-1>, offsetI<0-1>
+    100-103 ~ swapBuffer, gy, gx
+
+    104-119 : I0<0-3>, I1<0-3>, I2<0-3>, I3<0-3>
+    } : q{
+    // registers reordered to avoid bank conflicts
+    104 = y0x0, Y0X0, I00, Y1X0
+    105 = y0x1, Y0X1, I02, Y1X2
+    106 = y0x2, Y0X2, I13
+    107 = y0x3, Y0X3, I03, Y1X3
+    108 = y1x0, I04
+    110 = y1x1, I05
+    109 = y1x2, I06
+    111 = y1x3, I07
+    113 = y2x0, Y2X0, I08
+    112 = y2x1, Y2X1
+    119 = y2x2, Y2X2, I10
+    117 = y2x3, Y2X3, I11
+    115 = y3x0, Y3X0, I12
+    116 = y3x1, Y3X1, I14
+    114 = y3x2, Y3X2, I09
+    118 = y3x3, Y3X3, I15
+    80  = I01
+    64  = Y1X1
+    };
++]
+    // Error registers
+    104 = p0q0, E00
+    105 = p0q1, E03
+    106 = p1q0, E12
+    107 = p1q1, E15
+    108 = e0, C0, E08
+    109 = E01
+    110 = E02
+    111 = e1, C1, E11
+    112 = E13
+    113 = E14
+    114 = B0, E04
+    115 = B1, E07
+    116 = e2, E06
+    117 = e3, E10
+    118 = E05
+    119 = E09
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,        SR_TID.X;
+--:-:2:-:1      S2R blk_KCPQkc, SR_CTAID.X;
+// Kernel entry: decompose the linear block id into K/C/P/Q block coordinates and build the shared-memory read/write offsets.
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 128, PT;  // P0 = tid >= 128; those threads branch to ERROR_SETUP after this block
+// Store RZ at addr_zero, then load that location into czero00, czero04, ... czero60 (16 x LDS.U.128).
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+
+--:-:-:-:1      MOV  magic_CPQkc,    param_magic_CPQkc;
+--:-:-:-:1      MOV  magic_PQkc,     param_magic_PQkc;
+--:-:-:-:1      MOV  magic_Qkc,      param_magic_Qkc;
+--:-:-:-:1      IADD neg_CPQkc, RZ, -param_CPQkc;  // negated divisors feed the multiply-add remainder steps below
+--:-:-:-:1      IADD neg_PQkc,  RZ, -param_PQkc;
+--:-:-:-:1      IADD neg_Qkc,   RZ, -param_Qkc;
+--:-:-:-:1      IADD neg_kc,    RZ, -param_kc;
+--:-:-:-:1      IADD neg_c,     RZ, -param_c;
+
+--:-:-:-:1      ISETP.NE.AND P1, PT, magic_CPQkc, 1, PT;  // P1-P3: magic != 1 selects the multiply path, otherwise only the shift is applied
+--:-:-:-:1      ISETP.NE.AND P2, PT, magic_PQkc,  1, PT;
+--:-:-:-:1      ISETP.NE.AND P3, PT, magic_Qkc,   1, PT;
+
+// blk_K = blk_KCPQkc / CPQkc
+02:-:-:-:1  @P1 XMAD     div1, blk_KCPQkc,    magic_CPQkc,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blk_KCPQkc,    magic_CPQkc.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blk_KCPQkc.H1, magic_CPQkc.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blk_KCPQkc.H1, magic_CPQkc,    div1;
+--:-:-:-:1  @P1 IADD3.RS blk_K, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  blk_K, blk_K,      param_shift_CPQkc;
+--:-:-:-:1 @!P1 SHR.U32  blk_K, blk_KCPQkc, param_shift_CPQkc;
+
+// blk_CPQkc = blk_KCPQkc % CPQkc
+--:-:-:-:1      XMAD.LO2 blk_CPQkc, neg_CPQkc, blk_K, blk_KCPQkc;
+
+// blk_C = blk_CPQkc / PQkc
+--:-:-:-:1  @P2 XMAD     div1, blk_CPQkc,    magic_PQkc,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, blk_CPQkc,    magic_PQkc.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, blk_CPQkc.H1, magic_PQkc.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, blk_CPQkc.H1, magic_PQkc,    div1;
+--:-:-:-:1  @P2 IADD3.RS blk_C, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  blk_C, blk_C,     param_shift_PQkc;
+--:-:-:-:1 @!P2 SHR.U32  blk_C, blk_CPQkc, param_shift_PQkc;
+
+// blk_PQkc = blk_CPQkc % PQkc
+--:-:-:-:1      XMAD.LO2 blk_PQkc, neg_PQkc, blk_C, blk_CPQkc;
+
+// blk_P = blk_PQkc / Qkc
+--:-:-:-:1  @P3 XMAD     div1, blk_PQkc,    magic_Qkc,    RZ;
+--:-:-:-:1  @P3 XMAD     div2, blk_PQkc,    magic_Qkc.H1, RZ;
+--:-:-:-:1  @P3 XMAD     div3, blk_PQkc.H1, magic_Qkc.H1, RZ;
+--:-:-:-:1  @P3 XMAD.CHI div1, blk_PQkc.H1, magic_Qkc,    div1;
+--:-:-:-:1  @P3 IADD3.RS blk_P, div1, div2, div3;
+--:-:-:-:1  @P3 SHR.U32  blk_P, blk_P,    param_shift_Qkc;
+--:-:-:-:1 @!P3 SHR.U32  blk_P, blk_PQkc, param_shift_Qkc;
+
+// blk_Qkc = blk_PQkc % Qkc
+--:-:-:-:1      XMAD.LO2 blk_Qkc, neg_Qkc, blk_P, blk_PQkc;
+
+// blk_Q  = blk_Qkc / kc
+--:-:-:-:1      XMAD.LO2C blk_Q, blk_Qkc, param_magic_kc, RZ;
+--:-:-:-:1      SHR.U32 blk_Q, blk_Q, param_shift_kc;
+// blk_kc = blk_Qkc % kc
+--:-:-:-:1      XMAD.S16.U16  blk_kc, neg_kc, blk_Q, blk_Qkc;
+
+// blk_k = blk_kc / c
+--:-:-:-:1      XMAD    blk_k,  blk_kc, param_magic_c, RZ;
+--:-:-:-:1      SHR.U32 blk_k,  blk_k,  param_shift_c;
+// blk_c = blk_kc % c
+--:-:-:-:1      XMAD.S16.U16 blk_c, neg_c, blk_k, blk_kc;
+
+// blk_K = blk_K*param_k + blk_k
+--:-:-:-:1      XMAD blk_K, blk_K, param_k, blk_k;
+// blk_C = blk_C*param_c + blk_c
+--:-:-:-:1      XMAD blk_C, blk_C, param_c, blk_c;
+
+// Spill these block constants to shared
+--:-:-:-:1      ISETP.EQ.AND P5, PT, tid, RZ, PT;  // only thread 0 performs the four stores
+--:-:-:-:1  @P5 STS [addr_blk_K], blk_K;
+--:-:-:-:1  @P5 STS [addr_blk_C], blk_C;
+--:-:-:-:1  @P5 STS [addr_blk_P], blk_P;
+--:-:-:-:1  @P5 STS [addr_blk_Q], blk_Q;
+
+// gxs = blk_Q
+// gys = blk_P
+--:-:-:-:1      MOV gxs, blk_Q;
+--:-:-:-:1      MOV gys, blk_P;
+
+[+
+    our $IX;  # when $IX is set this section emits nothing; the same code is emitted under ERROR_SETUP instead
+    return $IX ? '' : q{
+--:-:-:-:1      BFE.U32 n, tid, param_superN;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+
+// tidX = (tid & 127) >> 2
+// tidY = tid & 3
+// writeS = tidY*512 + tidX + (tidY << 3)
+--:-:-:-:1      BFE.U32 tidX, tid, 0x502; // 5 bits at position 2
+--:-:-:-:1      LOP.AND tidY, tid, 3;
+--:-:-:-:1      ISCADD writeS, tidY, tidX, 9;
+--:-:-:-:1      ISCADD writeS, tidY, writeS, 3;
+--:-:-:-:1      SHL    writeS, writeS,  2;  // multiply by 4 (presumably word index to byte offset)
+    };
++]
+
+// readEs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readIs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid16,  tid,   -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,  1;
+
+--:-:-:-:1      LOP.AND  tid1,   tid,    1;
+--:-:-:-:1      LOP.AND  readIs, tid,    8;
+--:-:-:-:1      SHR.U32  readIs, readIs, 2;
+--:-:-:-:1      LOP3.LUT readIs, readIs, tid16, tid1, 0xfe;  // 0xfe: three-way OR, matching the readIs formula above
+--:-:-:-:1      SHL      readIs, readIs, 4;  // multiply by 16
+
+--:-:-:-:1      BFE.U32  readEs, tid,    0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readEs, readEs, tid16;
+--:-:-:-:1      ISCADD   readEs, readEs, 4x<512*4 + 32>, 4;  // readEs*16 plus a constant base offset
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P0 BRA.U ERROR_SETUP;  // threads with tid >= 128 continue at ERROR_SETUP; the rest fall through to the image path
+
+[+
+    our ($IX, $dtype_shift);  # the block below is emitted only when $IX is set
+    return $IX ? qq{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV swapBuffer, 4x<(512*4 + 32)*2>;  // later added to writeS and then negated after each store phase
+
+// tidY = (tid & 127) / 32
+--:-:-:-:1      BFE.U32 tidY, tid, 0x205; // 2 bits at position 5
+--:-:-:-:1      BFE.U32 n, tid, param_superNI;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+
+// writeS = (tidY*512 + (tid & 31)*4)*4
+--:-:-:-:1      LOP.AND tidX,   tid, 31;
+--:-:-:-:1      SHL     writeS, tidX, 4;  // tidX * 16
+--:-:-:-:1      ISCADD  writeS, tidY, writeS, 11;  // plus tidY * 2048
+// offsetI = I + (tid & 31)*4
+--:-:-:-:1      LEA      offsetI0.CC, tidX, param_I[0],     1x<$dtype_shift + 2>;
+--:-:-:-:1      LEA.HI.X offsetI1,    tidX, param_I[1], RZ, 1x<$dtype_shift + 2>;
+
+</SCHEDULE_BLOCK>
+    } : '';
++]
+
+--:-:-:-:0      MOV blkC, blk_C;  // blkC is the channel block read by IMAGE_OFFSET
+
+// IMAGE_SETUP
+--:-:-:-:5      CAL IMAGE_OFFSET;
+--:-:-:-:5      CAL IMAGE_LOAD;
+--:-:-:-:5      CAL IMAGE_OFFSET;  // IMAGE_LOAD advanced n, gxs and gys, so recompute offsets for the next tile
+
+[+
+    our ($convert_in, $IX);  # $convert_in is non-empty only for type 'h'
+    if ($convert_in)
+    {
+        my $out = $IX ? qq{
+02:-:-:-:1      $convert_in I03, I01.H1;  // highest destination first, so each packed source is read before it is overwritten
+--:-:-:-:1      $convert_in I02, I01.H0;
+--:-:-:-:1      $convert_in I01, I00.H1;
+--:-:2:-:1      $convert_in I00, I00.H0;
+
+04:-:-:-:1      $convert_in I13, I11.H1;
+--:-:-:-:1      $convert_in I12, I11.H0;
+--:-:-:-:1      $convert_in I11, I10.H1;
+--:-:3:-:1      $convert_in I10, I10.H0;
+
+08:-:-:-:1      $convert_in I23, I21.H1;
+--:-:-:-:1      $convert_in I22, I21.H0;
+--:-:-:-:1      $convert_in I21, I20.H1;
+--:-:4:-:1      $convert_in I20, I20.H0;
+
+10:-:-:-:1      $convert_in I33, I31.H1;
+--:-:-:-:1      $convert_in I32, I31.H0;
+--:-:-:-:1      $convert_in I31, I30.H1;
+--:-:5:-:1      $convert_in I30, I30.H0;
+        } : qq{
+02:-:-:-:1      $convert_in y0x0, y0x0;  // in-place conversion, one instruction per loaded element
+--:-:-:-:1      $convert_in y0x1, y0x1;
+--:-:-:-:1      $convert_in y0x2, y0x2;
+--:-:2:-:1      $convert_in y0x3, y0x3;
+
+04:-:-:-:1      $convert_in y2x0, y2x0;  // rows follow the order in which IMAGE_LOAD issues them: y0, y2, y1, y3
+--:-:-:-:1      $convert_in y2x1, y2x1;
+--:-:-:-:1      $convert_in y2x2, y2x2;
+--:-:3:-:1      $convert_in y2x3, y2x3;
+
+08:-:-:-:1      $convert_in y1x0, y1x0;
+--:-:-:-:1      $convert_in y1x1, y1x1;
+--:-:-:-:1      $convert_in y1x2, y1x2;
+--:-:4:-:1      $convert_in y1x3, y1x3;
+
+10:-:-:-:1      $convert_in y3x0, y3x0;
+--:-:-:-:1      $convert_in y3x1, y3x1;
+--:-:-:-:1      $convert_in y3x2, y3x2;
+--:-:5:-:1      $convert_in y3x3, y3x3;
+        };
+        return qq{
+<SCHEDULE_BLOCK>
+<ORDERED>
+$out
+</ORDERED>
+--:-:-:-:1      NOP; # we need 20 total conversions.  that's 4 short of instruction 2 cache lines
+--:-:-:-:1      NOP;
+--:-:-:-:1      NOP;
+--:-:-:-:1      NOP;
+</SCHEDULE_BLOCK>
+        };
+    }
+    return '';  # no conversion opcode configured: emit nothing
++]
+
+[+
+    our $IX;  # set: store the loaded vectors unchanged; unset: apply the FADD transform below before storing
+    return $IX ? q{
+02:-:-:-:1      STS.128 [writeS + 4x<00*4>], I0;
+04:-:-:-:1      STS.128 [writeS + 4x<32*4>], I1;
+08:-:-:-:1      STS.128 [writeS + 4x<64*4>], I2;
+10:-:-:-:1      STS.128 [writeS + 4x<96*4>], I3;
+
+// init = bNextY ? 1 : 0
+--:-:-:-:0      SEL pred_bits, RZ, 1, !P6;
+
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeS, writeS,  swapBuffer;  // step writeS by swapBuffer ...
+--:-:-:-:0      IADD swapBuffer, RZ, -swapBuffer;  // ... and negate it so the next step moves back
+
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>];
+
+--:-:-:-:5      CAL IMAGE_LOAD;
+
+// init += bNextY ? 1 : 0
+--:-:-:-:0  @P6 IADD pred_bits, pred_bits, 1;
+
+--:-:-:-:5      CAL IMAGE_OFFSET;
+--:-:-:-:5      BRA.U IMAGE_LOOP;
+    } : q{
+<SCHEDULE_BLOCK>
+<ORDERED>
+06:-:-:-:1      FADD Y0X0, y0x0, -y2x0;  // row step: Y0 = y0 - y2
+--:-:-:-:1      FADD Y0X1, y0x1, -y2x1;
+--:-:-:-:1      FADD Y0X2, y0x2, -y2x2;
+--:-:-:-:1      FADD Y0X3, y0x3, -y2x3;
+--:-:-:-:1      FADD I00,  Y0X0, -Y0X2;  // column step per row: X0 - X2, X3 - X1, X1 + X2, X2 - X1
+--:-:-:-:1      FADD I03, -Y0X1,  Y0X3;
+--:-:-:-:1      FADD I01,  Y0X1,  Y0X2;
+--:-:-:-:1      FADD I02,  Y0X2, -Y0X1;
+--:-:-:-:1      STS [writeS + 4x<32*00>], I00;
+--:-:-:-:1      STS [writeS + 4x<32*03>], I03;
+--:-:-:-:1      STS [writeS + 4x<32*01>], I01;
+--:6:-:-:1      STS [writeS + 4x<32*02>], I02;
+18:-:-:-:1      FADD Y3X0, -y1x0, y3x0;  // row step: Y3 = y3 - y1
+--:-:-:-:1      FADD Y3X1, -y1x1, y3x1;
+--:-:-:-:1      FADD Y3X2, -y1x2, y3x2;
+--:-:-:-:1      FADD Y3X3, -y1x3, y3x3;
+--:-:-:-:1      FADD I12,  Y3X0, -Y3X2;
+--:-:-:-:1      FADD I15, -Y3X1,  Y3X3;
+--:-:-:-:1      FADD I13,  Y3X1,  Y3X2;
+--:-:-:-:1      FADD I14,  Y3X2, -Y3X1;
+--:-:-:-:1      STS [writeS + 4x<32*12>], I12;
+--:-:-:-:1      STS [writeS + 4x<32*15>], I15;
+--:-:-:-:1      STS [writeS + 4x<32*13>], I13;
+--:-:-:-:1      STS [writeS + 4x<32*14>], I14;
+20:-:-:-:1      FADD Y1X0, y1x0,  y2x0;  // row step: Y1 = y1 + y2
+--:-:-:-:1      FADD Y1X1, y1x1,  y2x1;
+--:-:-:-:1      FADD Y1X2, y1x2,  y2x2;
+--:-:-:-:1      FADD Y1X3, y1x3,  y2x3;
+--:-:-:-:1      FADD Y2X0, y2x0, -y1x0;  // row step: Y2 = y2 - y1
+--:-:-:-:1      FADD Y2X1, y2x1, -y1x1;
+--:-:-:-:1      FADD Y2X2, y2x2, -y1x2;
+--:-:-:-:1      FADD Y2X3, y2x3, -y1x3;
+--:-:-:-:1      FADD I04,  Y1X0, -Y1X2;
+--:-:-:-:1      FADD I05,  Y1X1,  Y1X2;
+--:-:-:-:1      FADD I06,  Y1X2, -Y1X1;
+--:-:-:-:1      FADD I07, -Y1X1,  Y1X3;
+--:-:-:-:1      STS [writeS + 4x<32*04>], I04;
+--:-:-:-:1      STS [writeS + 4x<32*05>], I05;
+--:-:-:-:1      STS [writeS + 4x<32*06>], I06;
+--:-:-:-:1      STS [writeS + 4x<32*07>], I07;
+--:-:-:-:1      FADD I08,  Y2X0, -Y2X2;
+--:-:-:-:1      FADD I11, -Y2X1,  Y2X3;
+--:-:-:-:1      FADD I09,  Y2X1,  Y2X2;
+--:-:-:-:1      FADD I10,  Y2X2, -Y2X1;
+--:-:-:-:1      STS [writeS + 4x<32*08>], I08;
+--:-:-:-:1      STS [writeS + 4x<32*11>], I11;
+--:-:-:-:1      STS [writeS + 4x<32*09>], I09;
+--:-:-:-:1      STS [writeS + 4x<32*10>], I10;
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+// init = bNextY ? 1 : 0
+--:-:-:-:0      SEL init, RZ, 1, !P6;
+
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      IADD writeS, writeS, 4x<(512*4 + 32)*2>;  // presumably moves writeS to the second shared buffer - confirm
+
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>];
+
+--:-:-:-:5      CAL IMAGE_LOAD;
+
+// init += bNextY ? 1 : 0
+--:-:-:-:0  @P6 IADD init, init, 1;
+--:-:-:-:5      CAL IMAGE_OFFSET;
+--:-:-:-:0      BFI pred_bits, init, 0x214, pred_bits; // 2 bits at position 20
+--:-:-:-:5      BRA.U IMAGE_LOOP;
+    };
++]
+
+
+IMAGE_OFFSET:
+// Subroutine: from gxs, gys, n and blkC, compute the load address registers and bounds predicates that IMAGE_LOAD consumes.
+<SCHEDULE_BLOCK>
+[+
+    our ($dtype_shift, $IX);  # $IX picks between the single-address path and the four-track path
+    return $IX ? qq{
+
+--:-:-:-:1      BFE.U32 super_x, tid, param_superXI;
+--:-:-:-:1      BFE.U32 super_y, tid, param_superYI;
+--:-:-:-:1      SHL gx, gxs, param_shiftXI;
+--:-:-:-:1      SHL gy, gys, param_shiftYI;
+--:-:-:-:1      IADD gx, gx, super_x;
+--:-:-:-:1      IADD gy, gy, super_y;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, gx, param_GX, P4;  // P0 = P4 and gx below GX
+--:-:-:-:1      ISETP.LT.AND P0, PT, gy, param_GY, P0;  // ... and gy below GY
+
+// offset = blkC*GY*GX*N + gy*GX*N + gx*N + n
+--:-:-:-:1      XMAD.U16.U16      offset, gx,   param_N,   n;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, gy,   param_XN,  offset;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, blkC, param_YXN, offset;
+
+// trackI = offsetI + offset*512
+20:-:-:-:1      LEA      trackI0.CC, offset, offsetI0,     1x<$dtype_shift + 9>;
+--:-:-:-:0      LEA.HI.X trackI1,    offset, offsetI1, RZ, 1x<$dtype_shift + 9>;
+    } : qq{
+// Calc superblock coordinates
+01:-:-:-:1      SHL x, gxs, param_shiftX;
+--:-:-:-:1      SHL y, gys, param_shiftY;
+
+// Calc this thread's sub-block coordinates
+--:-:-:-:1      BFE.U32 super_x, tid, param_superX;
+--:-:-:-:1      BFE.U32 super_y, tid, param_superY;
+--:-:-:-:1      ISCADD x, super_x,  x, 1;
+--:-:-:-:1      ISCADD y, super_y,  y, 1;
+
+// Apply padding
+--:-:-:-:1      IADD x, x, -param_pad_x;
+--:-:-:-:1      IADD y, y, -param_pad_y;
+
+// c = blkC*32 + tidX
+--:-:-:-:1      BFE.U32 tid_X, tid, 0x502; // 5 bits at position 2
+--:-:-:-:1      ISCADD c, blkC, tid_X, 5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, c, param_C, P4;  // P4 additionally requires c below C
+
+// offset = c*YXN + y*XN + x*N + n
+--:-:-:-:1      XMAD.S16.U16      offset, x, param_N,   n;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, y, param_XN,  offset;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, c, param_YXN, offset;
+--:-:-:-:1      ISET.LT.AND offsign, offset, RZ, PT;  // offsign feeds the high address word below; presumably all ones when offset is negative - confirm
+
+20:-:-:-:1      LEA    track00.CC, offset,  param_I[0], $dtype_shift;
+--:-:-:-:1      IADD.X track01,    offsign, param_I[1];
+--:-:-:-:1      IADD   track10.CC, track00, param_Np;  // track1..track3 = previous track + param_Np, with carry into the high word
+--:-:-:-:1      IADD.X track11,    track01, RZ;
+--:-:-:-:1      IADD   track20.CC, track10, param_Np;
+--:-:-:-:1      IADD.X track21,    track11, RZ;
+--:-:-:-:1      IADD   track30.CC, track20, param_Np;
+--:-:-:-:1      IADD.X track31,    track21, RZ;
+
+--:-:-:-:1      IADD x1, x, 1;
+--:-:-:-:1      IADD x2, x, 2;
+--:-:-:-:1      IADD x3, x, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, x,  param_X, P4;  // P0-P3: P4 and x+i within 0..X-1 for i = 0..3
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_X, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_X, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_X, P4;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x0f;  // pack the four x checks into mask_x
+
+--:-:-:-:1      IADD x1, y, 1;  // x1..x3 are reused here to hold y+1..y+3
+--:-:-:-:1      IADD x2, y, 2;
+--:-:-:-:1      IADD x3, y, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, y,  param_Y, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_Y, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_Y, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_Y, P4;
+--:-:-:-:1      ISETP.GE.AND P0, PT, y,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+
+--:-:-:-:1      SEL pred_bits, mask_x, RZ, P0;  // nibble r of pred_bits = mask_x when row y+r passed its check, else 0
+--:-:-:-:1  \@P1 BFI pred_bits, mask_x, 0x404, pred_bits;
+--:-:-:-:1  \@P2 BFI pred_bits, mask_x, 0x408, pred_bits;
+--:-:-:-:1  \@P3 BFI pred_bits, mask_x, 0x40c, pred_bits;
+
+// Cache y preds in high bits
+--:-:-:-:1      P2R mask_y, PR, RZ, 0x0f;
+--:-:-:-:0      BFI pred_bits, mask_y, 0x410, pred_bits; // 4 bits at position 16
+    };
++]
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
+IMAGE_LOAD:
+// Subroutine: issue the global loads for the current tile, then advance n, gxs and gys and refresh P4-P6 for the next one.
+<SCHEDULE_BLOCK>
+<ORDERED>
+[+
+    our ($dtype, $dtype_shift, $IX, $vec_size, $dtype_size);
+    return $IX ? qq{
+
+--:-:2:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero];  // P0 clear: read from addr_zero (written with RZ at kernel entry) instead of global memory
+--:-:3:-:1 \@!P0 LDS.U.$vec_size I1, [addr_zero];
+--:-:4:-:1 \@!P0 LDS.U.$vec_size I2, [addr_zero];
+--:-:5:-:1 \@!P0 LDS.U.$vec_size I3, [addr_zero];
+
+--:-:2:-:1  \@P0 LDG.E.CG.$vec_size I0, [trackI + 4x<00 * $dtype_size>];
+--:-:3:-:1  \@P0 LDG.E.CG.$vec_size I1, [trackI + 4x<32 * $dtype_size>];
+--:-:4:-:1  \@P0 LDG.E.CG.$vec_size I2, [trackI + 4x<64 * $dtype_size>];
+--:6:5:-:1  \@P0 LDG.E.CG.$vec_size I3, [trackI + 4x<96 * $dtype_size>];
+
+    } : qq{
+--:-:-:-:1      R2P PR, pred_bits, 0x0f;  // P0-P3 from the low nibble of pred_bits, used for the y0 loads
+--:-:-:-:1      SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;  // same register for both halves, so presumably a rotate right by 8 (confirm SHF semantics)
+
+--:-:-:-:1 \@!P0 MOV y0x0, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI$dtype y0x0, [track0];
+--:-:-:-:1 \@!P1 MOV y0x1, RZ;
+--:-:-:-:1  \@P1 LDG.E.CI$dtype y0x1, [track1];
+--:-:-:-:1 \@!P2 MOV y0x2, RZ;
+--:-:-:-:1  \@P2 LDG.E.CI$dtype y0x2, [track2];
+--:-:-:-:1 \@!P3 MOV y0x3, RZ;
+--:6:2:-:1  \@P3 LDG.E.CI$dtype y0x3, [track3];
+--:-:-:-:1      R2P PR, pred_bits, 0x0f;  // predicates for the y2 loads
+--:-:-:-:1      SHF.L.U64 pred_bits, pred_bits, 4, pred_bits;  // presumably rotate left by 4
+20:-:-:-:1      IADD   track00.CC, track00, param_2XNp;  // advance all four tracks by param_2XNp before the y2 loads
+--:-:-:-:1      IADD.X track01,    track01, RZ;
+--:-:-:-:1      IADD   track10.CC, track10, param_2XNp;
+--:-:-:-:1      IADD.X track11,    track11, RZ;
+--:-:-:-:1      IADD   track20.CC, track20, param_2XNp;
+--:-:-:-:1      IADD.X track21,    track21, RZ;
+--:-:-:-:1      IADD   track30.CC, track30, param_2XNp;
+--:-:-:-:1      IADD.X track31,    track31, RZ;
+
+--:-:-:-:1 \@!P0 MOV y2x0, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI$dtype y2x0, [track0];
+--:-:-:-:1 \@!P1 MOV y2x1, RZ;
+--:-:-:-:1  \@P1 LDG.E.CI$dtype y2x1, [track1];
+--:-:-:-:1 \@!P2 MOV y2x2, RZ;
+--:-:-:-:1  \@P2 LDG.E.CI$dtype y2x2, [track2];
+--:-:-:-:1 \@!P3 MOV y2x3, RZ;
+--:6:3:-:1  \@P3 LDG.E.CI$dtype y2x3, [track3];
+--:-:-:-:1      R2P PR, pred_bits, 0x0f;  // predicates for the y1 loads
+--:-:-:-:1      SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;
+20:-:-:-:1      IADD   track00.CC, track00, -param_XNp;  // step all four tracks back by param_XNp before the y1 loads
+--:-:-:-:1      IADD.X track01,    track01, -RZ;
+--:-:-:-:1      IADD   track10.CC, track10, -param_XNp;
+--:-:-:-:1      IADD.X track11,    track11, -RZ;
+--:-:-:-:1      IADD   track20.CC, track20, -param_XNp;
+--:-:-:-:1      IADD.X track21,    track21, -RZ;
+--:-:-:-:1      IADD   track30.CC, track30, -param_XNp;
+--:-:-:-:1      IADD.X track31,    track31, -RZ;
+
+--:-:-:-:1 \@!P0 MOV y1x0, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI$dtype y1x0, [track0];
+--:-:-:-:1 \@!P1 MOV y1x1, RZ;
+--:-:-:-:1  \@P1 LDG.E.CI$dtype y1x1, [track1];
+--:-:-:-:1 \@!P2 MOV y1x2, RZ;
+--:-:-:-:1  \@P2 LDG.E.CI$dtype y1x2, [track2];
+--:-:-:-:1 \@!P3 MOV y1x3, RZ;
+--:6:4:-:1  \@P3 LDG.E.CI$dtype y1x3, [track3];
+--:-:-:-:1      R2P PR, pred_bits, 0x0f;  // predicates for the y3 loads
+--:-:-:-:1      SHF.L.U64 pred_bits, pred_bits, 12, pred_bits;  // shifts so far: right 8, left 4, right 8, left 12 - net zero if these are rotates
+20:-:-:-:1      IADD   track00.CC, track00, param_2XNp;  // advance all four tracks by param_2XNp before the y3 loads
+--:-:-:-:1      IADD.X track01,    track01, RZ;
+--:-:-:-:1      IADD   track10.CC, track10, param_2XNp;
+--:-:-:-:1      IADD.X track11,    track11, RZ;
+--:-:-:-:1      IADD   track20.CC, track20, param_2XNp;
+--:-:-:-:1      IADD.X track21,    track21, RZ;
+--:-:-:-:1      IADD   track30.CC, track30, param_2XNp;
+--:-:-:-:1      IADD.X track31,    track31, RZ;
+
+--:-:-:-:1 \@!P0 MOV y3x0, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI$dtype y3x0, [track0];
+--:-:-:-:1 \@!P1 MOV y3x1, RZ;
+--:-:-:-:1  \@P1 LDG.E.CI$dtype y3x1, [track1];
+--:-:-:-:1 \@!P2 MOV y3x2, RZ;
+--:-:-:-:1  \@P2 LDG.E.CI$dtype y3x2, [track2];
+--:-:-:-:1 \@!P3 MOV y3x3, RZ;
+--:6:5:-:1  \@P3 LDG.E.CI$dtype y3x3, [track3];
+    };
++]
+</ORDERED>
+
+// Advance offset/preds
+--:-:-:-:1      IADD n, n, param_loopN;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+
+--:-:-:-:1 @!P4 BFE.U32 n, tid, param_superNI;  // NOTE(review): the non-IX setup seeds n from param_superN - confirm superNI is intended in both modes
+--:-:-:-:1 @!P4 IADD gxs, gxs, param_strideX;  // n ran past N: reset n and step gxs
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, PT;
+
+--:-:-:-:1 @!P5 MOV  gxs, blk_Q;  // gxs ran past GXS: wrap it back to blk_Q and step gys
+--:-:-:-:1 @!P5 IADD gys, gys, param_strideY;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, gys, param_GYS, PT;  // P6 = gys still below GYS; callers read it as bNextY
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, P6;
+--:-:-:-:0      ISETP.LT.AND P4, PT, n,   param_N,  P6;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
+ERROR_SETUP:
+// ERROR_SETUP: one-time prologue that ends by branching to ERROR_LOOP. It calls ERROR_OFFSET and ERROR_LOAD, turns the four loaded values p0q0/p0q1/p1q1/p1q0 into E00..E15, stores them with STS, preloads the j0 fragments with LDS and issues a second round of loads.
+[+
+    our $IX; # set outside this block; when true the n / tidX / tidY / writeS setup below is emitted, otherwise nothing is emitted
+    return $IX ? q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      BFE.U32 n, tid, param_superN;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+
+// tidX = (tid & 127) >> 2
+// tidY = tid & 3
+// writeS = tidY*512 + tidX + (tidY << 3)
+--:-:-:-:1      BFE.U32 tidX, tid, 0x502; // 5 bits at position 2
+--:-:-:-:1      LOP.AND tidY, tid, 3;
+--:-:-:-:1      ISCADD writeS, tidY, tidX, 9;
+--:-:-:-:1      ISCADD writeS, tidY, writeS, 3;
+--:-:-:-:1      SHL    writeS, writeS,  2;
+</SCHEDULE_BLOCK>
+    } : '';
++]
+// blkK starts at blk_K. ERROR_OFFSET is called before ERROR_LOAD and again after it, because ERROR_LOAD advances n/gxs/gys, which ERROR_OFFSET reads.
+--:-:-:-:0      MOV blkK, blk_K;
+
+--:-:-:-:5      CAL ERROR_OFFSET;
+--:-:-:-:5      CAL ERROR_LOAD;
+--:-:-:-:5      CAL ERROR_OFFSET;
+// Transform: e0 = p0q0/2, e1 = p1q1/2; E01/E02 = e0 +/- p0q1/2; E13/E14 = p1q0/2 +/- e1; E05/E06/E09/E10 come from B0, C0, B1, C1. E00, E03, E04, E07, E08, E11, E12, E15 are stored without being written here - presumably register aliases of the inputs/intermediates; confirm in the register mapping.
+<SCHEDULE_BLOCK>
+[+
+    our ($convert_in); # instruction mnemonic applied in place to p0q0, p0q1, p1q1, p1q0 when set; nothing is emitted when it is empty
+    return $convert_in ? qq{
+<ORDERED>
+02:-:2:-:1      $convert_in p0q0, p0q0;
+04:-:3:-:1      $convert_in p0q1, p0q1;
+08:-:4:-:1      $convert_in p1q1, p1q1;
+10:-:5:-:1      $convert_in p1q0, p1q0;
+</ORDERED>
+    } : '';
++]
+// Wait masks 02/04/08/10 on the first use of p0q0/p0q1/p1q1/p1q0 line up with barriers 2..5 set by the matching loads in ERROR_LOAD (control-code field order presumably wait:read:write:yield:stall - confirm against the assembler docs).
+<ORDERED>
+02:-:-:-:1      FMUL e0,  p0q0,  0.5;
+04:-:-:-:1      FFMA E01, p0q1,  0.5,  e0;
+--:-:-:-:1      FFMA E02, p0q1, -0.5,  e0;
+08:-:-:-:1      FMUL e1,  p1q1,  0.5;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*00 + 32>], E00;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*01 + 32>], E01;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*02 + 32>], E02;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*03 + 32>], E03;
+10:-:-:-:1      FFMA E13, p1q0,  0.5,  e1;
+--:-:-:-:1      FFMA E14, p1q0,  0.5, -e1;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*12 + 32>], E12;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*15 + 32>], E15;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*13 + 32>], E13;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*14 + 32>], E14;
+--:-:-:-:1      FFMA B0,  p1q0,  0.5,  e0;
+--:-:-:-:1      FFMA C0,  p1q0, -0.5,  e0;
+--:-:-:-:1      FFMA B1,  p0q1,  0.5,  e1;
+--:-:-:-:1      FFMA C1,  p0q1,  0.5, -e1;
+--:-:-:-:1      FMUL e2,  B0,  0.5;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*04 + 32>], E04;
+--:-:-:-:1      FMUL e3,  C0,  0.5;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*08 + 32>], E08;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*07 + 32>], E07;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*11 + 32>], E11;
+--:-:-:-:1      FFMA E05, B1,  0.5,  e2;
+--:-:-:-:1      FFMA E06, B1, -0.5,  e2;
+--:-:-:-:1      FFMA E09, C1,  0.5,  e3;
+--:-:-:-:1      FFMA E10, C1, -0.5,  e3;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*05 + 32>], E05;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*06 + 32>], E06;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*09 + 32>], E09;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*10 + 32>], E10;
+</ORDERED>
+
+</SCHEDULE_BLOCK>
+
+// init = bNextY ? 1 : 0
+--:-:-:-:0      SEL init, RZ, 1, !P6;
+
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      IADD writeS, writeS, 4x<(512*4 + 32)*2>; // step writeS by the same amount the loops later use for swapBuf
+// Preload the j0 Iy/Ex fragments from readIs/readEs; the last LDS sets barrier 1.
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>];
+
+--:-:-:-:5      CAL ERROR_LOAD;
+
+// init += bNextY ? 1 : 0
+--:-:-:-:0  @P6 IADD init, init, 1;
+--:-:-:-:5      CAL ERROR_OFFSET;
+--:-:-:-:0      BFI pred_bits, init, 0x208, pred_bits; // 2 bits at position 8
+--:-:-:-:5      BRA.U ERROR_LOOP;
+
+ERROR_OFFSET:
+// ERROR_OFFSET: subroutine (ends in RET) that derives x, y and c from gxs/gys/tid/blkK, computes offset, builds the track0..track3 address pairs relative to param_E, and packs the bounds predicates into pred_bits. P4 is read and updated.
+<SCHEDULE_BLOCK>
+// Calc superblock coordinates
+01:-:-:-:1      SHL x, gxs, param_shiftX;
+--:-:-:-:1      SHL y, gys, param_shiftY;
+
+// Calc this thread's sub-block coordinates
+--:-:-:-:1      BFE.U32 super_x, tid, param_superX;
+--:-:-:-:1      BFE.U32 super_y, tid, param_superY;
+--:-:-:-:1      ISCADD x, super_x,  x, 1;
+--:-:-:-:1      ISCADD y, super_y,  y, 1;
+
+// k = blkK*32 + tidX  (have k share register with c)
+--:-:-:-:1      BFE.U32 tid_X, tid, 0x502; // 5 bits at position 2
+--:-:-:-:1      ISCADD c, blkK, tid_X, 5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, c, param_K, P4; // P4 stays set only while k < param_K
+
+// offset0 = k*PQN + y*QN + x*N + n
+// offset1 = offset0 + N
+// offset2 = offset0 + QN
+// offset3 = offset1 + QN
+--:-:-:-:1      XMAD.S16.U16      offset, x, param_N,   n;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, y, param_QN,  offset;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, c, param_PQN, offset;
+// Address pairs: trackN0 is written with .CC and trackN1 with IADD.X (carry in). track1 = track0 + param_Np, track2 = track0 + param_QNp, track3 = track1 + param_QNp.
+20:-:-:-:1      LEA    track00.CC, offset,  param_E[0], [+ dtype_shift() +];
+--:-:-:-:1      IADD.X track01,    RZ,      param_E[1];
+--:-:-:-:1      IADD   track10.CC, track00, param_Np;
+--:-:-:-:1      IADD.X track11,    track01, RZ;
+--:-:-:-:1      IADD   track20.CC, track00, param_QNp;
+--:-:-:-:1      IADD.X track21,    track01, RZ;
+--:-:-:-:1      IADD   track30.CC, track10, param_QNp;
+--:-:-:-:0      IADD.X track31,    track11, RZ;
+// x1 = x + 1. Note that x2 is reused to hold y + 1: it is compared against param_P below.
+--:-:-:-:1      IADD x1, x, 1;
+--:-:-:-:1      IADD x2, y, 1;
+// P0/P1: x and x + 1 are below param_Q; P2/P3: y and y + 1 are below param_P; every test is also gated by P4. The GE tests that follow add the >= 0 bound.
+--:-:-:-:1      ISETP.LT.AND P0, PT, x,  param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y,  param_P, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x2, param_P, P4;
+
+--:-:-:-:1      ISETP.GE.AND P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, y,  RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x2, RZ, P3;
+// mask_x captures the P0..P1 bits (mask 0x03), mask_y the P2..P3 bits (mask 0x0c).
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x03;
+--:-:-:-:1      P2R mask_y, PR, RZ, 0x0c;
+// Low two bits of pred_bits = mask_x when P2 (else zero); the next two bits = mask_x when P3.
+--:-:-:-:1      SEL pred_bits, mask_x, RZ, P2;
+--:-:-:-:1  @P3 BFI pred_bits, mask_x, 0x202, pred_bits; // 2 bits at position 2
+
+// Cache y preds in high bits
+--:-:-:-:0      BFI pred_bits, mask_y, 0x404, pred_bits; // 4 bits at position 4
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
+ERROR_LOAD:
+// ERROR_LOAD: subroutine (ends in RET) that unpacks the low four bits of pred_bits into P0..P3, issues four predicated LDG loads (MOV from RZ zero-fills the destination when the predicate is false), then advances n/gxs/gys and recomputes P4..P6.
+<SCHEDULE_BLOCK>
+<ORDERED>
+--:-:-:-:1      R2P PR, pred_bits, 0x0f;
+--:-:-:-:1 @!P0 MOV p0q0, RZ;
+--:-:2:-:1  @P0 LDG.E.CI[+ dtype() +] p0q0, [track0];
+--:-:-:-:1 @!P1 MOV p0q1, RZ;
+--:-:3:-:1  @P1 LDG.E.CI[+ dtype() +] p0q1, [track1];
+--:-:-:-:1 @!P3 MOV p1q1, RZ;
+--:-:4:-:1  @P3 LDG.E.CI[+ dtype() +] p1q1, [track3];
+--:-:-:-:1 @!P2 MOV p1q0, RZ;
+--:6:5:-:1  @P2 LDG.E.CI[+ dtype() +] p1q0, [track2];
+
+</ORDERED>
+// Load order above is track0, track1, track3, track2; the loads set barriers 2, 3, 4 and 5 in that order.
+// Advance offset/preds
+--:-:-:-:1      IADD n, n, param_loopN;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+// When n has reached param_N: reset n from tid and step gxs by param_strideX.
+--:-:-:-:1 @!P4 BFE.U32 n, tid, param_superN;
+--:-:-:-:1 @!P4 IADD gxs, gxs, param_strideX;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, PT;
+// When gxs has reached param_GXS: reset gxs to blk_Q and step gys by param_strideY.
+--:-:-:-:1 @!P5 MOV  gxs, blk_Q;
+--:-:-:-:1 @!P5 IADD gys, gys, param_strideY;
+// P6 = gys still below param_GYS; P5 and P4 are then recomputed and gated by P6.
+--:-:-:-:1      ISETP.LT.AND P6, PT, gys, param_GYS, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, P6;
+--:-:-:-:0      ISETP.LT.AND P4, PT, n,   param_N,   P6;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
+
+IMAGE_LOOP:
+// IMAGE_LOOP: loop body produced by the Perl generator below: 4 j-iterations of 64 FFMAs each, with auxiliary instructions appended after fixed FFMA slots. The last slot branches back to IMAGE_LOOP while P4 is set.
+[+
+    our ($dtype, $dtype_shift, $dtype_size, $vec_size, $convert_in, $IX); # template variables defined outside this block
+    my %insert = ( # key "j{J}c{C}": text appended right after FFMA number C (0..63) of iteration J (0..3)
+        # $IX true: vector loads/stores of I0..I3 guarded by P0, which is built from gx/gy bounds and P6.
+        $IX ? (
+
+            j0c8  => "--:-:-:-:1      ISETP.LT.AND P0, PT, gx, param_GX, P6;\n",
+            j0c20 => "--:-:-:-:1      ISETP.LT.AND P0, PT, gy, param_GY, P0;\n",
+            # trackI address pair for this pass, only computed when P0 is set.
+            j1c10 => "20:-:-:-:1  \@P0 LEA      trackI0.CC, offset, offsetI0,     1x<$dtype_shift + 9>;\n",
+            j1c15 => "--:-:-:-:1  \@P0 LEA.HI.X trackI1,    offset, offsetI1, RZ, 1x<$dtype_shift + 9>;\n",
+            # Each group below: STS of the current value, reload with LDG when P0, otherwise read from addr_zero with LDS.
+            j1c32 => "02:2:-:-:1      STS.128 [writeS + 4x<00*4>], I0;\n",
+            j1c36 => "02:-:2:-:1  \@P0 LDG.E.CG.$vec_size I0, [trackI + 4x<00 * $dtype_size>];\n",
+            j1c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero];\n",
+
+            j1c56 => "04:3:-:-:1      STS.128 [writeS + 4x<32*4>], I1;\n",
+            j1c60 => "04:-:3:-:1  \@P0 LDG.E.CG.$vec_size I1, [trackI + 4x<32 * $dtype_size>];\n",
+            j1c62 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I1, [addr_zero];\n",
+
+
+            j2c32 => "08:4:-:-:1      STS.128 [writeS + 4x<64*4>], I2;\n",
+            j2c36 => "08:-:4:-:1  \@P0 LDG.E.CG.$vec_size I2, [trackI + 4x<64 * $dtype_size>];\n",
+            j2c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I2, [addr_zero];\n",
+
+
+            j2c56 => "10:5:-:-:1      STS.128 [writeS + 4x<96*4>], I3;\n",
+            j2c60 => "10:6:5:-:1  \@P0 LDG.E.CG.$vec_size I3, [trackI + 4x<96 * $dtype_size>];\n",
+            j2c62 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I3, [addr_zero];\n",
+            # Optional conversion of the loaded halves (.H0/.H1) into I?0..I?3, emitted only when $convert_in is set.
+            $convert_in ? (
+                j1c16 => "02:-:-:-:1      $convert_in I03, I01.H1;\n",
+                j1c20 => "--:-:-:-:1      $convert_in I02, I01.H0;\n",
+                j1c24 => "--:-:-:-:1      $convert_in I01, I00.H1;\n",
+                j1c28 => "--:-:2:-:1      $convert_in I00, I00.H0;\n",
+
+                j1c40 => "04:-:-:-:1      $convert_in I13, I11.H1;\n",
+                j1c44 => "--:-:-:-:1      $convert_in I12, I11.H0;\n",
+                j1c48 => "--:-:-:-:1      $convert_in I11, I10.H1;\n",
+                j1c52 => "--:-:3:-:1      $convert_in I10, I10.H0;\n",
+
+                j2c16 => "08:-:-:-:1      $convert_in I23, I21.H1;\n",
+                j2c20 => "--:-:-:-:1      $convert_in I22, I21.H0;\n",
+                j2c24 => "--:-:-:-:1      $convert_in I21, I20.H1;\n",
+                j2c28 => "--:-:4:-:1      $convert_in I20, I20.H0;\n",
+
+                j2c40 => "10:-:-:-:1      $convert_in I33, I31.H1;\n",
+                j2c44 => "--:-:-:-:1      $convert_in I32, I31.H0;\n",
+                j2c48 => "--:-:-:-:1      $convert_in I31, I30.H1;\n",
+                j2c52 => "--:-:5:-:1      $convert_in I30, I30.H0;\n",
+            ) : (),
+            # End of iteration 2: advance n and offset, barrier, then move readIs/readEs/writeS by swapBuffer and negate swapBuffer.
+            j2c63 => "--:-:-:-:1      IADD n,      n,      param_loopN;\n" .
+                     "--:-:-:-:0      IADD offset, offset, param_loopN;\n".
+                     "--:-:-:-:5      BAR.SYNC 0;\n" .
+                     "--:-:-:-:1      IADD readIs, readIs, -swapBuffer;\n" .
+                     "--:-:-:-:1      IADD readEs, readEs, -swapBuffer;\n" .
+                     "--:-:-:-:1      IADD writeS, writeS,  swapBuffer;\n" .
+                     "--:-:-:-:1      IADD swapBuffer, RZ, -swapBuffer;\n",
+            # Loop control: P4 = (P5 or P6) and n < param_N; tid is re-read when P4 is false; branch back while P4.
+            j3c8  => "--:-:-:-:1      PSETP.OR.AND P4, PT, P5, P6, PT;\n",
+            j3c21 => "--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, P4;\n",
+
+            j3c34 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n",
+
+            j3c63 => "--:-:-:Y:5  \@P4 BRA.U IMAGE_LOOP;\n",
+
+        ) : (
+            # $IX false: scalar loads y0x0..y3x3 are combined with FADD into I00..I15, stored with STS, and reloaded for the next pass.
+            $convert_in ? (
+                j0c37 => "02:-:-:-:1      $convert_in y0x0, y0x0;\n",
+                j0c41 => "--:-:-:-:1      $convert_in y0x1, y0x1;\n",
+                j0c45 => "--:-:-:-:1      $convert_in y0x2, y0x2;\n",
+                j0c49 => "--:-:2:-:1      $convert_in y0x3, y0x3;\n",
+
+                j0c53 => "04:-:-:-:1      $convert_in y2x0, y2x0;\n",
+                j0c57 => "--:-:-:-:1      $convert_in y2x1, y2x1;\n",
+                j0c61 => "--:-:-:-:1      $convert_in y2x2, y2x2;\n",
+                j1c1  => "--:-:3:-:1      $convert_in y2x3, y2x3;\n",
+
+                j1c5  => "08:-:-:-:1      $convert_in y1x0, y1x0;\n",
+                j1c10 => "--:-:-:-:1      $convert_in y1x1, y1x1;\n",
+                j1c14 => "--:-:-:-:1      $convert_in y1x2, y1x2;\n",
+                j1c16 => "--:-:4:-:1      $convert_in y1x3, y1x3;\n",
+
+                j1c21 => "10:-:-:-:1      $convert_in y3x0, y3x0;\n",
+                j1c23 => "--:-:-:-:1      $convert_in y3x1, y3x1;\n",
+                j1c27 => "--:-:-:-:1      $convert_in y3x2, y3x2;\n",
+                j1c29 => "--:-:5:-:1      $convert_in y3x3, y3x3;\n",
+            ) : (),
+            # Transform: Y0 = y0 - y2, Y3 = y3 - y1, Y1 = y1 + y2, Y2 = y2 - y1 (per x column); each I row is X0 - X2, X1 + X2, X2 - X1, X3 - X1 of the matching Y row.
+            j1c22 => "06:-:-:-:1      FADD Y0X0, y0x0, -y2x0;\n" .
+                     "--:-:-:-:1      FADD Y0X1, y0x1, -y2x1;\n",
+
+            j1c24 => "--:-:-:-:1      FADD Y0X2, y0x2, -y2x2;\n" .
+                     "--:-:-:-:1      FADD Y0X3, y0x3, -y2x3;\n",
+
+            j1c28 => "--:-:-:-:1      FADD I00,  Y0X0, -Y0X2;\n" .
+                     "--:-:-:-:1      FADD I03, -Y0X1,  Y0X3;\n",
+            j1c30 => "--:-:-:-:1      FADD I01,  Y0X1,  Y0X2;\n" .
+                     "--:-:-:-:1      FADD I02,  Y0X2, -Y0X1;\n",
+
+            j1c31 => "--:-:-:-:1      STS [writeS + 4x<32*00>], I00;\n",
+            j1c33 => "--:-:-:-:1      STS [writeS + 4x<32*03>], I03;\n",
+            j1c35 => "--:-:-:-:1      STS [writeS + 4x<32*01>], I01;\n",
+            j1c37 => "--:2:-:-:1      STS [writeS + 4x<32*02>], I02;\n",
+
+            j1c39 => "18:-:-:-:1      FADD Y3X0, -y1x0, y3x0;\n" .
+                     "--:-:-:-:1      FADD Y3X1, -y1x1, y3x1;\n" .
+                     "--:-:-:-:1      FADD Y3X2, -y1x2, y3x2;\n" .
+                     "--:-:-:-:1      FADD Y3X3, -y1x3, y3x3;\n",
+
+            j1c43 => "--:-:-:-:1      FADD I12,  Y3X0, -Y3X2;\n" .
+                     "--:-:-:-:1      FADD I15, -Y3X1,  Y3X3;\n" .
+                     "--:-:-:-:1      FADD I13,  Y3X1,  Y3X2;\n" .
+                     "--:-:-:-:1      FADD I14,  Y3X2, -Y3X1;\n",
+
+            j1c44 => "--:-:-:-:1      STS [writeS + 4x<32*12>], I12;\n",
+            j1c46 => "--:-:-:-:1      STS [writeS + 4x<32*15>], I15;\n",
+            j1c48 => "--:-:-:-:1      STS [writeS + 4x<32*13>], I13;\n",
+            j1c50 => "--:-:-:-:1      STS [writeS + 4x<32*14>], I14;\n",
+            # Unpack the next four predicate bits, then shift pred_bits (funnel shift with itself, so it appears to rotate - confirm). When P6, track0..track3 are rebuilt from offset and param_I in param_Np steps.
+            j1c52 => "--:-:-:-:1      R2P PR, pred_bits, 0x0f;\n" .
+                     "--:-:-:-:1      SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;\n",
+
+            j1c53 => "--:-:-:-:1  \@P6 ISET.LT.AND off_sign, offset, RZ, PT;\n" .
+                     "--:-:-:-:1  \@P6 LEA    track00.CC, offset,  param_I[0], $dtype_shift;\n",
+
+            j1c58 => "--:-:-:-:1  \@P6 IADD.X track01,    off_sign, param_I[1];\n" .
+                     "--:-:-:-:1  \@P6 IADD   track10.CC, track00, param_Np;\n",
+
+            j2c18 => "--:-:-:-:1      FADD Y1X0, y1x0,  y2x0;\n" .
+                     "--:-:-:-:1      FADD Y1X1, y1x1,  y2x1;\n" .
+                     "--:-:-:-:1      FADD Y1X2, y1x2,  y2x2;\n" .
+                     "--:-:-:-:1      FADD Y1X3, y1x3,  y2x3;\n" .
+                     "--:-:-:-:1      FADD Y2X0, y2x0, -y1x0;\n" .
+                     "--:-:-:-:1      FADD Y2X1, y2x1, -y1x1;\n" .
+                     "--:-:-:-:1      FADD Y2X2, y2x2, -y1x2;\n" .
+                     "--:-:-:-:1      FADD Y2X3, y2x3, -y1x3;\n" .
+                     "--:-:-:-:1      FADD I04,  Y1X0, -Y1X2;\n" .
+                     "--:-:-:-:1      FADD I05,  Y1X1,  Y1X2;\n" .
+                     "--:-:-:-:1      FADD I06,  Y1X2, -Y1X1;\n" .
+                     "--:-:-:-:1      FADD I07, -Y1X1,  Y1X3;\n",
+
+            j2c19 => "--:-:-:-:1      STS [writeS + 4x<32*04>], I04;\n",
+            j2c21 => "--:-:-:-:1      STS [writeS + 4x<32*05>], I05;\n",
+            j2c23 => "--:-:-:-:1      STS [writeS + 4x<32*06>], I06;\n",
+            j2c25 => "--:-:-:-:1      STS [writeS + 4x<32*07>], I07;\n",
+
+            j2c27 => "--:-:-:-:1  \@P6 IADD.X track11,    track01, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track20.CC, track10, param_Np;\n",
+
+            j2c31 => "--:-:-:-:1      FADD I08,  Y2X0, -Y2X2;\n" .
+                     "--:-:-:-:1      FADD I11, -Y2X1,  Y2X3;\n" .
+                     "--:-:-:-:1      FADD I09,  Y2X1,  Y2X2;\n" .
+                     "--:-:-:-:1      FADD I10,  Y2X2, -Y2X1;\n",
+
+            j2c32 => "--:-:-:-:1      STS [writeS + 4x<32*08>], I08;\n",
+            j2c34 => "--:-:-:-:1      STS [writeS + 4x<32*11>], I11;\n",
+            j2c36 => "--:-:-:-:1      STS [writeS + 4x<32*09>], I09;\n",
+            j2c38 => "--:-:-:-:1      STS [writeS + 4x<32*10>], I10;\n",
+
+            j2c40 => "--:-:-:-:1  \@P6 IADD.X track21,    track11, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track30.CC, track20, param_Np;\n",
+            # Bit 0x4000 of pred_bits picks the sign of swapBuf (see slot j2c62) and is toggled on every pass.
+            j2c44 => "--:-:-:-:1      LOP.AND.NZ P4, RZ, pred_bits, 0x4000;\n" .
+                     "--:-:-:-:1      LOP.XOR pred_bits, pred_bits, 0x4000;\n",
+
+            j2c46 => "--:-:-:-:1  \@P6 IADD.X track31,    track21, RZ;\n" .
+                     "--:-:-:-:1      IADD n, n, param_loopN;\n" .
+                     "--:-:-:-:1      IADD offset, offset, param_loopN;\n",
+
+            j2c62 => "--:-:-:-:1  \@P4 MOV swapBuf,  4x<(512*4 + 32)*2>;\n" .
+                     "--:-:-:-:1 \@!P4 MOV swapBuf, -4x<(512*4 + 32)*2>;\n",
+            # After the barrier: move readIs/readEs/writeS by swapBuf and start the next predicated loads; I2I from RZ writes zero when the predicate is false.
+            j2c63 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                     "--:-:-:-:0      IADD readIs, readIs, -swapBuf;\n" .
+                     "--:-:-:-:1 \@!P0 I2I.U32.U32 y0x0, RZ;\n" .
+                     "--:-:-:-:0      IADD readEs, readEs, -swapBuf;\n" .
+                     "--:-:-:-:1  \@P0 LDG.E.CI$dtype y0x0, [track0];\n" .
+                     "--:-:-:-:0      IADD writeS, writeS,  swapBuf;\n" .
+                     "--:-:-:-:1 \@!P1 I2I.U32.U32 y0x1, RZ;\n" .
+                     "--:-:-:-:1  \@P1 LDG.E.CI$dtype y0x1, [track1];\n",
+
+            j3c0  => "--:-:-:-:1 \@!P2 I2I.U32.U32 y0x2, RZ;\n",
+            j3c1  => "--:-:-:-:1  \@P2 LDG.E.CI$dtype y0x2, [track2];\n",
+            j3c2  => "--:-:-:-:1 \@!P3 I2I.U32.U32 y0x3, RZ;\n",
+            j3c3  => "--:6:2:-:1  \@P3 LDG.E.CI$dtype y0x3, [track3];\n" .
+                     "--:-:-:Y:8      R2P PR, pred_bits, 0x0f;\n" .
+                     "20:-:-:-:1  \@P6 IADD   track00.CC, track00, param_2XNp;\n" .
+                     "--:-:-:-:1      SHF.L.U64 pred_bits, pred_bits, 4, pred_bits;\n",
+
+            j3c7  => "--:-:-:-:1 \@!P0 I2I.U32.U32 y2x0, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track01,    track01, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track10.CC, track10, param_2XNp;\n",
+
+            j3c9  => "--:-:-:-:1      PSETP.OR.AND P4, PT, P5, P6, PT;\n",
+
+            j3c11 => "--:-:-:-:1  \@P0 LDG.E.CI$dtype y2x0, [track0];\n" .
+                     "--:-:-:-:0  \@P6 IADD.X track11,    track11, RZ;\n" .
+                     "--:-:-:-:1 \@!P1 I2I.U32.U32 y2x1, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track20.CC, track20, param_2XNp;\n",
+
+            j3c12 => "--:-:-:-:1  \@P1 LDG.E.CI$dtype y2x1, [track1];\n",
+
+            j3c16 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y2x2, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track21,    track21, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track30.CC, track30, param_2XNp;\n",
+
+            j3c17 => "--:-:-:-:1  \@P2 LDG.E.CI$dtype y2x2, [track2];\n",
+
+
+            j3c21 => "--:-:-:-:1 \@!P3 I2I.U32.U32 y2x3, RZ;\n" .
+                     "--:-:-:-:2  \@P6 IADD.X track31,    track31, RZ;\n",
+
+            j3c22 => "--:6:3:-:1  \@P3 LDG.E.CI$dtype y2x3, [track3];\n" .
+                     "--:-:-:Y:8      R2P PR, pred_bits, 0x0f;\n" .
+                     "20:-:-:-:1  \@P6 IADD   track00.CC, track00, -param_XNp;\n" .
+                     "--:-:-:-:1      SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;\n",
+
+            j3c23 => "--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, P4;\n",
+
+
+            j3c25 => "--:-:-:-:1 \@!P0 I2I.U32.U32 y1x0, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track01,    track01, -RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track10.CC, track10, -param_XNp;\n",
+
+            j3c26 => "--:-:-:-:1  \@P0 LDG.E.CI$dtype y1x0, [track0];\n",
+
+            j3c30 => "--:-:-:-:1 \@!P1 I2I.U32.U32 y1x1, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track11,    track11, -RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track20.CC, track20, -param_XNp;\n",
+
+            j3c31 => "--:-:-:-:1  \@P1 LDG.E.CI$dtype y1x1, [track1];\n",
+
+            j3c33 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n",
+
+            j3c35 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y1x2, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track21,    track21, -RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track30.CC, track30, -param_XNp;\n",
+
+            j3c36 => "--:-:-:-:1  \@P2 LDG.E.CI$dtype y1x2, [track2];\n",
+
+            j3c40 => "--:-:-:-:1 \@!P3 I2I.U32.U32 y1x3, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track31,    track31, -RZ;\n",
+
+            j3c42 => "--:6:4:-:1  \@P3 LDG.E.CI$dtype y1x3, [track3];\n" .
+                     "--:-:-:Y:8      R2P PR, pred_bits, 0x0f;\n" .
+                     "20:-:-:-:1  \@P6 IADD   track00.CC, track00, param_2XNp;\n" .
+                     "--:-:-:-:1      SHF.L.U64 pred_bits, pred_bits, 12, pred_bits;\n",
+
+            j3c46 => "--:-:-:-:1 \@!P0 I2I.U32.U32 y3x0, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track01,    track01, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track10.CC, track10, param_2XNp;\n",
+
+            j3c47 => "--:-:-:-:1  \@P0 LDG.E.CI$dtype y3x0, [track0];\n",
+
+            j3c51 => "--:-:-:-:1 \@!P1 I2I.U32.U32 y3x1, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track11,    track11, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track20.CC, track20, param_2XNp;\n",
+
+            j3c52 => "--:-:-:-:1  \@P1 LDG.E.CI$dtype y3x1, [track1];\n",
+
+            j3c56 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y3x2, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track21,    track21, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track30.CC, track30, param_2XNp;\n",
+
+            j3c57 => "--:-:-:-:1  \@P2 LDG.E.CI$dtype y3x2, [track2];\n",
+
+            j3c60 => "--:-:-:-:2 \@!P3 I2I.U32.U32 y3x3, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track31,    track31, RZ;\n",
+
+            j3c62 => "--:6:5:-:1  \@P3 LDG.E.CI$dtype y3x3, [track3];\n",
+
+            j3c63 => "--:-:-:Y:5  \@P4 BRA.U IMAGE_LOOP;\n",
+        )
+    );
+    my @cOrder; # FFMA visiting order as [x, y] pairs; 4 x bases * 4 y bases * 4 swirl offsets = 64 entries
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]); # four [dx, dy] offsets applied to every (x, y) base
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y; # walk the y bases in the opposite direction for the next x base
+    }
+    my $out;
+    foreach my $j (0 .. 3)
+    {
+        my $odd      = $j & 1; # register set (j0 or j1) read by this iteration's FFMAs
+        my $nOdd     = !$odd + 0; # the other register set, filled by this iteration's LDS
+        my $rsOffset = ($j + 1) % 4; # row multiplier used in the LDS addresses (wraps to 0 on the last iteration)
+        my $bankOffset = $IX ? 0 : 8; # extra per-row offset for the readIs loads; the readEs loads always use 8
+        # Non-IX iteration 3 moves the four LDS to slots 4,6,8,10; there slots 0..3 already carry the y0x2/y0x3 load lines.
+        my ($c0, $c2, $c4, $c6) = $j == 3 && !$IX ? (4,6,8,10) : (0,2,4,6);
+
+        $insert{"j${j}c$c0"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy0, [readIs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, $bankOffset;
+        $insert{"j${j}c$c2"} = sprintf "--:-:-:-:1      LDS.U.128 j%dEx0, [readEs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, 8;
+        $insert{"j${j}c$c4"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy4, [readIs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, $bankOffset;
+        $insert{"j${j}c$c6"} = sprintf "--:-:1:-:1      LDS.U.128 j%dEx4, [readEs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, 8;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Stall is 0 when the FIRST line of the inserted text contains one of these opcodes, otherwise 1 (also 1 when nothing is inserted).
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1;
+            # Yield flag on every third FFMA of iterations 0..2, only when the stall is 1.
+            my $yield  = $j < 3 && $stall && ($c % 3 == 0) ? 'Y' : '-';
+            # The first FFMA of each iteration waits with mask 01 (03 for non-IX iteration 2); all others do not wait.
+            my $wait   = $c == 0 ? $j == 2 && !$IX ? '03' : '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+            # Emit the FFMA for accumulator cx{x}y{y}, followed by whatever is registered for this slot.
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+[+
+    our $IX; # set outside this block; selects which of the two x/y-advance and loop-exit sequences below is emitted (both branch back to IMAGE_LOOP or on to END_LOOP)
+    return $IX ? q{
+// Advance x offset/preds
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD gxs,    gxs,    param_strideX;
+--:-:-:-:1      IADD offset, offset, param_loopXI;
+
+01:-:-:-:1      BFE.U32 super_x, tid, param_superXI;
+--:-:-:-:1      SHL gx, gxs, param_shiftXI;
+
+--:-:-:-:1      BFE.U32 n, tid, param_superNI;
+</SCHEDULE_BLOCK>
+--:-:-:Y:d      ISETP.LT.AND P5, PT, gxs, param_GXS, P6;
+--:-:-:-:0      IADD gx, gx, super_x;
+--:-:-:Y:5  @P5 BRA.U IMAGE_LOOP;
+
+// Advance y offset/preds
+--:-:-:-:1      IADD gys, gys, param_strideY;
+--:-:-:-:0      ISETP.LT.AND P4, PT, n, param_N, P6;
+--:-:-:-:1      LDS gxs, [addr_blk_Q];
+--:-:-:-:1      BFE.U32 super_x, tid, param_superXI;
+--:-:-:-:1      PSETP.AND.AND P5, PT, PT, PT, PT;
+--:-:-:-:0      BFE.U32 super_y, tid, param_superYI;
+--:-:1:-:2      LDS blkC, [addr_blk_C];
+--:-:-:-:1      ISETP.LT.AND P6, PT, gys, param_GYS, PT;
+<SCHEDULE_BLOCK>
+01:-:-:-:1      SHL gx, gxs, param_shiftXI;
+--:-:-:-:1      SHL gy, gys, param_shiftYI;
+--:-:-:-:1      IADD gx, gx, super_x;
+--:-:-:-:1      IADD gy, gy, super_y;
+--:-:-:-:1      XMAD.U16.U16      offset, gx,   param_N,   n;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, gy,   param_XN,  offset;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, blkC, param_YXN, offset;
+</SCHEDULE_BLOCK>
+--:-:-:Y:5  @P6 BRA.U IMAGE_LOOP;
+
+// Set n to loop remaining times
+--:-:-:-:1      LOP.AND.NZ P5, init, pred_bits,  3;
+--:-:-:-:1      MOV nloop, param_loopN;
+--:-:-:-:1      MOV N,     param_N;
+--:-:-:Y:a      LOP.AND   pred_bits, pred_bits, ~3;
+--:-:-:-:0      VMAD.U16.U16 n, -init, nloop, N;
+--:-:-:Y:5  @P5 BRA.U IMAGE_LOOP;
+--:-:-:Y:5      BRA.U END_LOOP;
+    } : q{
+// Advance x offset/preds
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD gxs, gxs, param_strideX;
+--:-:-:-:1      IADD offset, offset, param_loopX;
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, P6;
+--:-:-:-:1      SHL x, gxs, param_shiftX;
+01:-:-:-:1      BFE.U32 super_x, tid, param_superX;
+--:-:-:-:1      BFE.U32 n, tid, param_superN;
+--:-:-:-:1      ISCADD x, super_x,  x, 1;
+--:-:-:-:1      IADD x, x, -param_pad_x;
+--:-:-:-:1      IADD x1, x, 1;
+--:-:-:-:1      IADD x2, x, 2;
+--:-:-:-:1      IADD x3, x, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, x,  param_X, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_X, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_X, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_X, P6;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x0f;
+// Extract y + init + buffer bits
+--:-:-:-:1      BFE.U32 mask_y, pred_bits, 0x710;
+--:-:-:-:1      R2P PR, mask_y, 0x0f;
+--:-:-:-:1      SEL pred_bits, mask_x, RZ, P0;
+--:-:-:-:1  @P1 BFI pred_bits, mask_x, 0x404, pred_bits;
+--:-:-:-:1  @P2 BFI pred_bits, mask_x, 0x408, pred_bits;
+--:-:-:-:1  @P3 BFI pred_bits, mask_x, 0x40c, pred_bits;
+--:-:-:-:0      BFI pred_bits, mask_y, 0x710, pred_bits;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:5  @P5 BRA.U IMAGE_LOOP;
+
+// Advance y offset/preds
+--:-:-:-:1      IADD gys, gys, param_strideY;
+--:-:-:-:0      ISETP.LT.AND P4, PT, n,  param_N, P6;
+--:-:-:-:1      LDS gxs, [addr_blk_Q];
+--:-:-:-:0      BFE.U32 init, pred_bits, 0x314;
+--:-:1:-:1      LDS blkC, [addr_blk_C];
+--:-:-:-:3      PSETP.AND.AND P5, PT, PT, PT, PT;
+--:-:-:-:0      ISETP.LT.AND P6, PT, gys, param_GYS, PT;
+--:-:-:-:5      CAL IMAGE_OFFSET;
+--:-:-:-:0      BFI pred_bits, init, 0x314, pred_bits;
+--:-:-:Y:5  @P6 BRA.U IMAGE_LOOP;
+
+
+// Set n to loop remaining times
+--:-:-:-:1      SHR.U32 pred_bits, init, 2;
+--:-:-:-:1      MOV nloop, param_loopN;
+--:-:-:-:1      MOV N, param_N;
+--:-:-:Y:c      LOP.AND.NZ P5, init, init, 3;
+--:-:-:-:1      SHL pred_bits, pred_bits, 22;
+--:-:-:-:0      VMAD.U16.U16 n, -init, nloop, N;
+--:-:-:Y:5  @P5 BRA.U IMAGE_LOOP;
+--:-:-:Y:5      BRA.U END_LOOP;
+
+    };
++]
+
+
+ERROR_LOOP:
+
+[+
+    our ($dtype, $convert_in, $dtype_shift, $IX);
+    my %insert = (
+
+        $convert_in ? (
+            j1c13 => "02:-:2:-:1      $convert_in p0q0, p0q0;\n",
+            j1c17 => "04:-:3:-:1      $convert_in p0q1, p0q1;\n",
+            j1c21 => "08:-:4:-:1      $convert_in p1q1, p1q1;\n",
+            j1c25 => "10:-:5:-:1      $convert_in p1q0, p1q0;\n",
+        ) : (),
+
+        j1c23 => "02:-:-:-:1      FMUL e0,  p0q0, 0.5;\n",
+
+        j1c28 => "04:-:-:-:1      FFMA E01, p0q1,  0.5,  e0;\n" .
+                 "--:-:-:-:1      FFMA E02, p0q1, -0.5,  e0;\n",
+
+        j1c29 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*00 + 32>], E00;\n",
+        j1c31 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*01 + 32>], E01;\n",
+        j1c33 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*02 + 32>], E02;\n",
+        j1c35 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*03 + 32>], E03;\n",
+
+        j1c37 => "08:-:-:-:1      FMUL e1,  p1q1,  0.5;\n",
+
+        j1c42 => "10:-:-:-:1      FFMA E13, p1q0,  0.5,  e1;\n" .
+                 "--:-:-:-:1      FFMA E14, p1q0,  0.5, -e1;\n",
+
+        j1c43 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*12 + 32>], E12;\n",
+        j1c45 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*15 + 32>], E15;\n",
+        j1c47 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*13 + 32>], E13;\n",
+        j1c49 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*14 + 32>], E14;\n",
+
+        j1c51 => "--:-:-:-:1      FFMA B0,  p1q0,  0.5,  e0;\n" .
+                 "--:-:-:-:1      FFMA C0,  p1q0, -0.5,  e0;\n" .
+                 "--:-:-:-:1      FFMA B1,  p0q1,  0.5,  e1;\n" .
+                 "--:-:-:-:1      FFMA C1,  p0q1,  0.5, -e1;\n",
+
+        j2c9  => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*04 + 32>], E04;\n",
+        j2c11 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*08 + 32>], E08;\n",
+        j2c13 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*07 + 32>], E07;\n",
+        j2c15 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*11 + 32>], E11;\n",
+
+        j2c17 => "--:-:-:-:1      FMUL e2,  B0,  0.5;\n" .
+                 "--:-:-:-:1      FMUL e3,  C0,  0.5;\n",
+
+        j2c21 => "--:-:-:-:1      FFMA E05, B1,  0.5,  e2;\n" .
+                 "--:-:-:-:1      FFMA E06, B1, -0.5,  e2;\n" .
+                 "--:-:-:-:1      FFMA E09, C1,  0.5,  e3;\n" .
+                 "--:-:-:-:1      FFMA E10, C1, -0.5,  e3;\n",
+
+        j2c23 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*05 + 32>], E05;\n",
+        j2c25 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*06 + 32>], E06;\n",
+        j2c27 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*09 + 32>], E09;\n",
+        j2c29 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*10 + 32>], E10;\n",
+
+        j2c32 => "--:-:-:-:1      R2P PR, pred_bits, 0x0f;\n" .
+                 "--:-:-:-:1  \@P6 LEA    track00.CC, offset,  param_E[0], $dtype_shift;\n",
+
+        j2c37 => "--:-:-:-:1  \@P6 IADD.X track01,    RZ,      param_E[1];\n" .
+                 "--:-:-:-:1  \@P6 IADD   track10.CC, track00, param_Np;\n",
+
+        j2c42 => "--:-:-:-:1  \@P6 IADD.X track11,    track01, RZ;\n" .
+                 "--:-:-:-:1  \@P6 IADD   track20.CC, track00, param_QNp;\n",
+
+        j2c44 => "--:-:-:-:1      LOP.AND.NZ P4, RZ, pred_bits, 0x400;\n" .
+                 "--:-:-:-:1      LOP.XOR pred_bits, pred_bits, 0x400;\n",
+
+        j2c47 => "--:-:-:-:1  \@P6 IADD.X track21,    track01, RZ;\n" .
+                 "--:-:-:-:1  \@P6 IADD   track30.CC, track10, param_QNp;\n",
+
+        j2c52 => "--:-:-:-:1  \@P6 IADD.X track31,    track11, RZ;\n",
+
+        j2c61 => "--:-:-:-:1  \@P4 MOV swapBuf,  4x<(512*4 + 32)*2>;\n" .
+                 "--:-:-:-:1 \@!P4 MOV swapBuf, -4x<(512*4 + 32)*2>;\n",
+
+        j2c62 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD readEs, readEs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD writeS, writeS,  swapBuf;\n",
+
+        j3c8  => "--:-:2:-:1  \@P0 LDG.E.CI$dtype p0q0, [track0];\n",
+        j3c10 => "--:-:3:-:1  \@P1 LDG.E.CI$dtype p0q1, [track1];\n",
+        j3c12 => "--:-:4:-:1  \@P3 LDG.E.CI$dtype p1q1, [track3];\n",
+        j3c14 => "--:6:5:-:1  \@P2 LDG.E.CI$dtype p1q0, [track2];\n",
+
+        j3c15 => "--:-:-:-:1      PSETP.OR.AND P4, PT, P5, P6, PT;\n" .
+                 "--:-:-:-:1      IADD n, n, param_loopN;\n" .
+                 "--:-:-:-:1      IADD offset, offset, param_loopN;\n",
+
+        j3c16 => "--:-:-:-:1 \@!P0 I2I.U32.U32 p0q0, RZ;\n",
+        j3c20 => "--:-:-:-:1 \@!P1 I2I.U32.U32 p0q1, RZ;\n",
+        j3c24 => "--:-:-:-:1 \@!P2 I2I.U32.U32 p1q0, RZ;\n",
+        j3c28 => "--:-:-:-:1 \@!P3 I2I.U32.U32 p1q1, RZ;\n",
+
+        j3c25 => "--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, P4;\n",
+
+
+        j3c38 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n",
+
+
+        j3c63 => "--:-:-:Y:5  \@P4 BRA.U ERROR_LOOP;\n",
+    );
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out;
+    foreach my $j (0 .. 3)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 4;
+        my $bankOffset = $IX ? 0 : 8;
+
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy0, [readIs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, $bankOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1      LDS.U.128 j%dEx0, [readEs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, 8;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy4, [readIs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, $bankOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1      LDS.U.128 j%dEx4, [readEs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, 8;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1;
+
+            my $yield  = $j < 3 && $stall && ($c % 3 == 0) ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+// Advance x offset/preds
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD gxs, gxs, param_strideX;
+--:-:-:-:1      IADD offset, offset, param_loopX;
+// Extract y + init + buffer bits
+--:-:-:-:1      BFE.U32 mask_y, pred_bits, 0x704;
+--:-:-:-:1      R2P PR, mask_y, 0x0c;
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, P6;
+--:-:-:-:1      SHL x, gxs, param_shiftX;
+01:-:-:-:1      BFE.U32 super_x, tid, param_superX;
+--:-:-:-:1      ISCADD x, super_x,  x, 1;
+--:-:-:-:1      BFE.U32 n, tid, param_superN;
+--:-:-:-:1      IADD x1, x, 1;
+--:-:-:-:1      ISETP.LT.AND P0, PT, x,  param_Q, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_Q, P6;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x03;
+--:-:-:-:1      SEL pred_bits, mask_x, RZ, P2;
+--:-:-:-:1  @P3 BFI pred_bits, mask_x, 0x202, pred_bits;
+--:-:-:-:0      BFI pred_bits, mask_y, 0x704, pred_bits;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:5  @P5 BRA.U ERROR_LOOP;
+
+// Advance y offset/preds
+--:-:-:-:1      IADD gys, gys, param_strideY;
+--:-:-:-:0      ISETP.LT.AND P4, PT, n,  param_N, P6;
+--:-:-:-:1      LDS gxs, [addr_blk_Q];
+--:-:-:-:0      BFE.U32 init, pred_bits, 0x308;
+--:-:1:-:1      LDS blkK, [addr_blk_K];
+--:-:-:-:2      PSETP.AND.AND P5, PT, PT, PT, PT;
+--:-:-:-:0      ISETP.LT.AND P6, PT, gys, param_GYS, PT;
+--:-:-:-:5      CAL ERROR_OFFSET;
+--:-:-:-:0      BFI pred_bits, init, 0x308, pred_bits;
+--:-:-:Y:5  @P6 BRA.U ERROR_LOOP;
+
+// Set n to loop remaining times
+--:-:-:-:1      SHR.U32 pred_bits, init, 2;
+--:-:-:-:1      MOV nloop, param_loopN;
+--:-:-:-:1      MOV N, param_N;
+--:-:-:Y:c      LOP.AND.NZ P5, init, init, 3;
+--:-:-:-:1      SHL pred_bits, pred_bits, 10;
+--:-:-:-:0      VMAD.U16.U16 n, -init, nloop, N;
+--:-:-:Y:5  @P5 BRA.U ERROR_LOOP;
+
+END_LOOP:
+// Epilogue: rebuild per-thread addresses, then scale and write out the cx accumulators.
+// K_blk, C_blk, P_blk, Q_blk (presumably the four words of blkKCPQ; confirm in the register map)
+--:-:1:-:1      LDS.U.128 blkKCPQ, [addr_blk_K];
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha, param_alpha;
+
+// Strip double buffering offsets, and the batch dimension on readIs
+// This gives us the shared memory write mapping for the thread's registers:
+// readEs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readIs = ((tid &  8) >> 2)  | (tid & 1)
+--:-:-:-:1      LOP.AND  tid_16,  tid,   -16;
+--:-:-:-:1      SHR.U32  tid_16,  tid_16,  1;
+
+--:-:-:-:1      LOP.AND  tid_1,  tid,    1;
+--:-:-:-:1      LOP.AND  readIs, tid,    8;
+--:-:-:-:1      SHR.U32  readIs, readIs, 2;
+--:-:-:-:1      LOP.OR   readIs, readIs, tid_1;
+--:-:-:-:1      SHL      readIs, readIs, 4; // readIs *= 16
+
+--:-:-:-:1      BFE.U32  readEs, tid,    0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readEs, readEs, tid_16;
+--:-:-:-:1      SHL      readEs, readEs, 4; // readEs *= 16
+
+// writeCs = readIs * 512 + readEs  (both operands already scaled by 16 above)
+--:-:-:-:1      ISCADD  writeCs, readIs, readEs, 9;
+
+// readCs = ((tid / 32) * 512 + (tid & 31)) * 4
+--:-:-:-:1      LOP.AND tid_31, tid, 31;
+--:-:-:-:1      SHR.U32 tid_32, tid,  5;
+--:-:-:-:1      ISCADD  readCs, tid_32, tid_31, 9;
+--:-:-:-:1      SHL     readCs, readCs, 2;
+
+// kk = K_blk*32 + (tid & 31)
+01:-:-:-:1      ISCADD  kk, K_blk, tid_31, 5;
+
+// cc = C_blk*32 + (tid / 32)
+--:-:-:-:1      ISCADD  cc, C_blk, tid_32, 5;
+
+// F00 = c*RSK + r*SK + s*K + k; trackF starts as cc*RSK + kk
+--:-:-:-:1      XMAD.LO2C trackF, cc, param_RSK, kk;
+
+[+
+    our $determ; # if set, emit three extra instructions that add PQ_blk*CRSK to trackF (PQ_blk = P_blk*param_strideX + Q_blk)
+    if ($determ)
+    {
+        return q{
+--:-:-:-:1      MOV CRSK, param_CRSK;
+01:-:-:-:1      XMAD PQ_blk, P_blk,  param_strideX, Q_blk;
+--:-:-:-:1      XMAD.LO trackF, PQ_blk, CRSK, trackF, xmad_determ;
+        };
+    }
+    return '';
++]
+
+--:-:-:-:1      LEA      F00_0.CC, trackF, param_F[0],     2; // F00 = param_F + trackF*4, low word
+--:-:-:-:1      LEA.HI.X F00_1,    trackF, param_F[1], RZ, 2; // high word, takes the carry
+
+--:-:-:-:1      MOV K1, param_K;
+--:-:-:-:1      SHL K1, K1, 2; // K1 = K*4
+
+--:-:-:-:1      MOV SK1, param_SK;
+--:-:-:-:1      SHL SK1, SK1, 2; // SK1 = SK*4
+
+--:-:-:-:1      MOV RSK8, param_RSK;
+--:-:-:-:1      SHL RSK8, RSK8, 5; // RSK8 = RSK*32
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, kk, param_K, PT; // P0 = kk < K
+</SCHEDULE_BLOCK>
+
+--:-:-:-:6      IADD   F01_0.CC, F00_0, K1; // second index steps by K1: F01 = F00 + K1 (low word, carry out)
+--:-:-:-:1      IADD.X F01_1,    F00_1, RZ; // high word, carry in
+--:-:-:-:6      IADD   F02_0.CC, F01_0, K1;
+--:-:-:-:1      IADD.X F02_1,    F01_1, RZ;
+// First index steps by SK1: F1s = F0s + SK1, then F2s = F1s + SK1 (s = 0..2).
+--:-:-:-:6      IADD   F10_0.CC, F00_0, SK1;
+--:-:-:-:1      IADD.X F10_1,    F00_1, RZ;
+--:-:-:-:6      IADD   F11_0.CC, F01_0, SK1;
+--:-:-:-:1      IADD.X F11_1,    F01_1, RZ;
+--:-:-:-:6      IADD   F12_0.CC, F02_0, SK1;
+--:-:-:-:1      IADD.X F12_1,    F02_1, RZ;
+
+--:-:-:-:6      IADD   F20_0.CC, F10_0, SK1;
+--:-:-:-:1      IADD.X F20_1,    F10_1, RZ;
+--:-:-:-:6      IADD   F21_0.CC, F11_0, SK1;
+--:-:-:-:1      IADD.X F21_1,    F11_1, RZ;
+--:-:-:-:6      IADD   F22_0.CC, F12_0, SK1;
+--:-:-:-:1      IADD.X F22_1,    F12_1, RZ;
+
+
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha; // rows y0..y3: scale each accumulator by alpha, then STS.128 four x values at a time
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0; // each store is interleaved with the FMULs of the following row
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:0      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:0      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*512 + 00>], shuffle_x0y2;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*512 + 16>], shuffle_x4y2;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*512 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeCs+4x<3*512 + 16>], shuffle_x4y3;
+--:-:-:-:5      BAR.SYNC 0; // barrier between the STS writes above and the LDS reads in OUTPUT_TRANSFORM
+
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+
+--:-:-:-:0      LOP.XOR readCs, readCs, 4x<8*512>; // flip one bit of readCs (4*8*512) so the second call reads the other region
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha; // rows y4..y7: same scale-and-store sequence, reusing the writeCs offsets of rows y0..y3
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:0      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:5      BAR.SYNC 0; // sits ahead of the first STS of this half; presumably lets the previous OUTPUT_TRANSFORM reads finish before the buffer is overwritten (confirm)
+--:-:-:-:0      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y4;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:0      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y4;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y5;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y5;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*512 + 00>], shuffle_x0y6;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*512 + 16>], shuffle_x4y6;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*512 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeCs+4x<3*512 + 16>], shuffle_x4y7;
+--:-:-:-:5      BAR.SYNC 0; // barrier between the STS writes above and the LDS reads in OUTPUT_TRANSFORM
+
+--:-:-:-:0      LOP.XOR readCs, readCs, 4x<8*512>; // readCs is toggled before each of the two calls in this half
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+
+--:-:-:-:0      LOP.XOR readCs, readCs, 4x<8*512>;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+
+--:-:-:-:5      EXIT;
+
+OUTPUT_TRANSFORM:
+// Entered with CAL, left with RET. Reads the 4x4 tile mRC (R, C = 0..3) from readCs + 4*(4*R + C)*32, folds it into the 3x3 tile fRC and writes fRC through the FRC pointers when P1 holds.
+--:-:-:-:0      ISETP.LT.AND P1, PT, cc, param_C, P0; // cc < C && kk < K
+--:-:-:-:1      LDS m00, [readCs + 4x< 0*32>];
+--:-:-:-:1      LDS m10, [readCs + 4x< 4*32>];
+--:-:-:-:1      LDS m01, [readCs + 4x< 1*32>];
+--:-:1:-:1      LDS m11, [readCs + 4x< 5*32>];
+
+--:-:-:-:0      IADD cc, cc, 8; // cc for the next call; P1 above was computed from the old value
+--:-:-:-:1      LDS m21, [readCs + 4x< 9*32>];
+--:-:-:-:1      LDS m02, [readCs + 4x< 2*32>];
+--:-:-:-:1      LDS m12, [readCs + 4x< 6*32>];
+--:-:2:-:1      LDS m22, [readCs + 4x<10*32>];
+
+--:-:-:-:1      LDS m31, [readCs + 4x<13*32>];
+--:-:-:-:1      LDS m20, [readCs + 4x< 8*32>];
+--:-:-:-:1      LDS m32, [readCs + 4x<14*32>];
+--:-:3:-:1      LDS m03, [readCs + 4x< 3*32>];
+
+--:-:-:-:1      LDS m13, [readCs + 4x< 7*32>];
+--:-:-:-:1      LDS m23, [readCs + 4x<11*32>];
+--:-:-:-:1      LDS m30, [readCs + 4x<12*32>];
+--:-:4:-:1      LDS m33, [readCs + 4x<15*32>];
+// t0C = m0C+m1C+m2C, t1C = m1C-m2C, t2C = m1C+m2C+m3C; then fR0 = tR0+tR1+tR2, fR1 = tR1-tR2, fR2 = tR1+tR2+tR3 (instructions are interleaved, not in this order)
+01:-:-:-:1      FADD t00, m00, m10;
+--:-:-:-:1      FADD t01, m01, m11;
+02:-:-:-:1      FADD t21, m11, m21;
+--:-:-:-:1      FADD t02, m02, m12;
+--:-:-:-:1      FADD t11, m11, -m21;
+--:-:-:-:1      FADD t22, m12, m22;
+--:-:-:-:1      FADD t12, m12, -m22;
+--:-:-:-:1      FADD t01, t01, m21;
+04:-:-:-:1      FADD t21, t21, m31;
+--:-:-:-:1      FADD t02, t02, m22;
+--:-:-:-:1      FADD t20, m10, m20;
+--:-:-:-:1      FADD t22, t22, m32;
+--:-:-:-:1      FADD t00, t00, m20;
+08:-:-:-:1      FADD t03, m03, m13;
+--:-:-:-:1      FADD t10, m10, -m20;
+--:-:-:-:1      FADD t23, m13, m23;
+--:-:-:-:1      FADD t20, t20, m30;
+--:-:-:-:1      FADD t13, m13, -m23;
+--:-:-:-:1      FADD f00, t00, t01;
+--:-:-:-:1      FADD t03, t03, m23;
+--:-:-:-:1      FADD f02, t01, t02;
+--:-:-:-:1      FADD t23, t23, m33;
+--:-:-:-:1      FADD f10, t10, t11;
+--:-:-:-:1      FADD f12, t11, t12;
+--:-:-:-:1      FADD f20, t20, t21;
+--:-:-:-:1      FADD f22, t21, t22;
+--:-:-:-:1      FADD f00, f00, t02;
+--:-:-:-:1      FADD f01, t01, -t02;
+--:-:-:-:0      FADD f02, f02, t03;
+--:-:-:-:1  @P1 [+ output_op() +] [F00_0], f00;
+--:-:-:-:0      FADD f10, f10, t12;
+--:-:-:-:1  @P1 [+ output_op() +] [F01_0], f01;
+--:-:-:-:0      FADD f11, t11, -t12;
+--:1:-:-:1  @P1 [+ output_op() +] [F02_0], f02;
+--:-:-:-:0      FADD f12, f12, t13;
+--:-:-:-:1  @P1 [+ output_op() +] [F10_0], f10;
+--:-:-:-:0      FADD f20, f20, t22;
+--:-:-:-:1  @P1 [+ output_op() +] [F11_0], f11;
+--:-:-:-:0      FADD f21, t21, -t22;
+--:2:-:-:1  @P1 [+ output_op() +] [F12_0], f12;
+--:-:-:-:0      FADD f22, f22, t23;
+--:-:-:-:1  @P1 [+ output_op() +] [F20_0], f20;
+--:-:-:-:1  @P1 [+ output_op() +] [F21_0], f21;
+--:3:-:-:1  @P1 [+ output_op() +] [F22_0], f22;
+// Advance all nine F pointers by RSK8 (64-bit adds). Wait masks 01/02/04 pair with barriers 1/2/3 set on the F02, F12 and F22 writes above, so a pointer is not changed while a pending write still needs it.
+01:-:-:-:6      IADD   F00_0.CC, F00_0, RSK8;
+--:-:-:-:1      IADD.X F00_1,    F00_1, RZ;
+--:-:-:-:6      IADD   F01_0.CC, F01_0, RSK8;
+--:-:-:-:1      IADD.X F01_1,    F01_1, RZ;
+--:-:-:-:6      IADD   F02_0.CC, F02_0, RSK8;
+--:-:-:-:1      IADD.X F02_1,    F02_1, RZ;
+02:-:-:-:6      IADD   F10_0.CC, F10_0, RSK8;
+--:-:-:-:1      IADD.X F10_1,    F10_1, RZ;
+--:-:-:-:6      IADD   F11_0.CC, F11_0, RSK8;
+--:-:-:-:1      IADD.X F11_1,    F11_1, RZ;
+--:-:-:-:6      IADD   F12_0.CC, F12_0, RSK8;
+--:-:-:-:1      IADD.X F12_1,    F12_1, RZ;
+04:-:-:-:6      IADD   F20_0.CC, F20_0, RSK8;
+--:-:-:-:1      IADD.X F20_1,    F20_1, RZ;
+--:-:-:-:6      IADD   F21_0.CC, F21_0, RSK8;
+--:-:-:-:1      IADD.X F21_1,    F21_1, RZ;
+--:-:-:-:6      IADD   F22_0.CC, F22_0, RSK8;
+--:-:-:-:0      IADD.X F22_1,    F22_1, RZ;
+
+--:-:-:-:5      RET;
+
diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_3x3_4x4_32x32.sass b/Kernel/Convolution/Maxwell/xconv_winograd_3x3_4x4_32x32.sass
new file mode 100644
index 0000000..20e8a9d
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/xconv_winograd_3x3_4x4_32x32.sass
@@ -0,0 +1,1047 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+our ($type, $D); # inputs; not assigned in this block, presumably supplied by the assembler invocation (TODO confirm)
+our $determ = $D; # truthy: output_op() returns the STG form instead of the RED.E.ADD form
+our $convert_in   = $type eq 'h' ? 'F2F.F32.F16' :    ''; # non-empty only when $type is 'h'; later tested to choose the F2F load path
+our $convert_out  = $type eq 'h' ? 'F2F.F16.F32' :    '';
+our $dtype_shift  = $type eq 'h' ?           '1' :   '2'; # shift operand of the LEA / LEA.HI.X address forms
+our $dtype_size   = $type eq 'h' ?           '2' :   '4'; # factor inside the track load offsets
+our $vec_size     = $type eq 'h' ?          '64' : '128'; # width suffix appended to LDG.E. and LDS.U.
+sub dtype_shift { return $dtype_shift; }
+sub vec_size    { return $vec_size;    }
+sub output_op   { return $determ ? 'STG.E.CG' : 'RED.E.ADD.F32.FTZ.RN'; }
+-]
+// Symbolic constants: addr_* are offsets used as LDS/STS addresses; param_* name consecutive 32-bit slots c[0x0][0x140] .. c[0x0][0x1ac]. The [0]/[1] pairs feed the LEA / LEA.HI.X 64-bit address forms.
+<CONSTANT_MAPPING>
+
+    addr_zero   : 4x<32*36*2*4 + 64 + 0>
+    addr_rYXN   : 4x<32*36*2*4 + 64 + 4>
+    addr_iYXN   : 4x<32*36*2*4 + 64 + 5>
+    addr_idx_K  : 4x<32*36*2*4 + 64 + 6>
+    addr_idx_C  : 4x<32*36*2*4 + 64 + 7>
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_K            : c[0x0][0x15c]
+    param_C            : c[0x0][0x160]
+    param_k            : c[0x0][0x164]
+    param_c            : c[0x0][0x168]
+    param_kc           : c[0x0][0x16c]
+    param_magic_kc     : c[0x0][0x170]
+    param_shift_kc     : c[0x0][0x174]
+    param_magic_c      : c[0x0][0x178]
+    param_shift_c      : c[0x0][0x17c]
+    param_YXN2         : c[0x0][0x180]
+    param_sYXN         : c[0x0][0x184]
+    param_magic_sYXN   : c[0x0][0x188]
+    param_shift_sYXN   : c[0x0][0x18c]
+    param_stride_YXNp  : c[0x0][0x190]
+    param_YXN          : c[0x0][0x194]
+    param_YXN_1152     : c[0x0][0x198]
+    param_RSK          : c[0x0][0x19c]
+    param_CRSK         : c[0x0][0x1a0]
+    param_Kp           : c[0x0][0x1a4]
+    param_SKp          : c[0x0][0x1a8]
+    param_RSK15_SK2p   : c[0x0][0x1ac]
+
+</CONSTANT_MAPPING>
+// Register allocation: register numbers on the left, names on the right. The same numbers are bound to several name groups, so those groups presumably are never live at the same time; verify before adding or moving names.
+<REGISTER_MAPPING>
+
+       0-63 : czero<00-63>
+
+     3, 2,11,10 : clx<0-3>y0
+     7, 6,15,14 : clx<0-3>y1
+     1, 0, 9, 8 : clx<0-3>y2
+     5, 4,13,12 : clx<0-3>y3
+    19,18,27,26 : clx<0-3>y4
+    23,22,31,30 : clx<0-3>y5
+    17,16,25,24 : clx<0-3>y6
+    21,20,29,28 : clx<0-3>y7
+
+      32-43 : jl0Fx<0-3>, jl0Iy<0-7>
+      44-51 : jl1Fx<0-3>, jl1Iy<4-7>
+      36-39 : jl1Iy<0-3>
+
+      52-87 : T0<0-3>, T1<0-3>, T2<0-3>, T3<0-3>, T4<0-3>, T5<0-3>, T6<0-3>, T7<0-3>, T8<0-3>
+      88-89 : track<0-1>
+      90-91 ~ writeS
+
+      32-86 ~ idx_YXNkc, idx_K, idx_C, idx_YXN, div<1-3>, magic_kc, neg_kc, idx_kc, idx_k, idx_c, YXN2_idx, neg_sYXN, magic_sYXN, remainder, yxn, offset, offset2, tid32_2, tid1, tid31
+         87 = tid
+
+      32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1
+      48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16
+
+
+     3, 2,11,10,19,18,27,26 : ccx<0-7>y0
+     7, 6,15,14,23,22,31,30 : ccx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2
+     5, 4,13,12,21,20,29,28 : ccx<0-7>y3
+    35,34,43,42,51,50,59,58 : ccx<0-7>y4
+    39,38,47,46,55,54,63,62 : ccx<0-7>y5
+    33,32,41,40,49,48,57,56 : ccx<0-7>y6
+    37,36,45,44,53,52,61,60 : ccx<0-7>y7
+
+      64-79 : jc0Fx<0-7>, jc0Iy<0-7>
+      80-91 : jc1Fx<4-7>, jc1Iy<0-7>
+      64-67 : jc1Fx<0-3>
+
+      64-86 ~ tid16, tid_1, tid128
+
+      92-95 ~ reduce_YXN, swapBuf, readFs, readIs
+
+
+      64-89 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxC, idxK, idxI, readFs2, readIs2, offsetF, k, CRSK, xmad_determ
+      86-89 : Out1<0-1>, Out2<0-1>
+      90-91 : Out0<0-1>
+      92-95 ~ alpha, writeCs, readCs, c
+
+      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1
+
+      84-85 ~ t<0-1>
+
+       3, 2,11,19,10,18 : m<0-5>0
+       1, 9, 0, 8,17,16 : m<0-5>1
+      27,26,25,24,64,65 : m<0-5>2
+      66,67,68,69,70,71 : m<0-5>3
+      72,73,74,75,76,77 : m<0-5>4
+      78,79,80,81,82,83 : m<0-5>5
+
+       3, 2,11 : w00, w10, w20
+       1, 9, 0 : w01, w11, w21
+      27,26,25 : w02, w12, w22
+      66,67,68 : w03, w13, w23
+      72,73,74 : w04, w14, w24
+      78,79,80 : w05, w15, w25
+
+      19,10,18,69,70,71 ~ s00, s10, s20
+       8,17,16,75,76,77 ~ s02, s12, s22
+      24,64,65,81,82,83 ~ s01, s11, s21
+
+</REGISTER_MAPPING>
+
+--:-:-:-:0      MOV swapBuf, 4x<32*36*2*2>; // entry: initial swapBuf value, later added to writeS / readIs / readFs and negated
+--:-:1:-:1      S2R tid, SR_TID.X;
+--:-:-:-:1      STS.128 [addr_zero], RZ; // write 128 zero bits at addr_zero; later read back by the czero and T zero-fill loads
+01:-:-:Y:d      ISETP.GE.AND P0, PT, tid, 128, PT; // P0 = tid >= 128
+--:-:-:-:5  @P0 BRA.U COMPUTE_SETUP; // tid >= 128 go to COMPUTE_SETUP, the rest fall through into LOAD_SETUP
+
+##############################################################
+LOAD_SETUP:
+// Reached by tid below 128 only. Derives the block indices, has thread 0 publish them with STS, and builds this thread's load offset, writeS, readFs and readIs.
+--:-:1:-:1      S2R idx_YXNkc, SR_CTAID.X;
+--:-:2:-:1      S2R idx_K,     SR_CTAID.Z;
+--:-:3:-:1      S2R idx_C,     SR_CTAID.Y;
+
+<SCHEDULE_BLOCK>
+// The generator below emits eight LDS.U.128 loads of [addr_zero] into czero00, czero04, ... czero28.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +]
+
+--:-:-:-:1      ISETP.EQ.AND P0, PT, tid, RZ, PT; // P0 = (tid == 0)
+--:-:-:-:1      ISETP.GE.AND P1, PT, tid, 64, PT; // P1 = (tid >= 64)
+
+// idx_YXN = idx_YXNkc / blk_kc  (multiply by magic_kc then shift; shift only when magic_kc == 1)
+--:-:-:-:1      MOV  magic_kc, param_magic_kc;
+--:-:-:-:1      IADD neg_kc, RZ, -param_kc;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magic_kc, 1, PT;
+01:-:-:-:1  @P2 XMAD     div1, idx_YXNkc,    magic_kc,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, idx_YXNkc,    magic_kc.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, idx_YXNkc.H1, magic_kc.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, idx_YXNkc.H1, magic_kc,    div1;
+--:-:-:-:1  @P2 IADD3.RS idx_YXN, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  idx_YXN, idx_YXN,   param_shift_kc;
+--:-:-:-:1 @!P2 SHR.U32  idx_YXN, idx_YXNkc, param_shift_kc;
+
+// idx_kc = idx_YXNkc % blk_kc
+--:-:-:-:1      XMAD.LO2 idx_kc, neg_kc, idx_YXN, idx_YXNkc;
+
+// idx_k = idx_kc / blk_c
+// idx_c = idx_kc % blk_c
+--:-:-:-:1      XMAD    idx_k,  idx_kc, param_magic_c, RZ;
+--:-:-:-:1      SHR.U32 idx_k,  idx_k,  param_shift_c;
+--:-:-:-:1      XMAD    idx_c,  idx_k,  param_c, RZ;
+--:-:-:-:1      IADD    idx_c, -idx_c,  idx_kc;
+
+// idx_K = idx_K * blk_k + idx_k
+// idx_C = idx_C * blk_c + idx_c
+02:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+04:-:-:-:1      XMAD idx_C, idx_C, param_c, idx_c;
+
+// reduce_YXN  = ceil((YXN2 - idx_YXN) / sYXN)  (same magic multiply + shift scheme, keyed on magic_sYXN)
+--:-:-:-:1      IADD YXN2_idx, -idx_YXN, param_YXN2;
+--:-:-:-:1      IADD neg_sYXN, RZ, -param_sYXN;
+--:-:-:-:1      MOV  magic_sYXN, param_magic_sYXN;
+--:-:-:-:1      ISETP.NE.AND P3, PT, magic_sYXN, 1, PT;
+--:-:-:-:1  @P3 XMAD     div1, YXN2_idx,    magic_sYXN,    RZ;
+--:-:-:-:1  @P3 XMAD     div2, YXN2_idx,    magic_sYXN.H1, RZ;
+--:-:-:-:1  @P3 XMAD     div3, YXN2_idx.H1, magic_sYXN.H1, RZ;
+--:-:-:-:1  @P3 XMAD.CHI div1, YXN2_idx.H1, magic_sYXN,    div1;
+--:-:-:-:1  @P3 IADD3.RS reduce_YXN, div1, div2, div3;
+--:-:-:-:1  @P3 SHR.U32  reduce_YXN, reduce_YXN, param_shift_sYXN;
+--:-:-:-:1 @!P3 SHR.U32  reduce_YXN, YXN2_idx,   param_shift_sYXN;
+
+--:-:-:-:1      XMAD.LO2  remainder, neg_sYXN, reduce_YXN, YXN2_idx;
+--:-:-:-:1      IMNMX.U32 remainder, remainder, 1, PT; // remainder = min(remainder, 1)
+--:-:-:-:1      IADD reduce_YXN, reduce_YXN, remainder; // round the quotient up when the division left a remainder
+
+--:-:-:-:1  @P0 STS [addr_iYXN],  idx_YXN;
+--:-:-:-:1  @P0 STS [addr_idx_K], idx_K;
+--:-:-:-:1  @P0 STS [addr_idx_C], idx_C;
+--:6:-:-:1  @P0 STS [addr_rYXN],  reduce_YXN;
+
+// yxn = (tid & 63) >> 5
+--:-:-:-:1      BFE.U32 yxn, tid, 0x105; // 1 bit at position 5
+
+// offset = (idx_YXN + (reduce_YXN - 1)*sYXN)*2 + yxn
+--:-:-:-:1      IADD     offset,  reduce_YXN, -1;
+--:-:-:-:1      XMAD     offset2, offset,    param_sYXN, idx_YXN;
+--:-:-:-:1      XMAD.PSL offset2, offset.H1, param_sYXN, offset2;
+--:-:-:-:1      ISCADD   offset2, offset2,   yxn, 1;
+
+// P6 = offset < YXN
+--:-:-:-:1      ISETP.LT.AND P6, PT, offset2, param_YXN, PT;
+
+// P5 = reduce_YXN > 1
+--:-:-:-:1      ISETP.GT.AND P5, PT, reduce_YXN, 1, PT;
+
+--:-:-:-:1      LOP.AND  tid32_2,  tid,    -32;
+--:-:-:-:1      SHR.U32  tid32_2,  tid32_2, 2;
+
+// readFs = ((tid & -32) >> 2) | ((tid >> 1) & 7)
+--:-:-:-:1      BFE.U32 readFs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readFs, readFs, tid32_2;
+--:-:-:-:1      ISCADD  readFs, readFs, 4x<32*36*2>, 4;
+
+// readIs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readIs, tid,    16;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      IADD3   readIs, readIs, tid1, tid32_2;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// writeS = (yxn*32*36 + (tid & 31)*4)*4
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      SHL writeS, tid31, 4;
+--:-:-:-:1      XMAD writeS, yxn, 4x<32*36>, writeS;
+
+// offset = offset*32*36 + tid31*4
+--:-:-:-:1      SHL tid31, tid31, 2;
+--:-:-:-:0      XMAD.LO2 offset, offset2, 1x<32*36>, tid31;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:6  @P1 BRA.U FILTER_SETUP; // tid >= 64 go to FILTER_SETUP, the others fall through into IMAGE_SETUP
+
+##############################################################
+IMAGE_SETUP:
+// Reached by tid below 64: add the idx_C term to offset and form the 64-bit track pointer on top of param_I.
+<SCHEDULE_BLOCK>
+// (GC32,GY,GX,N,6,6,32)
+// offset += idx_C * YXN*32*36
+--:-:-:-:1      XMAD.LO2C offset, idx_C, param_YXN_1152, offset;
+
+--:-:-:-:1      LEA      track0.CC, offset, param_I[0],     [+ dtype_shift() +];
+--:-:-:-:0      LEA.HI.X track1,    offset, param_I[1], RZ, [+ dtype_shift() +];
+</SCHEDULE_BLOCK>
+
+--:-:-:-:6      BRA.U LOAD; // jump over FILTER_SETUP
+
+##############################################################
+FILTER_SETUP:
+// Reached by tid 64..127: same shape as IMAGE_SETUP but indexed by idx_K and based at param_E, with writeS moved up first. Falls through into LOAD.
+<SCHEDULE_BLOCK>
+// writeS += 32*36*2*4
+--:-:-:-:1      IADD writeS, writeS, 4x<32*36*2>;
+
+// (GK32,GY,GX,N,6,6,32)
+// offset += idx_K * YXN*32*36
+--:-:-:-:1      XMAD.LO2C offset, idx_K, param_YXN_1152, offset;
+
+--:-:-:-:1      LEA      track0.CC, offset, param_E[0],     [+ dtype_shift() +];
+--:-:-:-:2      LEA.HI.X track1,    offset, param_E[1], RZ, [+ dtype_shift() +];
+</SCHEDULE_BLOCK>
+
+##############################################################
+LOAD:
+// Each of T0..T8 is filled from [track + 4*(k*32*dtype_size)] when P6 holds (P6 = offset2 < param_YXN) and from [addr_zero] otherwise.
+20:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T0, [track + 4x<0*32 * $dtype_size>];
+--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T1, [track + 4x<1*32 * $dtype_size>];
+--:-:2:-:1  @P6 LDG.E.[+ vec_size() +] T2, [track + 4x<2*32 * $dtype_size>];
+
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T0, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T1, [addr_zero];
+--:-:2:-:1 @!P6 LDS.U.[+ vec_size() +] T2, [addr_zero];
+
+--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T3, [track + 4x<3*32 * $dtype_size>];
+--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T4, [track + 4x<4*32 * $dtype_size>];
+--:-:3:-:1  @P6 LDG.E.[+ vec_size() +] T5, [track + 4x<5*32 * $dtype_size>];
+
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T3, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T4, [addr_zero];
+--:-:3:-:1 @!P6 LDS.U.[+ vec_size() +] T5, [addr_zero];
+
+--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T6, [track + 4x<6*32 * $dtype_size>];
+--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T7, [track + 4x<7*32 * $dtype_size>];
+--:-:4:-:1  @P6 LDG.E.[+ vec_size() +] T8, [track + 4x<8*32 * $dtype_size>];
+
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T6, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T7, [addr_zero];
+--:-:4:-:1 @!P6 LDS.U.[+ vec_size() +] T8, [addr_zero];
+// When convert_in is non-empty, every T row is expanded with F2F.F32.F16 before its STS.128; otherwise the nine rows are stored directly.
+[+
+    our $convert_in;
+    return $convert_in ? q{
+
+02:-:-:-:1      F2F.F32.F16 T03, T01.H1;
+--:-:-:-:1      F2F.F32.F16 T02, T01.H0;
+--:-:-:-:1      F2F.F32.F16 T01, T00.H1;
+--:-:2:-:1      F2F.F32.F16 T00, T00.H0;
+
+--:-:-:-:1      F2F.F32.F16 T13, T11.H1;
+--:-:-:-:1      F2F.F32.F16 T12, T11.H0;
+--:-:-:-:1      F2F.F32.F16 T11, T10.H1;
+--:-:5:-:1      F2F.F32.F16 T10, T10.H0;
+
+--:-:-:-:1      F2F.F32.F16 T23, T21.H1;
+--:-:-:-:1      F2F.F32.F16 T22, T21.H0;
+--:-:-:-:1      F2F.F32.F16 T21, T20.H1;
+--:-:6:-:1      F2F.F32.F16 T20, T20.H0;
+
+02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;
+
+04:-:-:-:1      F2F.F32.F16 T33, T31.H1;
+--:-:-:-:1      F2F.F32.F16 T32, T31.H0;
+--:-:-:-:1      F2F.F32.F16 T31, T30.H1;
+--:-:3:-:1      F2F.F32.F16 T30, T30.H0;
+
+10:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;
+
+--:-:-:-:1      F2F.F32.F16 T43, T41.H1;
+--:-:-:-:1      F2F.F32.F16 T42, T41.H0;
+--:-:-:-:1      F2F.F32.F16 T41, T40.H1;
+--:-:5:-:1      F2F.F32.F16 T40, T40.H0;
+
+20:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;
+
+--:-:-:-:1      F2F.F32.F16 T53, T51.H1;
+--:-:-:-:1      F2F.F32.F16 T52, T51.H0;
+--:-:-:-:1      F2F.F32.F16 T51, T50.H1;
+--:-:6:-:1      F2F.F32.F16 T50, T50.H0;
+
+04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;
+
+08:-:-:-:1      F2F.F32.F16 T63, T61.H1;
+--:-:-:-:1      F2F.F32.F16 T62, T61.H0;
+--:-:-:-:1      F2F.F32.F16 T61, T60.H1;
+--:-:4:-:1      F2F.F32.F16 T60, T60.H0;
+
+10:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;
+
+--:-:-:-:1      F2F.F32.F16 T73, T71.H1;
+--:-:-:-:1      F2F.F32.F16 T72, T71.H0;
+--:-:-:-:1      F2F.F32.F16 T71, T70.H1;
+--:-:5:-:1      F2F.F32.F16 T70, T70.H0;
+
+20:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;
+
+--:-:-:-:1      F2F.F32.F16 T83, T81.H1;
+--:-:-:-:1      F2F.F32.F16 T82, T81.H0;
+--:-:-:-:1      F2F.F32.F16 T81, T80.H1;
+--:-:6:-:1      F2F.F32.F16 T80, T80.H0;
+
+08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
+10:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
+20:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;
+
+    } : q{
+02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;
+--:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;
+--:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;
+04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;
+--:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;
+--:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;
+08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
+--:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
+--:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;
+    };
++]
+// Step the 64-bit track pointer back by param_stride_YXNp: low word here, high word (carry) after the barrier.
+--:-:-:-:0      IADD   track0.CC, track0, -param_stride_YXNp;
+
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeS, writeS, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf; // negate swapBuf so the next add moves writeS back
+
+--:-:-:-:0      IADD.X track1,    track1, -RZ;
+
+--:-:-:-:1      LDS.U.128 jl0Iy0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jl0Fx0, [readFs + 4x<0*32*36 + 00>];
+--:-:1:-:1      LDS.U.128 jl0Iy4, [readIs + 4x<0*32*36 + 16>];
+// Next round of global loads, issued only when P5 holds (P5 = reduce_YXN > 1 from LOAD_SETUP).
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T0, [track + 4x<0*32 * $dtype_size>];
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T1, [track + 4x<1*32 * $dtype_size>];
+--:-:2:-:1  @P5 LDG.E.[+ vec_size() +] T2, [track + 4x<2*32 * $dtype_size>];
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T3, [track + 4x<3*32 * $dtype_size>];
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T4, [track + 4x<4*32 * $dtype_size>];
+--:-:3:-:1  @P5 LDG.E.[+ vec_size() +] T5, [track + 4x<5*32 * $dtype_size>];
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T6, [track + 4x<6*32 * $dtype_size>];
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T7, [track + 4x<7*32 * $dtype_size>];
+--:6:4:-:1  @P5 LDG.E.[+ vec_size() +] T8, [track + 4x<8*32 * $dtype_size>];
+
+--:-:-:-:5      BRA.U LOAD_LOOP;
+
+##############################################################
+
+COMPUTE_SETUP:
+// Reached by tid >= 128: fill czero00 .. czero60 from [addr_zero] (sixteen 128-bit loads), build readFs / readIs from tid128 = tid - 128, then wait at the barrier.
+<SCHEDULE_BLOCK>
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+--:-:-:-:1      IADD tid128, tid, -128;
+
+// readFs = ((tid & -16) >> 1) | ((tid >> 1) & 3)      (tid here means tid128)
+// readIs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid16,  tid128, -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,   1;
+
+--:-:-:-:1      BFE.U32  readFs, tid128, 0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readFs, readFs, tid16;
+--:-:-:-:1      ISCADD   readFs, readFs, 4x<32*4 + 32*36*2>, 4;
+
+--:-:-:-:1      LOP.AND  tid_1,  tid128, 1;
+--:-:-:-:1      LOP.AND  readIs, tid128, 8;
+--:-:-:-:1      SHR.U32  readIs, readIs, 2;
+--:-:-:-:1      IADD3    readIs, readIs, tid16, tid_1;
+--:-:-:-:0      ISCADD   readIs, readIs, 4x<32*4>, 4;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      LDS reduce_YXN, [addr_rYXN]; // value stored by thread 0 in LOAD_SETUP
+
+--:-:-:-:1      LDS.U.128 jc0Iy0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Fx0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Iy4, [readIs + 4x<0*32*36 + 16>];
+--:-:1:-:2      LDS.U.128 jc0Fx4, [readFs + 4x<0*32*36 + 16>];
+
+COMPUTE_LOOP:
+[+
+    my %insert = (
+        # Keys "j<J>c<C>" hold text that is emitted right after FFMA number C of pass J.
+        j0c33 => "--:-:-:-:1      ISETP.GT.AND P0, PT, reduce_YXN, 1, PT;\n" .
+                 "--:-:-:-:1      IADD reduce_YXN, reduce_YXN, -1;\n",
+        # After FFMA 62 of pass 0: barrier, then add swapBuf to readIs/readFs and negate swapBuf.
+        j0c62 => "02:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readIs, readIs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD readFs, readFs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD swapBuf, RZ,   -swapBuf;\n",
+        # After the last FFMA of pass 1: loop again while P0 (set at j0c33), otherwise continue at COMPUTE_FINISH.
+        j1c63 => "--:-:-:Y:5  \@P0 BRA.U COMPUTE_LOOP;\n" .
+                 "--:-:-:Y:5      BRA.U COMPUTE_FINISH;\n",
+    );
+    my @cOrder;  # filled with 64 [x, y] index pairs (4 x-values * 4 y-values * 4 swirl offsets)
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;  # the next x value walks the y list in the opposite order
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        my $odd    = $j;
+        my $nOdd   = 1 - $j;
+        my $rsPred = $j == 1 ? '@P0' : '   ';
+        my $bar    = $j == 0 ? '2' : '-';
+        # Each pass loads the operands of the other set (index $nOdd); on pass 1 these loads are predicated on P0.
+        $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dIy4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFx4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dIy0, [readIs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd;
+        # The fourth load has 1 in the third control field; on pass 0 its second field is $bar = 2 (the j0c62 BAR.SYNC waits with mask 02).
+        $insert{"j${j}c31"} = sprintf "--:%s:1:-:1  %s LDS.U.128 jc%dFx0, [readFs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd;
+
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Stall field is 0 when the first inserted line contains one of these opcodes, otherwise 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+            # Yield flag on every 10th FFMA, but only when the stall field is 1.
+            my $yield  = $c % 10 == 0 && $stall ? 'Y' : '-';
+            # Only the first FFMA of each pass carries wait mask 01.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+            # ccx{x}y{y} += jc{odd}Fx{x} * jc{odd}Iy{y}, followed by any inserted text for this slot.
+            $out .= sprintf "%s      FFMA ccx%dy%d, jc%dFx%d, jc%dIy%d, ccx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+LOAD_LOOP:
+--:-:-:-:1      ISETP.GT.AND P0, PT, reduce_YXN, 1, PT; // P0 = reduce_YXN > 1 (value before the decrement below)
+20:-:-:-:1      IADD track0.CC, track0, -param_stride_YXNp; // low word of track; the IADD.X at j0c3 updates track1
+--:-:-:-:1      ISETP.GT.AND P1, PT, reduce_YXN, 2, PT; // P1 = reduce_YXN > 2, predicates the LDG refills
+--:-:-:-:1      IADD reduce_YXN, reduce_YXN, -1;
+[+
+    our ($vec_size, $dtype_size, $convert_in);  # package variables, not assigned in this block
+    my %insert = (
+        # Keys "j<J>c<C>" hold text that is emitted right after FFMA number C of pass J.
+        j0c3 => "--:-:-:-:1      IADD.X track1, track1, -RZ;\n",
+        # Operand loads: pass 0 fetches the jl1* set, pass 1 refetches the jl0* set only under P0.
+        j0c0  => "--:-:-:-:1      LDS.U.128 jl1Iy4, [readIs + 4x<1*32*36 + 16>];\n",
+        j0c2  => "--:-:-:-:1      LDS.U.128 jl1Fx0, [readFs + 4x<1*32*36 + 00>];\n",
+        j0c18 => "--:-:1:-:1      LDS.U.128 jl1Iy0, [readIs + 4x<1*32*36 + 00>];\n",
+
+        j1c12 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Iy4, [readIs + 4x<0*32*36 + 16>];\n",
+        j1c14 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Fx0, [readFs + 4x<0*32*36 + 00>];\n",
+        j1c16 => "--:-:1:-:1  \@P0 LDS.U.128 jl0Iy0, [readIs + 4x<0*32*36 + 00>];\n",
+        # $convert_in true: F2F.F32.F16 conversions precede each STS group; false: T0..T8 are stored without conversion.
+        $convert_in ? (
+
+            j0c1  => "02:-:-:-:1      F2F.F32.F16 T03, T01.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T02, T01.H0;\n",
+            j0c4  => "--:-:-:-:1      F2F.F32.F16 T01, T00.H1;\n" .
+                     "--:-:2:-:1      F2F.F32.F16 T00, T00.H0;\n",
+
+            j0c5  => "--:-:-:-:1      F2F.F32.F16 T13, T11.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T12, T11.H0;\n",
+            j0c6  => "--:-:-:-:1      F2F.F32.F16 T11, T10.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T10, T10.H0;\n",
+
+            j0c7  => "--:-:-:-:1      F2F.F32.F16 T23, T21.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T22, T21.H0;\n",
+            j0c8  => "--:-:-:-:1      F2F.F32.F16 T21, T20.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T20, T20.H0;\n",
+
+            j0c9  => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n",
+            j0c10 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
+            j0c11 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",
+
+            j0c13 => "02:-:-:-:1  \@P1 LDG.E.$vec_size T0, [track + 4x<0*32 * $dtype_size>];\n",
+            j0c14 => "10:-:-:-:1  \@P1 LDG.E.$vec_size T1, [track + 4x<1*32 * $dtype_size>];\n",
+            j0c15 => "20:-:2:-:1  \@P1 LDG.E.$vec_size T2, [track + 4x<2*32 * $dtype_size>];\n",
+
+            j0c16 => "04:-:-:-:1      F2F.F32.F16 T33, T31.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T32, T31.H0;\n",
+            j0c17 => "--:-:-:-:1      F2F.F32.F16 T31, T30.H1;\n" .
+                     "--:-:3:-:1      F2F.F32.F16 T30, T30.H0;\n",
+
+            j0c19 => "--:-:-:-:1      F2F.F32.F16 T43, T41.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T42, T41.H0;\n",
+            j0c20 => "--:-:-:-:1      F2F.F32.F16 T41, T40.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T40, T40.H0;\n",
+
+            j0c21 => "--:-:-:-:1      F2F.F32.F16 T53, T51.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T52, T51.H0;\n",
+            j0c22 => "--:-:-:-:1      F2F.F32.F16 T51, T50.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T50, T50.H0;\n",
+
+            j0c23 => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
+            j0c24 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
+            j0c25 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",
+
+            j0c27 => "04:-:-:-:1  \@P1 LDG.E.$vec_size T3, [track + 4x<3*32 * $dtype_size>];\n",
+            j0c28 => "10:-:-:-:1  \@P1 LDG.E.$vec_size T4, [track + 4x<4*32 * $dtype_size>];\n",
+            j0c29 => "20:-:3:-:1  \@P1 LDG.E.$vec_size T5, [track + 4x<5*32 * $dtype_size>];\n",
+
+            j0c30 => "08:-:-:-:1      F2F.F32.F16 T63, T61.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T62, T61.H0;\n",
+            j0c31 => "--:-:-:-:1      F2F.F32.F16 T61, T60.H1;\n" .
+                     "--:-:4:-:1      F2F.F32.F16 T60, T60.H0;\n",
+
+            j1c0  => "--:-:-:-:1      F2F.F32.F16 T73, T71.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T72, T71.H0;\n",
+            j1c1  => "--:-:-:-:1      F2F.F32.F16 T71, T70.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T70, T70.H0;\n",
+
+            j1c2  => "--:-:-:-:1      F2F.F32.F16 T83, T81.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T82, T81.H0;\n",
+            j1c3  => "--:-:-:-:1      F2F.F32.F16 T81, T80.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T80, T80.H0;\n",
+
+            j1c4  => "08:4:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
+            j1c5  => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
+            j1c6  => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",
+
+            j1c8  => "08:-:-:-:1  \@P1 LDG.E.$vec_size T6, [track + 4x<6*32 * $dtype_size>];\n",
+            j1c9  => "10:-:-:-:1  \@P1 LDG.E.$vec_size T7, [track + 4x<7*32 * $dtype_size>];\n",
+            j1c10 => "20:6:4:-:1  \@P1 LDG.E.$vec_size T8, [track + 4x<8*32 * $dtype_size>];\n",
+
+        ) : (
+            # NOTE(review): the j0c6 STS below has no @P0 predicate, unlike every other STS in both branches - confirm this is intended.
+            j0c6  => "02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;\n",
+            j0c8  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
+            j0c10 => "--:2:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",
+
+            j0c12 => "02:-:-:-:1  \@P1 LDG.E.$vec_size T0, [track + 4x<0*32 * $dtype_size>];\n",
+            j0c14 => "--:-:-:-:1  \@P1 LDG.E.$vec_size T1, [track + 4x<1*32 * $dtype_size>];\n",
+            j0c16 => "--:-:2:-:1  \@P1 LDG.E.$vec_size T2, [track + 4x<2*32 * $dtype_size>];\n",
+
+            j0c20 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
+            j0c22 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
+            j0c24 => "--:3:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",
+
+            j0c26 => "04:-:-:-:1  \@P1 LDG.E.$vec_size T3, [track + 4x<3*32 * $dtype_size>];\n",
+            j0c28 => "--:-:-:-:1  \@P1 LDG.E.$vec_size T4, [track + 4x<4*32 * $dtype_size>];\n",
+            j0c30 => "--:-:3:-:1  \@P1 LDG.E.$vec_size T5, [track + 4x<5*32 * $dtype_size>];\n",
+
+            j1c0  => "08:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
+            j1c2  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
+            j1c4  => "--:4:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",
+
+            j1c6  => "08:-:-:-:1  \@P1 LDG.E.$vec_size T6, [track + 4x<6*32 * $dtype_size>];\n",
+            j1c8  => "--:-:-:-:1  \@P1 LDG.E.$vec_size T7, [track + 4x<7*32 * $dtype_size>];\n",
+            j1c10 => "--:6:4:-:1  \@P1 LDG.E.$vec_size T8, [track + 4x<8*32 * $dtype_size>];\n",
+        ),
+        # After FFMA 11 of pass 1: barrier, then (under P0) offset readIs/readFs/writeS by swapBuf and negate swapBuf.
+        j1c11 => "--:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeS, writeS,  swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,    -swapBuf;\n",
+        # After the last FFMA of pass 1: branch back while P0; otherwise execution falls through past this block.
+        j1c31 => "--:-:-:Y:5  \@P0 BRA.U LOAD_LOOP;\n",
+    );
+    # @cOrder gets 32 [x, y] pairs: eight base positions, each expanded by the four @swirl offsets.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
+    {
+        my ($x, $y) = @$xy;
+        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Only the first FFMA of each pass carries wait mask 01.
+            my $wait   = $c == 0 ? "01" : '--';
+            # Stall field is 0 when the first inserted line contains one of these opcodes, otherwise 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:-:$stall";
+            # clx{x}y{y} += jl{j}Fx{x} * jl{j}Iy{y}, followed by any inserted text for this slot.
+            $out .= sprintf "%s      FFMA clx%dy%d, jl%dFx%d, jl%dIy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+--:-:1:-:2      S2R Tid, SR_TID.X; // reached by fall-through once LOAD_LOOP stops branching back
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha16, param_alpha;
+
+01:-:-:-:1      LOP.AND  Tid32_2,  Tid,    -32;
+--:-:-:-:1      SHR.U32  Tid32_2,  Tid32_2, 2;
+
+// readIs = ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND Tid1,   Tid,    1;
+01:-:-:-:1      LOP.AND readIs, Tid,    16;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      IADD    readIs, readIs, Tid1;
+
+// readFs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readIs << 2)
+--:-:-:-:1      BFE.U32 readFs, Tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readFs, readFs, Tid32_2;
+--:-:-:-:1      ISCADD  readFs, readIs, readFs, 2;
+// Scale both indices before combining them: readFs <<= 4, readIs <<= 3.
+--:-:-:-:1      SHL     readFs, readFs, 4;
+--:-:-:-:1      SHL     readIs, readIs, 3;
+
+// write16Cs = readIs * 32*36 + readFs;
+--:-:-:-:1      XMAD write16Cs, readIs, 1x<32*36>, readFs;
+</SCHEDULE_BLOCK>
+// Round 1 of 4: scale clx<0-3>y0 and clx<0-3>y2 by alpha16, store both 128-bit groups at write16Cs, then BAR.SYNC.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y2, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y2, alpha16;
+--:-:-:-:4      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Round 2 of 4: rows y1 and y3. From this round on a BAR.SYNC also precedes the two stores.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y3, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y3, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Round 3 of 4: rows y4 and y6.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y6, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y6, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Round 4 of 4: rows y5 and y7.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y7, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y7, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// NOTE(review): this path issues 7 BAR.SYNC, the same count as the COMPUTE_FINISH path; presumably they pair up - confirm.
+--:-:-:-:5      EXIT;
+
+COMPUTE_FINISH:
+// Reached from the unconditional BRA.U at the end of COMPUTE_LOOP: rebuild the thread indices and compute writeCs.
+--:-:1:-:2      S2R tid_128, SR_TID.X;
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+01:-:-:-:1      IADD tid_128, tid_128, -128;
+// P4 = (tid - 128) >= 256; it guards the branch to SKIP0 here and the later branches to SKIP1..SKIP4.
+--:-:-:-:1      ISETP.GE.AND P4, PT, tid_128, 256, PT;
+
+// readIs = ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  Tid_1,   tid_128, 1;
+--:-:-:-:1      LOP.AND  readIs2, tid_128, 8;
+--:-:-:-:1      SHR.U32  readIs2, readIs2, 2;
+--:-:-:-:1      IADD     readIs2, readIs2, Tid_1;
+
+// readFs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readIs2 << 2)
+--:-:-:-:1      LOP.AND  tid_16,   tid_128, -16;
+--:-:-:-:1      SHR.U32  tid_16,   tid_16,   1;
+--:-:-:-:1      BFE.U32  readFs2,  tid_128,  0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readFs2,  readFs2,  tid_16;
+--:-:-:-:1      ISCADD   readFs2,  readIs2, readFs2, 2;
+// readFs2 = (readFs2 << 4) + 4x<32*4>; readIs2 <<= 3
+--:-:-:-:1      ISCADD   readFs2, readFs2,  4x<32*4>, 4;
+--:-:-:-:1      SHL      readIs2, readIs2, 3;
+
+// writeCs = readIs2 * 32*36 + readFs2;
+--:-:-:-:0      XMAD writeCs, readIs2, 1x<32*36>, readFs2;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P4 BRA.U SKIP0;
+
+--:-:1:-:1      LDS idxK, [addr_idx_K];
+--:-:2:-:1      LDS idxC, [addr_idx_C];
+[+ our $determ; return $determ ? q{--:-:3:-:1      LDS idxI, [addr_iYXN];} : ''; +]
+// The idxI load above is emitted only when $determ is true; idxI is read by the optional XMAD.LO further down.
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      LOP.AND tid_31, tid_128, 31;
+--:-:-:-:1      SHR.U32 tid_32, tid_128,  5;
+--:-:-:-:1      SHR.U32 tid_64, tid_128,  6;
+
+// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16
+--:-:-:-:1      XMAD   readCs, tid_32, 1x<32*36>, tid_31;
+--:-:-:-:1      ISCADD readCs, tid_64, readCs, 4;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// k = K_blk*32 + tid_31
+// c = C_blk*32 + tid_32<<1
+--:-:-:-:1      SHL tid_32, tid_32, 1;
+01:-:-:-:1      ISCADD  k, idxK, tid_31, 5;
+02:-:-:-:1      ISCADD  c, idxC, tid_32, 5;
+
+
+// offsetF = c*RSK + k here; NOTE(review): the r*SK + s*K terms are presumably added by the later param_Kp / param_SKp pointer steps - confirm.
+--:-:-:-:1      XMAD.LO2C offsetF, c, param_RSK, k;
+
+[+
+    our $determ;
+    return $determ ? q{
+--:-:-:-:1      MOV CRSK, param_CRSK;
+04:-:-:-:1      XMAD.LO offsetF, idxI, CRSK, offsetF, xmad_determ;
+    } : '';
++]
+
+--:-:-:-:1      LEA      Out00.CC, offsetF, param_F[0],     2;
+--:-:-:-:1      LEA.HI.X Out01,    offsetF, param_F[1], RZ, 2;
+
+// P0 = k < param_K; OUTPUT_TRANSFORM combines it with c < param_C to form its store predicate P1.
+--:-:-:-:1      ISETP.LT.AND P0, PT, k, param_K, PT;
+</SCHEDULE_BLOCK>
+
+SKIP0:
+// Out1 = Out0 + param_Kp and Out2 = Out1 + param_Kp (.CC/.X pairs), then round 1 of 4: rows y0 and y2 scaled by alpha.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD     Out10.CC, Out00, param_Kp;
+--:-:-:-:1      IADD.X   Out11,    Out01, RZ;
+--:-:-:-:1      IADD     Out20.CC, Out10, param_Kp;
+--:-:-:-:1      IADD.X   Out21,    Out11, RZ;
+
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y0, alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, ccx7y0, alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y2, alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, ccx3y2, alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y2, alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, ccx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+// Threads with P4 set skip the OUTPUT_TRANSFORM call and the pointer/c advance in every round.
+--:-:-:-:5  @P4 BRA.U SKIP1;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+01:-:-:-:5      IADD   Out00.CC, Out00, param_SKp;
+--:-:-:-:1      IADD c, c, 1;
+--:-:-:-:1      IADD.X Out01,    Out01, RZ;
+02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
+--:-:-:-:1      IADD.X Out11,    Out11, RZ;
+04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
+--:-:-:-:1      IADD.X Out21,    Out21, RZ;
+
+SKIP1:
+// Round 2 of 4: rows y1 and y3. The scaling is interleaved with the stores; a BAR.SYNC precedes the first store.
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y1, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y1, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y1, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP2;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+01:-:-:-:5      IADD   Out00.CC, Out00, param_RSK15_SK2p;
+--:-:-:-:1      IADD c, c, 15;
+--:-:-:-:1      IADD.X Out01,    Out01, RZ;
+02:-:-:-:6      IADD   Out10.CC, Out10, param_RSK15_SK2p;
+--:-:-:-:1      IADD.X Out11,    Out11, RZ;
+04:-:-:-:6      IADD   Out20.CC, Out20, param_RSK15_SK2p;
+--:-:-:-:1      IADD.X Out21,    Out21, RZ;
+
+SKIP2:
+// Round 3 of 4: rows y4 and y6 (non-P4 threads have just stepped the pointers by param_RSK15_SK2p and c by 15).
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y4, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y4, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y4, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y6, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP3;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+01:-:-:-:5      IADD   Out00.CC, Out00, param_SKp;
+--:-:-:-:1      IADD c, c, 1;
+--:-:-:-:1      IADD.X Out01,    Out01, RZ;
+02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
+--:-:-:-:1      IADD.X Out11,    Out11, RZ;
+04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
+--:-:-:-:1      IADD.X Out21,    Out21, RZ;
+
+SKIP3:
+// Round 4 of 4: rows y5 and y7; no pointer advance follows the final OUTPUT_TRANSFORM call.
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y5, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y5, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y5, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP4;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP4:
+
+--:-:-:-:5      EXIT;
+
+OUTPUT_TRANSFORM:
+// Subroutine entered with CAL and left with RET. P1 = (c < param_C) AND P0 predicates every store below.
+--:-:-:-:0      ISETP.LT.AND P1, PT, c, param_C, P0;
+// Load m<j><i> for i, j in 0..5 from [readCs + 4x<(j*6+i)*32>]; the sixth load of each i has i+1 in the third control field.
+[+
+    my $out;
+    foreach my $i (0 .. 5)
+    {
+        foreach my $j (0 .. 5)
+        {
+            my $b = $j == 5 ? $i + 1 : '-';
+            $out .= "--:-:$b:-:1      LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n";
+        }
+    }
+    return $out;
++]
+<SCHEDULE_BLOCK>
+[+
+    my $out;  # first pass, per i: t0 = m1+m2, t1 = m3+m4, then w0, w1, w2 as written in the template below
+    foreach my $i (0 .. 5)
+    {
+        my $w = sprintf "%02x", 1 << $i;  # two-digit hex wait mask 01, 02, 04, ... used on the first two FADDs of group $i
+        $out .= qq{
+$w:-:-:-:1      FADD t0,   m1$i,  m2$i;
+$w:-:-:-:1      FADD t1,   m3$i,  m4$i;
+--:-:-:-:1      FADD m1$i, m1$i, -m2$i;
+--:-:-:-:1      FADD m3$i, m3$i, -m4$i;
+--:-:-:-:1      FADD w0$i, m0$i,  t0;
+--:-:-:-:1      FADD w0$i, w0$i,  t1;
+--:-:-:-:1      FMUL w1$i, m1$i,  0.625;
+--:-:-:-:1      FFMA w1$i, m3$i,  1.5,      w1$i;
+--:-:-:-:1      FFMA w2$i, t1,    2.25,     m5$i;
+--:-:-:-:1      FFMA w2$i, t0,    0.390625, w2$i;
+        };
+    }
+    return $out;
++]
+</SCHEDULE_BLOCK>
+// Second pass, i = 0..2: s<i>0 = w<i>0 + t0 + t1, s<i>1 = 0.625*(w<i>1 - w<i>2) + 1.5*(w<i>3 - w<i>4), s<i>2 = w<i>5 + 2.25*t1 + 0.390625*t0.
+<SCHEDULE_BLOCK>
+[+
+    my $out;
+    foreach my $i (0 .. 2)
+    {
+        $out .= qq{
+--:-:-:-:1      FADD t0,     w${i}1,  w${i}2;
+--:-:-:-:1      FADD t1,     w${i}3,  w${i}4;
+--:-:-:-:1      FADD w${i}1, w${i}1, -w${i}2;
+--:-:-:-:1      FADD w${i}3, w${i}3, -w${i}4;
+--:-:-:-:1      FADD s${i}0, w${i}0,  t0;
+--:-:-:-:1      FADD s${i}0, s${i}0,  t1;
+--:-:-:-:1      FMUL s${i}1, w${i}1,  0.625;
+--:-:-:-:1      FFMA s${i}1, w${i}3,  1.5,      s${i}1;
+--:-:-:-:1      FFMA s${i}2, t1,      2.25,     w${i}5;
+--:-:-:-:1      FFMA s${i}2, t0,      0.390625, s${i}2;
+        };
+    }
+    return $out;
++]
+
+//--:-:1:-:1      I2F.F32.S32 temp, c;
+// Three store groups: s<r>0/s<r>1/s<r>2 go to [Out0]/[Out1]/[Out2] under P1; the pointers step by param_SKp after groups 0 and 1.
+<ORDERED>
+--:1:-:-:1  @P1 [+ output_op() +] [Out0], s00;
+--:2:-:-:1  @P1 [+ output_op() +] [Out1], s01;
+--:3:-:-:1  @P1 [+ output_op() +] [Out2], s02;
+01:-:-:-:6      IADD   Out00.CC, Out00, param_SKp;
+--:-:-:-:1      IADD.X Out01,    Out01, RZ;
+02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
+--:-:-:-:1      IADD.X Out11,    Out11, RZ;
+04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
+--:-:-:-:1      IADD.X Out21,    Out21, RZ;
+</ORDERED>
+
+<ORDERED>
+--:1:-:-:1  @P1 [+ output_op() +] [Out0], s10;
+--:2:-:-:1  @P1 [+ output_op() +] [Out1], s11;
+--:3:-:-:1  @P1 [+ output_op() +] [Out2], s12;
+01:-:-:-:6      IADD   Out00.CC, Out00, param_SKp;
+--:-:-:-:1      IADD.X Out01,    Out01, RZ;
+02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
+--:-:-:-:1      IADD.X Out11,    Out11, RZ;
+04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
+--:-:-:-:1      IADD.X Out21,    Out21, RZ;
+</ORDERED>
+
+<ORDERED>
+--:1:-:-:1  @P1 [+ output_op() +] [Out0], s20;
+--:2:-:-:1  @P1 [+ output_op() +] [Out1], s21;
+--:3:-:-:1  @P1 [+ output_op() +] [Out2], s22;
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32.sass b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32.sass
new file mode 100644
index 0000000..d4b2941
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32.sass
@@ -0,0 +1,1237 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+our $type;  # 'h' selects the 16-bit settings below; any other value selects the 32-bit ones
+our $dtype       = $type eq 'h' ?         'U16' :  '32';
+our $convert_in  = $type eq 'h' ? 'F2F.F32.F16' :    '';  # empty string (false) when $type is not 'h'
+our $convert_out = $type eq 'h' ? 'F2F.F16.F32' :    '';  # empty string (false) when $type is not 'h'
+our $dshift      = $type eq 'h' ?           '1' :   '2';  # log2 of $dsize
+our $dsize       = $type eq 'h' ?           '2' :   '4';
+our $vsize       = $type eq 'h' ?          '64' : '128';
+sub dtype  { return $dtype;  }  # accessor subs returning the package variables set above
+sub dsize  { return $dsize;  }
+sub dshift { return $dshift; }
+sub vsize  { return $vsize;  }
+-]
+# Symbolic names: addr_* are 4x<...> offsets past 32*36*2*4 + 64; the 46 param_* words are c[0x0][0x140]..c[0x0][0x1f4] in 4-byte steps.
+<CONSTANT_MAPPING>
+
+    addr_zero   : 4x<32*36*2*4 + 64 + 0>
+    addr_idx_Y  : 4x<32*36*2*4 + 64 + 4>
+    addr_idx_X  : 4x<32*36*2*4 + 64 + 5>
+    addr_idx_K  : 4x<32*36*2*4 + 64 + 6>
+
+    param_S[0]           : c[0x0][0x140]
+    param_S[1]           : c[0x0][0x144]
+    param_X[0]           : c[0x0][0x148]
+    param_X[1]           : c[0x0][0x14c]
+    param_O[0]           : c[0x0][0x150]
+    param_O[1]           : c[0x0][0x154]
+    param_I[0]           : c[0x0][0x158]
+    param_I[1]           : c[0x0][0x15c]
+    param_F[0]           : c[0x0][0x160]
+    param_F[1]           : c[0x0][0x164]
+    param_alpha          : c[0x0][0x168]
+    param_beta           : c[0x0][0x16c]
+    param_flags          : c[0x0][0x170]
+    param_C              : c[0x0][0x174]
+    param_K              : c[0x0][0x178]
+    param_N              : c[0x0][0x17c]
+    param_Y              : c[0x0][0x180]
+    param_W              : c[0x0][0x184]
+    param_YXN            : c[0x0][0x188]
+    param_XN             : c[0x0][0x18c]
+    param_Y2             : c[0x0][0x190]
+    param_GX             : c[0x0][0x194]
+    param_Xk             : c[0x0][0x198]
+    param_k              : c[0x0][0x19c]
+    param_magic_Xk       : c[0x0][0x1a0]
+    param_shift_Xk       : c[0x0][0x1a4]
+    param_magic_k        : c[0x0][0x1a8]
+    param_shift_k        : c[0x0][0x1ac]
+    param_P              : c[0x0][0x1b0]
+    param_Q              : c[0x0][0x1b4]
+    param_QN             : c[0x0][0x1b8]
+    param_PQN            : c[0x0][0x1bc]
+    param_PQN15          : c[0x0][0x1c0]
+    param_maskN          : c[0x0][0x1c4]
+    param_shiftX         : c[0x0][0x1c8]
+    param_shiftY         : c[0x0][0x1cc]
+    param_superX         : c[0x0][0x1d0]
+    param_superY         : c[0x0][0x1d4]
+    param_pad_x          : c[0x0][0x1d8]
+    param_pad_y          : c[0x0][0x1dc]
+    param_RSK            : c[0x0][0x1e0]
+    param_RSK2p          : c[0x0][0x1e4]
+    param_YXN2p          : c[0x0][0x1e8]
+    param_gridN          : c[0x0][0x1ec]
+    param_gridQN         : c[0x0][0x1f0]
+    param_gridPQN        : c[0x0][0x1f4]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+      // czero<00-63> names registers 0-63, the same numbers listed below for the clx (0-31) and ccx (0-63) accumulators.
+       0-63 : czero<00-63>
+
+     3, 2,11,10 : clx<0-3>y0
+     7, 6,15,14 : clx<0-3>y1
+     1, 0, 9, 8 : clx<0-3>y2
+     5, 4,13,12 : clx<0-3>y3
+    19,18,27,26 : clx<0-3>y4
+    23,22,31,30 : clx<0-3>y5
+    17,16,25,24 : clx<0-3>y6
+    21,20,29,28 : clx<0-3>y7
+
+      32-43 : jl0Ix<0-3>, jl0Fy<0-7>
+      44-51 : jl1Ix<0-3>, jl1Fy<4-7>
+      36-39 : jl1Fy<0-3>
+      // NOTE(review): `swapBuff` (32-43) is spelled differently from `swapBuf` (register 92) declared further down - confirm both names are intended.
+      32-43 ~ swapBuff
+
+      88-89 : track<0-1>
+      90-92 : writeS, pred30, pred36
+
+      // Image Transform
+      44-51 ~ ti<0-5>
+
+      52,53,54,56,57,55 : i<0-5>0
+      59,60,61,63,58,62 : i<0-5>1
+      66,67,68,64,65,69 : i<0-5>2
+      73,74,75,71,72,70 : i<0-5>3
+      87,82,83,85,86,84 : i<0-5>4
+      80,81,76,78,79,77 : i<0-5>5
+      // TI<0-5><0-5> and I<0-5><0-5> below list exactly the same register numbers as i<0-5><0-5> above.
+      52,53,54,56,57,55 : TI<0-5>0
+      59,60,61,63,58,62 : TI<0-5>1
+      66,67,68,64,65,69 : TI<0-5>2
+      73,74,75,71,72,70 : TI<0-5>3
+      87,82,83,85,86,84 : TI<0-5>4
+      80,81,76,78,79,77 : TI<0-5>5
+
+      52,53,54,56,57,55 : I<0-5>0
+      59,60,61,63,58,62 : I<0-5>1
+      66,67,68,64,65,69 : I<0-5>2
+      73,74,75,71,72,70 : I<0-5>3
+      87,82,83,85,86,84 : I<0-5>4
+      80,81,76,78,79,77 : I<0-5>5
+
+      // Filter Transform
+      44-47 ~ rcp6, rcp8, rcp12, rcp24
+
+      52,53,54 : f<0-2>0
+      55,56,57 : f<0-2>1
+      58,59,60 : f<0-2>2
+
+      61,62,63 : tf<0-2>0
+      64,65,66 : tf<0-2>1
+      67,68,69 : tf<0-2>2
+
+      70,71,72,73,74,54 : TF<0-5>0
+      76,77,78,79,80,57 : TF<0-5>1
+      82,83,84,85,86,60 : TF<0-5>2
+
+      61,64,48,49,50,51 : ff<0-5>0
+      52,53,55,56,58,59 : ff<0-5>1
+      61,64,48,49,50,51 : ff<0-5>2
+
+      70,71,72,73,74,54 : F<0-5>0
+      62,63,65,66,67,68 : F<0-5>1
+      52,53,55,56,58,59 : F<0-5>2
+      69,75,81,87,76,77 : F<0-5>3
+      61,64,78,79,80,57 : F<0-5>4
+      82,83,84,85,86,60 : F<0-5>5
+
+      32-39 ~ partialC, idx_K, idx_Y, idx_X
+      40-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, idx_Y2, idx_X2, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, tid31, gx, gy, c, kk, offset, sign, idx_N, nn, x<1-5>, mask_x, super_x, super_y, partC
+
+      32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1
+      48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16
+
+
+     3, 2,11,10,19,18,27,26 : ccx<0-7>y0
+     7, 6,15,14,23,22,31,30 : ccx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2
+     5, 4,13,12,21,20,29,28 : ccx<0-7>y3
+    35,34,43,42,51,50,59,58 : ccx<0-7>y4
+    39,38,47,46,55,54,63,62 : ccx<0-7>y5
+    33,32,41,40,49,48,57,56 : ccx<0-7>y6
+    37,36,45,44,53,52,61,60 : ccx<0-7>y7
+
+      64-79 : jc0Ix<0-7>, jc0Fy<0-7>
+      80-91 : jc1Ix<4-7>, jc1Fy<0-7>
+      64-67 : jc1Ix<0-3>
+
+      64-86 ~ tid16, tid_1, tid128
+         92 = swapBuf
+
+         87 = tid
+      93-95 ~ C, readFs, readIs
+
+      64-85 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, z<1-3>, mask_q
+      86-95 ~ alpha, one, writeCs, readCs, k, preds, offsetO, bias, bsum_offset
+
+      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1
+
+      // t00 80      r00 78
+      // t10 m10     r01 w01
+      // t20 m20     r02 w02
+      // t30 m30     r03 w03
+      // w00 m00     s00 w00
+      // w30 m40     s01 w01
+      // w10 m10     s02 w02
+      // w20 m20     s03 w04
+
+      78 = t0<0-5>, r<0-3>0
+      79 = temp
+
+       3, 2,11,10,19,18 : m<0-5>0
+       1, 9, 0, 8,17,16 : m<0-5>1
+      27,26,25,24,64,65 : m<0-5>2
+                2,11,10 : t10, t20, t30
+                9, 0, 8 : t11, t21, t31
+               26,25,24 : t12, t22, t32
+             3, 2,11,19 : w00, w10, w20, w30
+             1, 9, 0,17 : w01, w11, w21, w31
+            27,26,25,64 : w02, w12, w22, w32
+
+      66,67,68,69,70,71 : m<0-5>3
+      72,73,74,75,76,77 : m<0-5>4
+       8,24,10,65,16,18 : m<0-5>5
+               67,68,69 : t13, t23, t33
+               73,74,75 : t14, t24, t34
+               24,10,65 : t15, t25, t35
+            66,67,68,70 : w03, w13, w23, w33
+            72,73,74,76 : w04, w14, w24, w34
+             8,24,10,16 : w05, w15, w25, w35
+
+                1,27,66 : r01, r02, r03
+                9,26,67 : r11, r12, r13
+                0,25,68 : r21, r22, r23
+               17,64,70 : r31, r32, r33
+             3, 1,27,72 : s00, s01, s02, s03
+             2, 9,26,73 : s10, s11, s12, s13
+            11, 0,25,74 : s20, s21, s22, s23
+            19,17,64,76 : s30, s31, s32, s33
+
+                  80-83 ~ xx<0-3>
+                  78-81 ~ sum<0-3>
+                  82-83 : Sum<0-1>
+                  84-85 : Out<0-1>
+
+             8,10,16,18 ~ b0<0-3>
+            24,65,66,67 ~ b1<0-3>
+            68,69,70,71 ~ b2<0-3>
+            75,77,78,79 ~ b3<0-3>
+
+</REGISTER_MAPPING>
+
+--:-:-:-:0      MOV C,   param_C; // kernel entry: load C and tid, zero 16 bytes at addr_zero, split off tid >= 128
+--:-:1:-:2      S2R tid, SR_TID.X;
+01:-:-:-:0      ISETP.GE.AND P0, PT, tid, 128, PT; // P0 = (tid >= 128)
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+--:-:-:Y:c      LOP.AND partialC, C, 1; // partialC = C & 1
+--:-:-:-:0      IADD C, C, partialC; // C is rounded up to an even value
+--:-:-:-:5  @P0 BRA.U COMPUTE_SETUP; // tid >= 128 goes to COMPUTE_SETUP, the rest fall through to LOAD_SETUP
+
+##############################################################
+LOAD_SETUP:
+// Load threads (tid below 128): decode block indices, store them to shared memory, build shared read/write offsets.
+--:-:1:-:1      S2R idx_YXk, SR_CTAID.X;
+--:-:2:-:1      S2R idx_K,   SR_CTAID.Y;
+
+<SCHEDULE_BLOCK>
+// Emit 8 LDS.U.128 loads from addr_zero into czero00, czero04, ... czero28.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +]
+// P0 = (tid == 0), P1 = (tid >= 64)
+--:-:-:-:1      ISETP.EQ.AND P0, PT, tid, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND P1, PT, tid, 64, PT;
+
+// idx_Y2   = idx_YXk / blk_Xk  (multiply by magic_Xk then shift when magic_Xk != 1, otherwise shift only)
+--:-:-:-:1      MOV  magic_Xk, param_magic_Xk;
+--:-:-:-:1      IADD negXk, RZ, -param_Xk;
+--:-:-:-:1      ISETP.NE.AND P3, PT, magic_Xk, 1, PT;
+01:-:-:-:1  @P3 XMAD     div1, idx_YXk,    magic_Xk,    RZ;
+--:-:-:-:1  @P3 XMAD     div2, idx_YXk,    magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD     div3, idx_YXk.H1, magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk,    div1;
+--:-:-:-:1  @P3 IADD3.RS idx_Y2, div1, div2, div3;
+--:-:-:-:1  @P3 SHR.U32  idx_Y2, idx_Y2,  param_shift_Xk;
+--:-:-:-:1 @!P3 SHR.U32  idx_Y2, idx_YXk, param_shift_Xk;
+
+// idx_Xk  = idx_YXk % blk_Xk  (computed as idx_YXk - idx_Y2 * param_Xk)
+--:-:-:-:1      XMAD.LO2 idx_Xk, negXk, idx_Y2, idx_YXk;
+
+// idx_X2   = idx_Xk / blk_k
+// idx_k   = idx_Xk % blk_k
+--:-:-:-:1      XMAD    idx_X2,  idx_Xk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32 idx_X2,  idx_X2, param_shift_k;
+--:-:-:-:1      XMAD    idx_k,   idx_X2, param_k, RZ;
+--:-:-:-:1      IADD    idx_k,  -idx_k,  idx_Xk;
+
+// idx_K = idx_K * blk_k + idx_k
+02:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+// gx = x2
+// gy = y2 * 2
+--:-:-:-:1      MOV idx_X, idx_X2;
+--:-:-:-:1      SHL idx_Y, idx_Y2, 1;
+
+// Implement a square wave block id remapping (for all but last row (if odd number of rows))
+// if y2 != Y2:
+//     gy += (gx&1) ^ ((gx&2)>>1)
+//     gx /= 2
+--:-:-:-:1      ISETP.NE.AND P4, PT, idx_Y2, param_Y2, PT;
+--:-:-:-:1  @P4 LOP.AND x1, idx_X, 1;
+--:-:-:-:1  @P4 BFE.U32 x2, idx_X, 0x101; // 1 bit at position 1
+--:-:-:-:1  @P4 LOP.XOR x1, x1, x2;
+--:-:-:-:1  @P4 IADD idx_Y, idx_Y, x1;
+--:-:-:-:1  @P4 SHR.U32 idx_X, idx_X, 1;
+
+// Scan backwards on odd rows
+// if y2 & 1:
+//     gx = gridX - gx - 1
+--:-:-:-:1      LOP.AND.NZ P5, RZ, idx_Y2, 1;
+--:-:-:-:1  @P5 IADD idx_X, -idx_X,  param_GX;
+--:-:-:-:1  @P5 IADD idx_X,  idx_X, -1;
+// Only the thread with tid == 0 (P0) stores idx_Y, idx_X and idx_K to shared memory.
+--:6:-:-:1  @P0 STS [addr_idx_Y], idx_Y;
+--:6:-:-:1  @P0 STS [addr_idx_X], idx_X;
+--:6:-:-:1  @P0 STS [addr_idx_K], idx_K;
+
+// x = gx << shiftX
+// y = gy << shiftY
+--:-:-:-:1      SHL gx, idx_X, param_shiftX;
+--:-:-:-:1      SHL gy, idx_Y, param_shiftY;
+
+// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp
+--:-:-:-:1      BFE.U32 super_x, tid, param_superX;
+--:-:-:-:1      BFE.U32 super_y, tid, param_superY;
+--:-:-:-:1      ISCADD gx, super_x,  gx, 2;
+--:-:-:-:1      ISCADD gy, super_y,  gy, 2;
+// tid32_2 = (tid & -32) >> 2
+--:-:-:-:1      LOP.AND  tid32_2,  tid,   -32;
+--:-:-:-:1      SHR.U32  tid32_2,  tid32_2, 2;
+
+// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7)
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid32_2;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      IADD3   readFs, readFs, tid1, tid32_2;
+--:-:-:-:1      ISCADD  readFs, readFs, 4x<32*36*2>, 4;
+
+// c = (tid & 32) >> 5
+--:-:-:-:1      BFE.U32 c, tid, 0x105; // 1 bit at position 5
+
+// writeS = (c*32*36 + (tid & 31)) * 4
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      XMAD writeS, c, 1152, tid31;
+--:-:-:-:1      SHL writeS, writeS, 2;
+
+</SCHEDULE_BLOCK>
+// Threads with tid >= 64 (P1) continue at FILTER_SETUP, the others fall through to IMAGE_SETUP.
+--:-:-:-:5  @P1 BRA.U FILTER_SETUP;
+
+##############################################################
+IMAGE_SETUP:
+// Image-load threads: build the image pointer and bounds masks, then issue the first 6x6 set of loads (i00..i55).
+--:-:1:-:1      S2R idx_N, SR_CTAID.Z;
+<SCHEDULE_BLOCK>
+// Store RZ to four shared-memory slots at writeS, each 32 words apart.
+--:-:-:-:1      STS [writeS + 4x<32*0>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*1>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*2>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*3>], RZ;
+
+// n = idx_N*32 + tid & maskN
+--:-:-:-:1      LOP.AND nn, tid, param_maskN;
+01:-:-:-:1      ISCADD  nn, idx_N, nn, 5;
+
+// n < N
+--:-:-:-:1      ISETP.LT.AND P6, PT, nn, param_N, PT;
+
+// Subtract off the padding
+--:-:-:-:1      IADD gx, gx, -param_pad_x;
+--:-:-:-:1      IADD gy, gy, -param_pad_y;
+
+// offset = c*YXN + y0*XN + x0*N + n;
+--:-:-:-:1      XMAD.S16.U16      offset, gx, param_N,   nn;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, gy, param_XN,  offset;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, c,  param_YXN, offset;
+--:-:-:-:1      ISET.LT.AND sign, offset, RZ, PT;
+// track = param_I + (offset shifted left by dshift). sign presumably sign-extends offset into the high word.
+--:-:-:-:1      LEA    track0.CC, offset, param_I[0], [+ dshift() +];
+--:-:-:-:1      IADD.X track1,    sign,   param_I[1];
+// mask_x bit i (i = 0..5) is set when gx+i lies within 0 .. param_W-1. The six predicates are packed with P2R.
+--:-:-:-:1      IADD x1, gx, 1;
+--:-:-:-:1      IADD x2, gx, 2;
+--:-:-:-:1      IADD x3, gx, 3;
+--:-:-:-:1      IADD x4, gx, 4;
+--:-:-:-:1      IADD x5, gx, 5;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, gx, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, x4, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, x5, param_W, PT;
+--:-:-:-:1      ISETP.GE.AND P0, PT, gx, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, x4, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, x5, RZ, P5;
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x3f;
+// Row predicates: Pi = (gy+i within 0 .. param_Y-1) and (nn less than param_N, held in P6). x1..x5 are reused for gy+i.
+--:-:-:-:1      IADD x1, gy, 1;
+--:-:-:-:1      IADD x2, gy, 2;
+--:-:-:-:1      IADD x3, gy, 3;
+--:-:-:-:1      IADD x4, gy, 4;
+--:-:-:-:1      IADD x5, gy, 5;
+--:-:-:-:1      ISETP.LT.AND P0, PT, gy, param_Y, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_Y, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_Y, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_Y, P6;
+--:-:-:-:1      ISETP.LT.AND P4, PT, x4, param_Y, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, x5, param_Y, P6;
+--:-:-:-:1      ISETP.GE.AND P0, PT, gy, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, x4, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, x5, RZ, P5;
+// pred30 gets mask_x in a 6-bit field per valid row 0..4 (bit positions 0, 6, 12, 18, 24). pred36 holds row 5.
+--:-:-:-:1      SEL pred30, mask_x, RZ, P0;
+--:-:-:-:1  @P1 BFI pred30, mask_x, 0x606, pred30;
+--:-:-:-:1  @P2 BFI pred30, mask_x, 0x60c, pred30;
+--:-:-:-:1  @P3 BFI pred30, mask_x, 0x612, pred30;
+--:-:-:-:1  @P4 BFI pred30, mask_x, 0x618, pred30;
+--:-:-:-:1      SEL pred36, mask_x, RZ, P5;
+
+// P6 = c == partialC == 1
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, 1, PT;
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, partialC, P6;
+--:-:-:-:1      XMAD     partC,    partialC, param_YXN,       RZ;
+--:-:-:-:1      XMAD.PSL partialC, partialC, param_YXN.H1, partC;
+--:-:-:-:1      SHL  partialC, partialC, [+ dshift() +];
+// Per row: unpack that row's six column predicates (all cleared when P6), zero the masked-off registers, then load.
+--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30;
+20:-:-:-:1 @!P0 MOV i00, RZ;
+--:-:-:-:1 @!P1 MOV i01, RZ;
+--:-:-:-:1 @!P2 MOV i02, RZ;
+--:-:-:-:1 @!P3 MOV i03, RZ;
+--:-:-:-:1 @!P4 MOV i04, RZ;
+--:-:-:-:1 @!P5 MOV i05, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i04, [track + [+ dsize() +]x<0*$W*$N + 4*$N>];
+--:-:1:-:1  @P5 LDG.E.CI.[+ dtype() +] i05, [track + [+ dsize() +]x<0*$W*$N + 5*$N>];
+
+--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1 @!P0 MOV i10, RZ;
+--:-:-:-:1 @!P1 MOV i11, RZ;
+--:-:-:-:1 @!P2 MOV i12, RZ;
+--:-:-:-:1 @!P3 MOV i13, RZ;
+--:-:-:-:1 @!P4 MOV i14, RZ;
+--:-:-:-:1 @!P5 MOV i15, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i14, [track + [+ dsize() +]x<1*$W*$N + 4*$N>];
+--:-:2:-:1  @P5 LDG.E.CI.[+ dtype() +] i15, [track + [+ dsize() +]x<1*$W*$N + 5*$N>];
+
+--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1 @!P0 MOV i20, RZ;
+--:-:-:-:1 @!P1 MOV i21, RZ;
+--:-:-:-:1 @!P2 MOV i22, RZ;
+--:-:-:-:1 @!P3 MOV i23, RZ;
+--:-:-:-:1 @!P4 MOV i24, RZ;
+--:-:-:-:1 @!P5 MOV i25, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() +]x<2*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i24, [track + [+ dsize() +]x<2*$W*$N + 4*$N>];
+--:-:3:-:1  @P5 LDG.E.CI.[+ dtype() +] i25, [track + [+ dsize() +]x<2*$W*$N + 5*$N>];
+
+--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1 @!P0 MOV i30, RZ;
+--:-:-:-:1 @!P1 MOV i31, RZ;
+--:-:-:-:1 @!P2 MOV i32, RZ;
+--:-:-:-:1 @!P3 MOV i33, RZ;
+--:-:-:-:1 @!P4 MOV i34, RZ;
+--:-:-:-:1 @!P5 MOV i35, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i34, [track + [+ dsize() +]x<3*$W*$N + 4*$N>];
+--:-:4:-:1  @P5 LDG.E.CI.[+ dtype() +] i35, [track + [+ dsize() +]x<3*$W*$N + 5*$N>];
+// Row 4: afterwards pred30 is shifted left by 24, presumably undoing the four 6-bit right shifts above.
+--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P6 SHF.L.U64 pred30, pred30, 24, pred30;
+--:-:-:-:1 @!P0 MOV i40, RZ;
+--:-:-:-:1 @!P1 MOV i41, RZ;
+--:-:-:-:1 @!P2 MOV i42, RZ;
+--:-:-:-:1 @!P3 MOV i43, RZ;
+--:-:-:-:1 @!P4 MOV i44, RZ;
+--:-:-:-:1 @!P5 MOV i45, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i40, [track + [+ dsize() +]x<4*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i41, [track + [+ dsize() +]x<4*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i42, [track + [+ dsize() +]x<4*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i43, [track + [+ dsize() +]x<4*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i44, [track + [+ dsize() +]x<4*$W*$N + 4*$N>];
+--:-:5:-:1  @P5 LDG.E.CI.[+ dtype() +] i45, [track + [+ dsize() +]x<4*$W*$N + 5*$N>];
+// Row 5 takes its column predicates from pred36.
+--:-:-:-:1 @!P6 R2P PR, pred36, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P0 MOV i50, RZ;
+--:-:-:-:1 @!P1 MOV i51, RZ;
+--:-:-:-:1 @!P2 MOV i52, RZ;
+--:-:-:-:1 @!P3 MOV i53, RZ;
+--:-:-:-:1 @!P4 MOV i54, RZ;
+--:-:-:-:1 @!P5 MOV i55, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i50, [track + [+ dsize() +]x<5*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i51, [track + [+ dsize() +]x<5*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i52, [track + [+ dsize() +]x<5*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i53, [track + [+ dsize() +]x<5*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i54, [track + [+ dsize() +]x<5*$W*$N + 4*$N>];
+--:-:6:-:1  @P5 LDG.E.CI.[+ dtype() +] i55, [track + [+ dsize() +]x<5*$W*$N + 5*$N>];
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+// After the barrier: 64-bit subtract of partialC from track, and writeS advances by 4 * 32*36*2*2 bytes.
+3f:-:-:-:5      IADD   track0.CC, track0, -partialC;
+--:-:-:-:1      IADD   writeS,    writeS, 4x<32*36*2*2>;
+--:-:-:-:0      IADD.X track1,    track1, -RZ;
+
+--:-:-:-:5      BRA.U IMAGE_LOOP;
+
+##############################################################
+FILTER_SETUP:
+// Filter-load threads (reached with tid >= 64): build the filter pointer and issue the first 3x3 set of loads (f00..f22).
+<SCHEDULE_BLOCK>
+// writeS += 32*36*2*4
+--:-:-:-:1      IADD writeS, writeS, 4x<32*36*2>;
+
+--:-:-:-:1      MOV swapBuf, 4x<32*36*2*2>;
+
+// P6 = c == partialC == 1
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, 1, PT;
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, partialC, P6;
+--:-:-:-:1      XMAD     partC,    partialC, param_RSK, RZ;
+--:-:-:-:1      XMAD.PSL partialC, partialC, param_RSK.H1, partC;
+--:-:-:-:1      SHL  partialC, partialC, [+ dshift() +];
+
+// k = idx_K*32 + tid & 31
+--:-:-:-:1      ISCADD  kk, idx_K, tid31,  5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, kk, param_K, !P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, kk, param_K, PT;
+
+// a0 = c*RSK + k
+--:-:-:-:1      XMAD.LO2C offset, c, param_RSK, kk;
+--:-:-:-:1      LEA      track0.CC, offset, param_F[0],     [+ dshift() +];
+--:-:-:-:1      LEA.HI.X track1,    offset, param_F[1], RZ, [+ dshift() +];
+
+--:-:-:-:1      STS [writeS + 4x<32*0>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*1>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*2>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*3>], RZ;
+// Load f00..f22 when P6 holds (kk below param_K and not the c == partialC == 1 case), otherwise zero them.
+20:-:-:-:1 @!P6 MOV f00, RZ;
+--:-:-:-:1 @!P6 MOV f01, RZ;
+--:-:-:-:1 @!P6 MOV f02, RZ;
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f00, [track + [+ dsize() +]x<0*3*$K + 0*$K>];
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f01, [track + [+ dsize() +]x<0*3*$K + 1*$K>];
+--:-:1:-:1  @P6 LDG.E.CI.[+ dtype() +] f02, [track + [+ dsize() +]x<0*3*$K + 2*$K>];
+
+--:-:-:-:1 @!P6 MOV f10, RZ;
+--:-:-:-:1 @!P6 MOV f11, RZ;
+--:-:-:-:1 @!P6 MOV f12, RZ;
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f10, [track + [+ dsize() +]x<1*3*$K + 0*$K>];
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f11, [track + [+ dsize() +]x<1*3*$K + 1*$K>];
+--:-:2:-:1  @P6 LDG.E.CI.[+ dtype() +] f12, [track + [+ dsize() +]x<1*3*$K + 2*$K>];
+
+--:-:-:-:1 @!P6 MOV f20, RZ;
+--:-:-:-:1 @!P6 MOV f21, RZ;
+--:-:-:-:1 @!P6 MOV f22, RZ;
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f20, [track + [+ dsize() +]x<2*3*$K + 0*$K>];
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f21, [track + [+ dsize() +]x<2*3*$K + 1*$K>];
+--:5:3:-:1  @P6 LDG.E.CI.[+ dtype() +] f22, [track + [+ dsize() +]x<2*3*$K + 2*$K>];
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+// After the barrier: 64-bit subtract of partialC from track, writeS advances by swapBuf, and swapBuf is negated.
+10:-:-:-:4      IADD   track0.CC, track0, -partialC;
+--:-:-:-:1      IADD   writeS, writeS, swapBuf;
+--:-:-:-:1      IADD   swapBuf, RZ, -swapBuf;
+--:-:-:-:0      IADD.X track1,    track1, -RZ;
+
+--:-:-:-:5      BRA.U FILTER_LOOP;
+
+##############################################################
+
+COMPUTE_SETUP:
+// Compute threads (tid >= 128): load zeros into the czero registers and derive shared read offsets from tid - 128.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV swapBuf, 4x<32*36*2*2>;
+// Emit 16 LDS.U.128 loads from addr_zero into czero00, czero04, ... czero60.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+--:-:-:-:1      IADD tid128, tid, -128;
+
+// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readFs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid16,  tid128, -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,   1;
+
+--:-:-:-:1      BFE.U32  readIs, tid128, 0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs, readIs, tid16;
+--:-:-:-:1      ISCADD   readIs, readIs, 4x<32*4 + 32*36*2*2>, 4;
+
+--:-:-:-:1      LOP.AND  tid_1,  tid128, 1;
+--:-:-:-:1      LOP.AND  readFs, tid128, 8;
+--:-:-:-:1      SHR.U32  readFs, readFs, 2;
+--:-:-:-:1      IADD3    readFs, readFs, tid16, tid_1;
+--:-:-:-:0      ISCADD   readFs, readFs, 4x<32*4 + 32*36*2*3>, 4;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+// Let Load loop run once to transform initial load and store to shared.
+--:-:-:-:5      BAR.SYNC 0;
+// Load the jc0 operand registers (Ix0, Fy0, Ix4, Fy4) used by the first half of COMPUTE_LOOP.
+--:-:-:-:1      LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>];
+--:-:1:-:2      LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>];
+
+COMPUTE_LOOP:
+[+
+    my %insert = (
+        # Extra instruction text appended after a given FFMA, keyed by "j{half}c{index}".
+        j0c33 => "--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;\n" .
+                 "--:-:-:-:1      IADD C, C, -2;\n",
+        # Wait at the barrier, then step readFs/readIs by -swapBuf and negate swapBuf.
+        j0c62 => "02:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD swapBuf, RZ,    -swapBuf;\n",
+        # Branch back while P0 (set from C > 2 at j0c33) holds, otherwise go to COMPUTE_FINISH.
+        j1c63 => "--:-:-:Y:5  \@P0 BRA.U COMPUTE_LOOP;\n" .
+                 "--:-:-:Y:5      BRA.U COMPUTE_FINISH;\n",
+    );
+    my @cOrder; # [x, y] accumulator coordinates in FFMA issue order (4 x-pairs * 4 y values * 4 swirl offsets = 64)
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y; # alternate the y sweep direction for each x pair
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        my $odd    = $j;
+        my $nOdd   = 1 - $j;
+        my $rsPred = $j == 1 ? '@P0' : '   ';
+        my $bar    = $j == 0 ? '2' : '-';
+        # Operand loads for the other register set (nOdd), predicated on P0 when j == 1, go after c0, c2, c4 and c31.
+        $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd;
+
+        $insert{"j${j}c31"} = sprintf "--:%s:1:-:1  %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd;
+
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Stall 0 when the first inserted line is one of the listed memory/convert/barrier/branch ops, else 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            #$stall = '4' if $stall && $c % 2 == 0 && $j == 0 && $c > 16;
+            # NOTE(review): $yield is computed here but never used in $ctrl below - confirm this is intended.
+            my $yield  = $c % 5 == 0 && $stall ? 'Y' : '-';
+            # Only the first FFMA of each half carries the 01 wait mask.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:-:$stall";
+
+            $out .= sprintf "%s      FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+IMAGE_LOOP:
+// Per iteration: optional F16 to F32 conversion of i00..i55, two FFMA passes, STS of the results, reload of i00..i55.
+[+
+    our $convert_in; return $convert_in ? q{
+01:-:-:-:1      F2F.F32.F16 i00, i00;
+--:-:-:-:1      F2F.F32.F16 i01, i01;
+--:-:-:-:1      F2F.F32.F16 i02, i02;
+--:-:-:-:1      F2F.F32.F16 i03, i03;
+--:-:-:-:1      F2F.F32.F16 i04, i04;
+--:-:1:-:1      F2F.F32.F16 i05, i05;
+
+02:-:-:-:1      F2F.F32.F16 i10, i10;
+--:-:-:-:1      F2F.F32.F16 i11, i11;
+--:-:-:-:1      F2F.F32.F16 i12, i12;
+--:-:-:-:1      F2F.F32.F16 i13, i13;
+--:-:-:-:1      F2F.F32.F16 i14, i14;
+--:-:2:-:1      F2F.F32.F16 i15, i15;
+
+04:-:-:-:1      F2F.F32.F16 i20, i20;
+--:-:-:-:1      F2F.F32.F16 i21, i21;
+--:-:-:-:1      F2F.F32.F16 i22, i22;
+--:-:-:-:1      F2F.F32.F16 i23, i23;
+--:-:-:-:1      F2F.F32.F16 i24, i24;
+--:-:3:-:1      F2F.F32.F16 i25, i25;
+
+08:-:-:-:1      F2F.F32.F16 i30, i30;
+--:-:-:-:1      F2F.F32.F16 i31, i31;
+--:-:-:-:1      F2F.F32.F16 i32, i32;
+--:-:-:-:1      F2F.F32.F16 i33, i33;
+--:-:-:-:1      F2F.F32.F16 i34, i34;
+--:-:4:-:1      F2F.F32.F16 i35, i35;
+
+10:-:-:-:1      F2F.F32.F16 i40, i40;
+--:-:-:-:1      F2F.F32.F16 i41, i41;
+--:-:-:-:1      F2F.F32.F16 i42, i42;
+--:-:-:-:1      F2F.F32.F16 i43, i43;
+--:-:-:-:1      F2F.F32.F16 i44, i44;
+--:-:5:-:1      F2F.F32.F16 i45, i45;
+
+20:-:-:-:1      F2F.F32.F16 i50, i50;
+--:-:-:-:1      F2F.F32.F16 i51, i51;
+--:-:-:-:1      F2F.F32.F16 i52, i52;
+--:-:-:-:1      F2F.F32.F16 i53, i53;
+--:-:-:-:1      F2F.F32.F16 i54, i54;
+--:-:6:-:2      F2F.F32.F16 i55, i55;
+        } : '';
++]
+<SCHEDULE_BLOCK>
+[+
+    my $out; # first pass: for each column index i, combine i0i..i5i into TI0i..TI5i
+    foreach my $i (0 .. 5)
+    {
+        my $w = $i == 0 ? '3f' : '--'; # only the very first generated FFMA carries the 3f wait mask
+        $out .= qq{
+$w:-:-:-:1      FFMA ti4,   i2$i, -2.640625,   i4$i;
+--:-:-:-:1      FFMA ti5,   i3$i, -2.640625,   i5$i;
+--:-:-:-:1      FFMA ti0,   i2$i, -2.25,       i4$i;
+--:-:-:-:1      FFMA ti1,   i1$i, -2.25,       i3$i;
+--:-:-:-:1      FFMA ti2,   i2$i, -0.390625,   i4$i;
+--:-:-:-:1      FFMA ti3,   i1$i, -0.390625,   i3$i;
+--:-:-:-:1      FFMA TI0$i, i0$i,  0.87890625, ti4;
+--:-:-:-:1      FFMA TI5$i, i1$i,  0.87890625, ti5;
+--:-:-:-:1      FFMA TI1$i, ti1,   0.625,      ti0;
+--:-:-:-:1      FFMA TI2$i, ti1,  -0.625,      ti0;
+--:-:-:-:1      FFMA TI3$i, ti3,   1.5,        ti2;
+--:-:-:-:1      FFMA TI4$i, ti3,  -1.5,        ti2;
+        };
+    }
+    return $out;
++]
+// Load the jl0 operands consumed by the first half of the clx FFMAs later in this loop.
+--:-:-:-:1      LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];
+--:-:-:-:1      LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];
+// P0 = (C > 2)
+--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;
+// Advance the 64-bit track pointer by param_YXN2p.
+--:-:-:-:1      IADD   track0.CC, track0, param_YXN2p;
+--:-:-:-:1      IADD.X track1,    track1, RZ;
+// When P0 is false, clear pred30 and the low 6 bits of pred36 so every reload below is predicated off.
+//--:-:-:-:1      LOP32I.AND pred30, pred30, 0xffffff;
+--:-:-:-:1 @!P0 BFI pred36, RZ, 0x600, pred36;
+--:-:-:-:1 @!P0 MOV pred30, RZ;
+
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+// Second pass: for each row index i, combine TIi0..TIi5 into Ii0..Ii5.
+[+
+    my $out;
+    foreach my $i (0 .. 5)
+    {
+        $out .= qq{
+--:-:-:-:1      FFMA ti4,    TI${i}2, -2.640625,   TI${i}4;
+--:-:-:-:1      FFMA ti5,    TI${i}3, -2.640625,   TI${i}5;
+--:-:-:-:1      FFMA ti0,    TI${i}2, -2.25,       TI${i}4;
+--:-:-:-:1      FFMA ti1,    TI${i}1, -2.25,       TI${i}3;
+--:-:-:-:1      FFMA ti2,    TI${i}2, -0.390625,   TI${i}4;
+--:-:-:-:1      FFMA ti3,    TI${i}1, -0.390625,   TI${i}3;
+--:-:-:-:1      FFMA I${i}0, TI${i}0,  0.87890625, ti4;
+--:-:-:-:1      FFMA I${i}5, TI${i}1,  0.87890625, ti5;
+--:-:-:-:1      FFMA I${i}1, ti1,      0.625,      ti0;
+--:-:-:-:1      FFMA I${i}2, ti1,     -0.625,      ti0;
+--:-:-:-:1      FFMA I${i}3, ti3,      1.5,        ti2;
+--:-:-:-:1      FFMA I${i}4, ti3,     -1.5,        ti2;
+        };
+    }
+    return $out;
++]
+<ORDERED>
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 3)>], I03;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 4)>], I04;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 0)>], I00;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 5)>], I05;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 1)>], I01;
+--:1:-:-:1      STS [writeS + 4x<32*(0*6 + 2)>], I02;
+
+
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 0)>], I10;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 5)>], I15;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 3)>], I13;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 1)>], I11;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 2)>], I12;
+--:2:-:-:1      STS [writeS + 4x<32*(1*6 + 4)>], I14;
+// Reload row 0 (i00..i05) for the next iteration. Registers whose predicate is false are zeroed from RZ.
+01:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i04, [track + [+ dsize() +]x<0*$W*$N + 4*$N>];
+--:-:1:-:1  @P5 LDG.E.CI.[+ dtype() +] i05, [track + [+ dsize() +]x<0*$W*$N + 5*$N>];
+--:-:-:-:1 @!P0 I2I.U32.U32 i00, RZ;
+--:-:-:-:1 @!P1 I2I.U32.U32 i01, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i02, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i03, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i04, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i05, RZ;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+// P0..P5 now hold the row 1 predicates from pred30, which is then shifted right by 6 for the following row.
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 0)>], I20;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 5)>], I25;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 1)>], I21;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 2)>], I22;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 3)>], I23;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:3:-:-:1      STS [writeS + 4x<32*(2*6 + 4)>], I24;
+
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 0)>], I30;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 5)>], I35;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 1)>], I31;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 2)>], I32;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 3)>], I33;
+--:4:-:-:1      STS [writeS + 4x<32*(3*6 + 4)>], I34;
+// Reload row 1 (i10..i15).
+02:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i14, [track + [+ dsize() +]x<1*$W*$N + 4*$N>];
+--:-:2:-:1  @P5 LDG.E.CI.[+ dtype() +] i15, [track + [+ dsize() +]x<1*$W*$N + 5*$N>];
+--:-:-:-:1 @!P0 I2I.U32.U32 i10, RZ;
+--:-:-:-:1 @!P1 I2I.U32.U32 i11, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i12, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i13, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i14, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i15, RZ;
+
+--:-:-:-:5      R2P PR, pred30, 0x3f; // FORCE
+
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 0)>], I40;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 5)>], I45;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 1)>], I41;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 2)>], I42;
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+<ORDERED>
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 3)>], I43;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 4)>], I44;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+// Load the jl1 operands Fy4 and Ix0 for the second half of the clx FFMAs.
+--:-:-:-:1      LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];
+--:-:-:-:1      LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];
+// Reload row 2 (i20..i25).
+04:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() +]x<2*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i24, [track + [+ dsize() +]x<2*$W*$N + 4*$N>];
+--:-:3:-:1  @P5 LDG.E.CI.[+ dtype() +] i25, [track + [+ dsize() +]x<2*$W*$N + 5*$N>];
+--:-:-:-:1 @!P0 I2I.U32.U32 i20, RZ;
+--:-:-:-:1 @!P1 I2I.U32.U32 i21, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i22, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i23, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i24, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i25, RZ;
+--:-:-:-:6      R2P PR, pred30, 0x3f; // FORCE
+
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 0)>], I50;
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 5)>], I55;
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 1)>], I51;
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 2)>], I52;
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 3)>], I53;
+--:6:-:-:1      STS [writeS + 4x<32*(5*6 + 4)>], I54;
+// Reload row 3 (i30..i35).
+08:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i34, [track + [+ dsize() +]x<3*$W*$N + 4*$N>];
+--:-:4:-:1  @P5 LDG.E.CI.[+ dtype() +] i35, [track + [+ dsize() +]x<3*$W*$N + 5*$N>];
+--:-:-:-:1 @!P0 I2I.U32.U32 i30, RZ;
+--:-:-:-:1 @!P1 I2I.U32.U32 i31, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i32, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i33, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i34, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i35, RZ;
+--:-:-:-:c      R2P PR, pred30, 0x3f; // FORCE
+// Reload row 4 (i40..i45).
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i40, [track + [+ dsize() +]x<4*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i41, [track + [+ dsize() +]x<4*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i42, [track + [+ dsize() +]x<4*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i43, [track + [+ dsize() +]x<4*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i44, [track + [+ dsize() +]x<4*$W*$N + 4*$N>];
+--:-:5:-:1  @P5 LDG.E.CI.[+ dtype() +] i45, [track + [+ dsize() +]x<4*$W*$N + 5*$N>];
+--:-:-:-:1      SHF.L.U64 pred30, pred30, 24, pred30;
+
+--:-:-:-:1 @!P0 I2I.U32.U32 i40, RZ;
+--:-:-:-:1 @!P1 I2I.U32.U32 i41, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i42, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i43, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i44, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i45, RZ;
+--:-:-:-:a      R2P PR, pred36, 0x3f; // FORCE
+// Reload row 5 (i50..i55). Its predicates come from pred36.
+20:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i50, [track + [+ dsize() +]x<5*$W*$N + 0*$N>];
+--:-:-:-:1 @!P0 I2I.U32.U32 i50, RZ;
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i51, [track + [+ dsize() +]x<5*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i52, [track + [+ dsize() +]x<5*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i53, [track + [+ dsize() +]x<5*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i54, [track + [+ dsize() +]x<5*$W*$N + 4*$N>];
+--:-:6:-:1  @P5 LDG.E.CI.[+ dtype() +] i55, [track + [+ dsize() +]x<5*$W*$N + 5*$N>];
+--:-:-:-:1 @!P1 I2I.U32.U32 i51, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i52, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i53, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i54, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i55, RZ;
+</ORDERED>
+// Generated below: 2 * 32 FFMAs into the clx registers, with the jl1Fy0 load inserted after FFMA 15 of the first half.
+<ORDERED>
+[+
+    our ($vsize, $dsize, $convert_in); # NOTE(review): none of these are referenced in this section
+    my %insert = (
+        j0c15 => "--:-:5:-:1      LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n",
+    );
+    # cOrder holds 32 coordinate pairs: 8 base positions times the 4 swirl offsets.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
+    {
+        my ($x, $y) = @$xy;
+        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Only the first FFMA of the j == 1 half carries the 10 wait mask.
+            my $wait   = $c == 0 && $j == 1 ? "10" : '--';
+
+            my $ctrl   = "$wait:-:-:-:1";
+
+            $out .= sprintf "%s      FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+</ORDERED>
+--:-:-:-:1      LOP.AND.Z P0, RZ, pred36, 0x100;
+--:-:-:-:1      LOP.XOR pred36, pred36, 0x100;
+// P0 = (pred36 bit 0x100 was clear). That bit is toggled every iteration and selects the sign of swapBuff.
+--:-:-:-:1      ISETP.GT.AND P1, PT, C, 0, PT;
+// P1 = (C > 0), evaluated before C is decremented by 2 after this schedule block.
+--:-:-:-:1  @P0 MOV32I swapBuff,  4x<32*36*2*2>;
+// NOTE(review): this loop uses swapBuff while FILTER_SETUP and COMPUTE_SETUP use swapBuf - confirm swapBuff is mapped.
+</SCHEDULE_BLOCK>
+--:-:-:-:1 @!P0 MOV32I swapBuff, -4x<32*36*2*2>;
+--:-:-:-:0      IADD C, C, -2;
+--:-:-:Y:5      BAR.SYNC 0;
+--:-:-:-:1      IADD readFs, readFs,  swapBuff;
+--:-:-:-:1      IADD readIs, readIs,  swapBuff;
+--:-:-:-:1      IADD writeS, writeS, -swapBuff;
+--:-:-:Y:5  @P1 BRA.U IMAGE_LOOP;
+--:-:-:Y:5      BRA.U LOAD_FINISH;
+
+# Top of the filter loop; the predicated branch at the end of the loop body
+# jumps back here.
+FILTER_LOOP:
+
+[+
+    # When $convert_in is set, emit in-place F2F.F32.F16 conversions of the
+    # nine registers f00..f22, in three groups of three.  Each group waits on
+    # one barrier (masks 01, 02, 04) and signals that same barrier (1, 2, 3)
+    # again on its last conversion.  Otherwise emit nothing.
+    our $convert_in; return $convert_in ? q{
+01:-:-:-:1      F2F.F32.F16 f00, f00;
+--:-:-:-:1      F2F.F32.F16 f01, f01;
+--:-:1:-:1      F2F.F32.F16 f02, f02;
+
+02:-:-:-:1      F2F.F32.F16 f10, f10;
+--:-:-:-:1      F2F.F32.F16 f11, f11;
+--:-:2:-:1      F2F.F32.F16 f12, f12;
+
+04:-:-:-:1      F2F.F32.F16 f20, f20;
+--:-:-:-:1      F2F.F32.F16 f21, f21;
+--:-:3:-:1      F2F.F32.F16 f22, f22;
+        } : '';
++]
+<SCHEDULE_BLOCK>
+// Two-pass transform of the 3x3 tile f<r><c> into the 6x6 tile F<r><c>,
+// followed by the stores of F<r><c> relative to writeS.
+// (Presumably the Winograd filter transform with pre-scaled coefficients --
+// TODO confirm against the derivation.)
+// Scaling constants shared by both passes.
+--:-:-:-:1      MOV32I rcp6,  0.688403361344538;
+--:-:-:-:1      MOV32I rcp8,  0.430252100840336;
+--:-:-:-:1      MOV32I rcp24, 0.119514472455649;
+--:-:-:-:1      MOV32I rcp12, 0.179271708683473;
+// Pass 1, shared terms per column c: tf0c = f2c * k, tf1c = -f0c*rcp6 - tf0c,
+// tf2c = f0c*rcp24 + tf0c.  The wait mask 07 covers barriers 1-3.
+07:-:-:-:1      FMUL32I tf00, f20, 0.26890756302521;
+--:-:-:-:1      FMUL32I tf01, f21, 0.26890756302521;
+--:-:-:-:1      FMUL32I tf02, f22, 0.26890756302521;
+--:-:-:-:1      FFMA tf10, f00, -rcp6, -tf00;
+--:-:-:-:1      FFMA tf20, f00,  rcp24, tf00;
+--:-:-:-:1      FFMA tf11, f01, -rcp6, -tf01;
+--:-:-:-:1      FFMA tf21, f01,  rcp24, tf01;
+--:-:-:-:1      FFMA tf12, f02, -rcp6, -tf02;
+--:-:-:-:1      FFMA tf22, f02,  rcp24, tf02;
+
+// Pass 1, outputs TF0c..TF4c for each column c.  TF5c is read by pass 2 but
+// its MOV is commented out.
+// NOTE(review): presumably TF5c shares a register with f2c in the register
+// map, which is outside this excerpt -- confirm.
+--:-:-:-:1      FMUL32I TF00, f00,  1.13777777777778;
+--:-:-:-:1      FFMA TF10, f10, -rcp8,  tf10;
+--:-:-:-:1      FFMA TF20, f10,  rcp8,  tf10;
+--:-:-:-:1      FFMA TF30, f10,  rcp12, tf20;
+--:-:-:-:1      FFMA TF40, f10, -rcp12, tf20;
+//--:-:-:-:1      MOV  TF50, f20;
+
+--:-:-:-:1      FMUL32I TF02, f02,  1.13777777777778;
+--:-:-:-:1      FFMA TF12, f12, -rcp8,  tf12;
+--:-:-:-:1      FFMA TF22, f12,  rcp8,  tf12;
+--:-:-:-:1      FFMA TF32, f12,  rcp12, tf22;
+--:-:-:-:1      FFMA TF42, f12, -rcp12, tf22;
+//--:-:-:-:1      MOV  TF52, f22;
+
+--:-:-:-:1      FMUL32I TF01, f01,  1.13777777777778;
+--:-:-:-:1      FFMA TF11, f11, -rcp8,  tf11;
+--:-:-:-:1      FFMA TF21, f11,  rcp8,  tf11;
+--:-:-:-:1      FFMA TF31, f11,  rcp12, tf21;
+--:-:-:-:1      FFMA TF41, f11, -rcp12, tf21;
+//--:-:-:-:1      MOV  TF51, f21;
+
+// Pass 2, shared terms per row r, built from TF<r>2 and TF<r>0 with the same
+// coefficients as pass 1.
+--:-:-:-:1      FMUL32I ff00, TF02, 0.26890756302521;
+--:-:-:-:1      FMUL32I ff10, TF12, 0.26890756302521;
+--:-:-:-:1      FMUL32I ff20, TF22, 0.26890756302521;
+--:-:-:-:1      FMUL32I ff30, TF32, 0.26890756302521;
+--:-:-:-:1      FMUL32I ff40, TF42, 0.26890756302521;
+--:-:-:-:1      FMUL32I ff50, TF52, 0.26890756302521;
+--:-:-:-:1      FFMA ff01, TF00, -rcp6, -ff00;
+--:-:-:-:1      FFMA ff02, TF00,  rcp24, ff00;
+--:-:-:-:1      FFMA ff11, TF10, -rcp6, -ff10;
+--:-:-:-:1      FFMA ff12, TF10,  rcp24, ff10;
+--:-:-:-:1      FFMA ff21, TF20, -rcp6, -ff20;
+--:-:-:-:1      FFMA ff22, TF20,  rcp24, ff20;
+--:-:-:-:1      FFMA ff31, TF30, -rcp6, -ff30;
+--:-:-:-:1      FFMA ff32, TF30,  rcp24, ff30;
+--:-:-:-:1      FFMA ff41, TF40, -rcp6, -ff40;
+--:-:-:-:1      FFMA ff42, TF40,  rcp24, ff40;
+--:-:-:-:1      FFMA ff51, TF50, -rcp6, -ff50;
+--:-:-:-:1      FFMA ff52, TF50,  rcp24, ff50;
+
+// Pass 2, outputs F<r>0..F<r>4 for each row r.  F<r>5 is stored below but its
+// MOV is commented out.
+// NOTE(review): presumably F<r>5 shares a register with TF<r>2 -- confirm.
+--:-:-:-:1      FMUL32I F00, TF00,  1.13777777777778;
+--:-:-:-:1      FFMA F01, TF01, -rcp8,  ff01;
+--:-:-:-:1      FFMA F02, TF01,  rcp8,  ff01;
+--:-:-:-:1      FFMA F03, TF01,  rcp12, ff02;
+--:-:-:-:1      FFMA F04, TF01, -rcp12, ff02;
+//--:-:-:-:1      MOV  F05, TF02;
+
+--:-:-:-:1      FMUL32I F10, TF10,  1.13777777777778;
+--:-:-:-:1      FFMA F11, TF11, -rcp8,  ff11;
+--:-:-:-:1      FFMA F12, TF11,  rcp8,  ff11;
+--:-:-:-:1      FFMA F13, TF11,  rcp12, ff12;
+--:-:-:-:1      FFMA F14, TF11, -rcp12, ff12;
+//--:-:-:-:1      MOV  F15, TF12;
+
+--:-:-:-:1      FMUL32I F20, TF20,  1.13777777777778;
+--:-:-:-:1      FFMA F21, TF21, -rcp8,  ff21;
+--:-:-:-:1      FFMA F22, TF21,  rcp8,  ff21;
+--:-:-:-:1      FFMA F23, TF21,  rcp12, ff22;
+--:-:-:-:1      FFMA F24, TF21, -rcp12, ff22;
+//--:-:-:-:1      MOV  F25, TF22;
+
+--:-:-:-:1      FMUL32I F30, TF30,  1.13777777777778;
+--:-:-:-:1      FFMA F31, TF31, -rcp8,  ff31;
+--:-:-:-:1      FFMA F32, TF31,  rcp8,  ff31;
+--:-:-:-:1      FFMA F33, TF31,  rcp12, ff32;
+--:-:-:-:1      FFMA F34, TF31, -rcp12, ff32;
+//--:-:-:-:1      MOV  F35, TF32;
+
+--:-:-:-:1      FMUL32I F40, TF40,  1.13777777777778;
+--:-:-:-:1      FFMA F41, TF41, -rcp8,  ff41;
+--:-:-:-:1      FFMA F42, TF41,  rcp8,  ff41;
+--:-:-:-:1      FFMA F43, TF41,  rcp12, ff42;
+--:-:-:-:1      FFMA F44, TF41, -rcp12, ff42;
+//--:-:-:-:1      MOV  F45, TF42;
+
+--:-:-:-:1      FMUL32I F50, TF50,  1.13777777777778;
+--:-:-:-:1      FFMA F51, TF51, -rcp8,  ff51;
+--:-:-:-:1      FFMA F52, TF51,  rcp8,  ff51;
+--:-:-:-:1      FFMA F53, TF51,  rcp12, ff52;
+--:-:-:-:1      FFMA F54, TF51, -rcp12, ff52;
+//--:-:-:-:1      MOV  F55, TF52;
+
+// Loop predicates: P0 = (C > 2) combined with P2, P1 = (C > 0); C is then
+// decremented by 2.
+// NOTE(review): the first ISETP combines with P2 where the second uses PT, and
+// P2 is not written in the visible part of this loop -- confirm it is intended.
+--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, P2;
+--:-:-:-:1      ISETP.GT.AND P1, PT, C, 0, PT;
+--:-:-:-:1      IADD C, C, -2;
+
+// Fetch operand set 0 (jl0Fy4, jl0Ix0, jl0Fy0) for the FFMA block that
+// follows; the last load signals barrier 6.
+--:-:-:-:1      LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];
+--:-:-:-:1      LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:6:-:1      LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];
+
+// Store the transformed tile: F<r><c> goes to word offset 32*(r*6 + c) from
+// writeS.  F50..F54 are not stored here; the generator in the next schedule
+// block interleaves those five stores with its FFMAs.
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 5)>], F55;
+
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 0)>], F00;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 1)>], F01;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 2)>], F02;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 3)>], F03;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 4)>], F04;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 5)>], F05;
+
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 0)>], F10;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 1)>], F11;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 2)>], F12;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 3)>], F13;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 4)>], F14;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 5)>], F15;
+
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 0)>], F20;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 1)>], F21;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 2)>], F22;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 3)>], F23;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 4)>], F24;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 5)>], F25;
+
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 0)>], F30;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 1)>], F31;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 2)>], F32;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 3)>], F33;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 4)>], F34;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 5)>], F35;
+
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 0)>], F40;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 1)>], F41;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 2)>], F42;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 3)>], F43;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 4)>], F44;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 5)>], F45;
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+// Advance the two-register pointer track0/track1 by param_RSK2p: the low word
+// produces the carry (.CC) that the high word consumes (.X).  The wait mask 20
+// holds the add until barrier 6 has been signalled.
+20:-:-:-:1      IADD   track0.CC, track0, param_RSK2p;
+--:-:-:-:1      IADD.X track1,    track1, RZ;
+<ORDERED>
+[+
+    # Generates the FFMA stream for one filter-loop iteration and returns it as
+    # assembly text.  For each operand set j (0, 1) it emits 32 FFMAs, one per
+    # accumulator clx<x>y<y> (x in 0..3, y in 0..7):
+    #     clx<x>y<y> += jl<j>Ix<x> * jl<j>Fy<y>
+    # $dtype, $dsize and $K are interpolated into the LDG strings below.
+    our ($dtype, $dsize, $K);
+
+    # Extra instructions keyed "j<j>c<c>"; each is emitted right after the c-th
+    # FFMA of set j.
+    my %insert = (
+        # Loads of operand set 1; the last one signals barrier 5.
+        j0c0  => "--:-:-:-:1      LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n",
+        j0c1  => "--:-:-:-:1      LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n",
+        j0c15 => "--:-:5:-:1      LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n",
+
+        # Stores of F50..F54, the five values the preceding schedule block
+        # leaves unstored.
+        j0c5  => "--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 0)>], F50;\n",
+        j0c7  => "--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 1)>], F51;\n",
+        j0c9  => "--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 2)>], F52;\n",
+        j0c11 => "--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 3)>], F53;\n",
+        j0c13 => "--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 4)>], F54;\n",
+
+        # P0-predicated global loads of the next 3x3 tile f00..f22 from track,
+        # one barrier per row (1, 2, 3).
+        j1c1  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];\n",
+        j1c2  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];\n",
+        j1c3  => "--:-:1:-:1  \@P0 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];\n",
+
+        j1c4  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];\n",
+        j1c5  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];\n",
+        j1c6  => "--:-:2:-:1  \@P0 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];\n",
+
+        j1c7  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];\n",
+        j1c8  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];\n",
+        j1c9  => "--:-:3:-:1  \@P0 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>];\n",
+    );
+
+    # Accumulator visit order: eight base (x, y) positions, each expanded by the
+    # four @swirl offsets, giving 32 distinct (x, y) pairs.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
+    {
+        my ($x, $y) = @$xy;
+        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            # The first FFMA of set 1 waits with mask 10, i.e. on barrier 5,
+            # which the j0c15 load above signals.
+            my $wait   = $c == 0 && $j == 1 ? "10" : '--';
+
+            my $ctrl   = "$wait:-:-:-:1";
+
+            $out .= sprintf "%s      FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+</ORDERED>
+
+</SCHEDULE_BLOCK>
+# Filter-loop epilogue: step both read pointers by -swapBuf and the write
+# pointer by +swapBuf, synchronise, negate swapBuf for the next iteration and
+# loop while P1 holds.
+# NOTE(review): the image-loop epilogue above spells this register swapBuff
+# (double f); confirm both names exist in the register map (not visible here).
+--:-:-:-:1      IADD readFs, readFs, -swapBuf;
+--:-:-:-:1      IADD readIs, readIs, -swapBuf;
+--:-:-:-:0      IADD writeS, writeS,  swapBuf;
+--:-:-:Y:5      BAR.SYNC 0;
+--:-:-:-:0      IADD swapBuf, RZ,    -swapBuf;
+--:-:-:Y:5  @P1 BRA.U FILTER_LOOP;
+
+
+# Common exit point of the image and filter loops; the rest of the kernel comes
+# from the shared include below.
+LOAD_FINISH:
+
+[-
+    # Package-level constants defined ahead of the include; presumably consumed
+    # by the included file (verify there).  Numerically trans3 equals trans2
+    # squared and trans1 equals trans2 cubed.
+    our $trans1 = "0.244140625";
+    our $trans2 = "0.625";
+    our $trans3 = "0.390625";
+-]
+
+<INCLUDE file="xconv_winograd_4x4_3x3_32x32_common.sass"/>
diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_X.sass b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_X.sass
new file mode 100644
index 0000000..15a0f0b
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_X.sass
@@ -0,0 +1,687 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+# Element-type configuration.  $type is set outside this file: 'h' selects the
+# 16-bit settings, any other value the 32-bit ones.
+our $type;
+our ($dtype, $convert_in, $convert_out, $dshift, $dsize, $vsize);
+
+if ($type eq 'h')
+{
+    # 2-byte elements: values are converted with F2F on the way in and out.
+    $dtype       = 'U16';
+    $convert_in  = 'F2F.F32.F16';
+    $convert_out = 'F2F.F16.F32';
+    $dshift      = '1';
+    $dsize       = '2';
+    $vsize       = '64';
+}
+else
+{
+    # 4-byte elements: no conversion instructions are needed.
+    $dtype       = '32';
+    $convert_in  = '';
+    $convert_out = '';
+    $dshift      = '2';
+    $dsize       = '4';
+    $vsize       = '128';
+}
+
+# Accessors for use inside inline code blocks in the assembly text.
+sub dtype
+{
+    return $dtype;
+}
+sub dsize
+{
+    return $dsize;
+}
+sub dshift
+{
+    return $dshift;
+}
+sub vsize
+{
+    return $vsize;
+}
+-]
+
+# Named constants used by the instructions below.
+#  - addr_*  : fixed byte addresses used with STS/LDS.  addr_zero receives a
+#              128-bit store of RZ at kernel entry; the three addr_idx_* words
+#              follow it and hold idx_Y, idx_X and idx_K.
+#  - param_* : kernel parameters read from constant bank 0, laid out as
+#              consecutive 4-byte slots starting at 0x140.  The [0]/[1] pairs
+#              are the two words fed to LEA / LEA.HI.X when forming pointers.
+<CONSTANT_MAPPING>
+
+    addr_zero   : 4x<32*36*2*4 + 64 + 0>
+    addr_idx_Y  : 4x<32*36*2*4 + 64 + 4>
+    addr_idx_X  : 4x<32*36*2*4 + 64 + 5>
+    addr_idx_K  : 4x<32*36*2*4 + 64 + 6>
+
+    param_S[0]           : c[0x0][0x140]
+    param_S[1]           : c[0x0][0x144]
+    param_X[0]           : c[0x0][0x148]
+    param_X[1]           : c[0x0][0x14c]
+    param_O[0]           : c[0x0][0x150]
+    param_O[1]           : c[0x0][0x154]
+    param_I[0]           : c[0x0][0x158]
+    param_I[1]           : c[0x0][0x15c]
+    param_F[0]           : c[0x0][0x160]
+    param_F[1]           : c[0x0][0x164]
+    param_alpha          : c[0x0][0x168]
+    param_beta           : c[0x0][0x16c]
+    param_flags          : c[0x0][0x170]
+    param_C              : c[0x0][0x174]
+    param_K              : c[0x0][0x178]
+    param_N              : c[0x0][0x17c]
+    param_Xk             : c[0x0][0x180]
+    param_k              : c[0x0][0x184]
+    param_magic_Xk       : c[0x0][0x188]
+    param_shift_Xk       : c[0x0][0x18c]
+    param_magic_k        : c[0x0][0x190]
+    param_shift_k        : c[0x0][0x194]
+    param_C_1152         : c[0x0][0x198]
+    param_GXS_C_1152     : c[0x0][0x19c]
+    param_GYS_GXS_C_1152 : c[0x0][0x1a0]
+    param_P              : c[0x0][0x1a4]
+    param_Q              : c[0x0][0x1a8]
+    param_QN             : c[0x0][0x1ac]
+    param_PQN            : c[0x0][0x1b0]
+    param_PQN15          : c[0x0][0x1b4]
+    param_maskN          : c[0x0][0x1b8]
+    param_shiftX         : c[0x0][0x1bc]
+    param_shiftY         : c[0x0][0x1c0]
+    param_superX         : c[0x0][0x1c4]
+    param_superY         : c[0x0][0x1c8]
+    param_gridN          : c[0x0][0x1cc]
+    param_gridQN         : c[0x0][0x1d0]
+    param_gridPQN        : c[0x0][0x1d4]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+      // Registers 0-63 under the names used by the zero-fill loads from
+      // addr_zero in the two setup sections.
+       0-63 : czero<00-63>
+
+      // Accumulator tile clx<x>y<y> (x in 0..3, y in 0..7) on registers 0-31.
+     3, 2,11,10 : clx<0-3>y0
+     7, 6,15,14 : clx<0-3>y1
+     1, 0, 9, 8 : clx<0-3>y2
+     5, 4,13,12 : clx<0-3>y3
+    19,18,27,26 : clx<0-3>y4
+    23,22,31,30 : clx<0-3>y5
+    17,16,25,24 : clx<0-3>y6
+    21,20,29,28 : clx<0-3>y7
+
+      // Operand registers for the clx tile.  With in-order assignment,
+      // jl1Fy<0-3> (36-39) occupy the same registers as jl0Fy<0-3>.
+      32-43 : jl0Ix<0-3>, jl0Fy<0-7>
+      44-51 : jl1Ix<0-3>, jl1Fy<4-7>
+      36-39 : jl1Fy<0-3>
+
+      52-87 : T0<0-3>, T1<0-3>, T2<0-3>, T3<0-3>, T4<0-3>, T5<0-3>, T6<0-3>, T7<0-3>, T8<0-3>
+      88-89 : track<0-1>
+      90-91 ~ writeS
+
+      // Setup-time temporaries; their ranges overlap the operand and T
+      // registers above.
+      32-39 ~ partialC, idx_K, idx_Y, idx_X
+      40-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, tid31, c, offset, idx_N
+
+      32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1
+      48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16
+
+
+      // Accumulator tile ccx<x>y<y> (x, y in 0..7) on registers 0-63, updated
+      // by the FFMAs generated under COMPUTE_LOOP.
+     3, 2,11,10,19,18,27,26 : ccx<0-7>y0
+     7, 6,15,14,23,22,31,30 : ccx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2
+     5, 4,13,12,21,20,29,28 : ccx<0-7>y3
+    35,34,43,42,51,50,59,58 : ccx<0-7>y4
+    39,38,47,46,55,54,63,62 : ccx<0-7>y5
+    33,32,41,40,49,48,57,56 : ccx<0-7>y6
+    37,36,45,44,53,52,61,60 : ccx<0-7>y7
+
+      // Operand registers for the ccx tile.  With in-order assignment,
+      // jc1Ix<0-3> (64-67) occupy the same registers as jc0Ix<0-3>.
+      64-79 : jc0Ix<0-7>, jc0Fy<0-7>
+      80-91 : jc1Ix<4-7>, jc1Fy<0-7>
+      64-67 : jc1Ix<0-3>
+
+      64-86 ~ tid16, tid_1, tid128
+
+         87 = tid
+      92-95 ~ C, swapBuf, readFs, readIs
+
+      64-85 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, z<1-3>, mask_q
+      86-95 ~ alpha, one, writeCs, readCs, k, preds, offsetO, bias, bsum_offset
+
+      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1
+
+      // NOTE(review): the table below is an unexplained scratch note; its
+      // meaning cannot be determined from this file -- confirm or remove.
+      // t00 80      r00 78
+      // t10 m10     r01 w01
+      // t20 m20     r02 w02
+      // t30 m30     r03 w03
+      // w00 m00     s00 w00
+      // w30 m40     s01 w01
+      // w10 m10     s02 w02
+      // w20 m20     s03 w04
+
+      78 = t0<0-5>, r<0-3>0
+      79 = temp
+
+      // The m, t, w, r, s and b names below are placed on explicitly chosen
+      // registers, many of them shared between names.  None of these names is
+      // used in the visible part of this file.
+       3, 2,11,10,19,18 : m<0-5>0
+       1, 9, 0, 8,17,16 : m<0-5>1
+      27,26,25,24,64,65 : m<0-5>2
+                2,11,10 : t10, t20, t30
+                9, 0, 8 : t11, t21, t31
+               26,25,24 : t12, t22, t32
+             3, 2,11,19 : w00, w10, w20, w30
+             1, 9, 0,17 : w01, w11, w21, w31
+            27,26,25,64 : w02, w12, w22, w32
+
+      66,67,68,69,70,71 : m<0-5>3
+      72,73,74,75,76,77 : m<0-5>4
+       8,24,10,65,16,18 : m<0-5>5
+               67,68,69 : t13, t23, t33
+               73,74,75 : t14, t24, t34
+               24,10,65 : t15, t25, t35
+            66,67,68,70 : w03, w13, w23, w33
+            72,73,74,76 : w04, w14, w24, w34
+             8,24,10,16 : w05, w15, w25, w35
+
+                1,27,66 : r01, r02, r03
+                9,26,67 : r11, r12, r13
+                0,25,68 : r21, r22, r23
+               17,64,70 : r31, r32, r33
+             3, 1,27,72 : s00, s01, s02, s03
+             2, 9,26,73 : s10, s11, s12, s13
+            11, 0,25,74 : s20, s21, s22, s23
+            19,17,64,76 : s30, s31, s32, s33
+
+                  80-83 ~ xx<0-3>
+                  78-81 ~ sum<0-3>
+                  82-83 : Sum<0-1>
+                  84-85 : Out<0-1>
+
+             8,10,16,18 ~ b0<0-3>
+            24,65,66,67 ~ b1<0-3>
+            68,69,70,71 ~ b2<0-3>
+            75,77,78,79 ~ b3<0-3>
+
+</REGISTER_MAPPING>
+
+# Kernel entry.  Loads C from param_C and rounds it up to an even count
+# (partialC = C & 1 is added back), initialises swapBuf, stores 128 bits of
+# zero at addr_zero, and sends threads with tid >= 128 to COMPUTE_SETUP.  The
+# remaining threads fall through to LOAD_SETUP.
+--:-:-:-:0      MOV C,   param_C;
+--:-:1:-:1      S2R tid, SR_TID.X;
+--:-:-:-:1      MOV swapBuf, 4x<32*36*2*2>;
+01:-:-:-:0      ISETP.GE.AND P0, PT, tid, 128, PT;
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+--:-:-:Y:c      LOP.AND partialC, C, 1;
+--:-:-:-:0      IADD C, C, partialC;
+--:-:-:-:5  @P0 BRA.U COMPUTE_SETUP;
+
+##############################################################
+# Setup for the loader threads (tid < 128): decode the block indices, publish
+# them through the addr_idx_* words, and derive the read/write addresses and
+# predicates.  Threads with tid >= 64 then branch to FILTER_SETUP; the others
+# fall through to IMAGE_SETUP.
+LOAD_SETUP:
+
+--:-:1:-:1      S2R idx_YXk, SR_CTAID.X;
+--:-:2:-:1      S2R idx_K,   SR_CTAID.Y;
+
+<SCHEDULE_BLOCK>
+
+// Zero registers 0-31: eight 128-bit loads from addr_zero into czero00,
+// czero04, ... czero28.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +]
+
+// P0 = (tid == 0), P1 = (tid >= 64)
+--:-:-:-:1      ISETP.EQ.AND P0, PT, tid, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND P1, PT, tid, 64, PT;
+
+// idx_Y   = idx_YXk / blk_Xk
+// P3 = (magic_Xk != 1): multiply by the magic number and shift; otherwise the
+// division is a plain shift.
+--:-:-:-:1      MOV  magic_Xk, param_magic_Xk;
+--:-:-:-:1      IADD negXk, RZ, -param_Xk;
+--:-:-:-:1      ISETP.NE.AND P3, PT, magic_Xk, 1, PT;
+01:-:-:-:1  @P3 XMAD     div1, idx_YXk,    magic_Xk,    RZ;
+--:-:-:-:1  @P3 XMAD     div2, idx_YXk,    magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD     div3, idx_YXk.H1, magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk,    div1;
+--:-:-:-:1  @P3 IADD3.RS idx_Y, div1, div2, div3;
+--:-:-:-:1  @P3 SHR.U32  idx_Y, idx_Y,   param_shift_Xk;
+--:-:-:-:1 @!P3 SHR.U32  idx_Y, idx_YXk, param_shift_Xk;
+
+// idx_Xk  = idx_YXk % blk_Xk
+--:-:-:-:1      XMAD.LO2 idx_Xk, negXk, idx_Y, idx_YXk;
+
+// idx_X   = idx_Xk / blk_k
+// idx_k   = idx_Xk % blk_k
+--:-:-:-:1      XMAD    idx_X,  idx_Xk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32 idx_X,  idx_X,  param_shift_k;
+--:-:-:-:1      XMAD    idx_k,  idx_X,  param_k, RZ;
+--:-:-:-:1      IADD    idx_k, -idx_k,  idx_Xk;
+
+// idx_K = idx_K * blk_k + idx_k
+02:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+// Thread 0 publishes the decoded indices through the addr_idx_* words.
+--:-:-:-:1  @P0 STS [addr_idx_Y], idx_Y;
+--:-:-:-:1  @P0 STS [addr_idx_X], idx_X;
+--:-:-:-:1  @P0 STS [addr_idx_K], idx_K;
+
+
+// tid32_2 = (tid & -32) >> 2
+--:-:-:-:1      LOP.AND  tid32_2,  tid,    -32;
+--:-:-:-:1      SHR.U32  tid32_2,  tid32_2, 2;
+
+// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7), then scaled by 16 bytes
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid32_2;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1),
+// then scaled by 16 bytes and offset by 32*36*2 words
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      IADD3   readFs, readFs, tid1, tid32_2;
+--:-:-:-:1      ISCADD  readFs, readFs, 4x<32*36*2>, 4;
+
+// c = (tid & 63) >> 5
+--:-:-:-:1      BFE.U32 c, tid, 0x105; // 1 bit at position 5
+
+// partialC  = (2 - partialC)
+// P6        = c < partialC
+// partialC *= 32*36 * itemsize
+--:-:-:-:1      IADD partialC, -partialC, 2;
+--:-:-:-:1      ISETP.LT.AND P6, PT, c, partialC, PT;
+--:-:-:-:1      XMAD partialC,  partialC, 1x<32*36 * $dsize>, RZ;
+
+// writeS = (c*32*36 + (tid & 31)*4)*4
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      SHL writeS, tid31, 4;
+--:-:-:-:1      XMAD writeS, c, 4x<32*36>, writeS;
+
+// offset = c*32*36 + tid31*4
+--:-:-:-:1      SHL tid31, tid31, 2;
+--:-:-:-:1      XMAD offset, c, 1x<32*36>, tid31;
+
+
+// P5 = C > 2
+--:-:-:-:1      ISETP.GT.AND P5, PT, C, 2, PT;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P1 BRA.U FILTER_SETUP;
+
+##############################################################
+# Loader threads with tid < 64: add the per-block image offset and form the
+# two-word pointer track0/track1 from param_I, then jump to LOAD.
+IMAGE_SETUP:
+
+--:-:1:-:1      S2R idx_N, SR_CTAID.Z;
+<SCHEDULE_BLOCK>
+// (GN,GYS,GXS,C,6,6,32)
+// offset += (idx_N*GYS*GXS*C*32*36 + idx_Y*GXS*C*32*36 + idx_X*C*32*36) * itemsize;
+--:-:-:-:1      XMAD.LO2C offset, idx_X, param_C_1152, offset;
+--:-:-:-:1      XMAD.LO2C offset, idx_Y, param_GXS_C_1152, offset;
+01:-:-:-:1      XMAD.LO2C offset, idx_N, param_GYS_GXS_C_1152, offset;
+// track = param_I + (offset << dshift)
+--:-:-:-:1      LEA      track0.CC, offset, param_I[0],     [+ dshift() +];
+--:-:-:-:0      LEA.HI.X track1,    offset, param_I[1], RZ, [+ dshift() +];
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BRA.U LOAD;
+
+##############################################################
+# Loader threads with 64 <= tid < 128: move writeS past the first 32*36*2
+# words, add the per-block filter offset and form the two-word pointer
+# track0/track1 from param_F.  Execution falls through to LOAD.
+FILTER_SETUP:
+
+<SCHEDULE_BLOCK>
+// writeS += 32*36*2*4
+--:-:-:-:1      IADD writeS, writeS, 4x<32*36*2>;
+
+// (kBlks,C,6,6,32)
+// offset += (idx_K*C*32*36) * itemsize;
+--:-:-:-:1      XMAD.LO2C offset, idx_K, param_C_1152, offset;
+// track = param_F + (offset << dshift)
+--:-:-:-:1      LEA      track0.CC, offset, param_F[0],     [+ dshift() +];
+--:-:-:-:2      LEA.HI.X track1,    offset, param_F[1], RZ, [+ dshift() +];
+</SCHEDULE_BLOCK>
+
+##############################################################
+# First fetch of the nine vectors T0..T8.  Threads with P6 set read them from
+# global memory at track; the others read zeros from addr_zero instead.  Each
+# group of three shares one barrier: 2 for T0-T2, 3 for T3-T5, 4 for T6-T8.
+LOAD:
+
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T0, [track + 4x<0*32 * $dsize>];
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T1, [track + 4x<1*32 * $dsize>];
+--:-:2:-:1  @P6 LDG.E.[+ vsize() +] T2, [track + 4x<2*32 * $dsize>];
+
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T0, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T1, [addr_zero];
+--:-:2:-:1 @!P6 LDS.U.[+ vsize() +] T2, [addr_zero];
+
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T3, [track + 4x<3*32 * $dsize>];
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T4, [track + 4x<4*32 * $dsize>];
+--:-:3:-:1  @P6 LDG.E.[+ vsize() +] T5, [track + 4x<5*32 * $dsize>];
+
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T3, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T4, [addr_zero];
+--:-:3:-:1 @!P6 LDS.U.[+ vsize() +] T5, [addr_zero];
+
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T6, [track + 4x<6*32 * $dsize>];
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T7, [track + 4x<7*32 * $dsize>];
+--:-:4:-:1  @P6 LDG.E.[+ vsize() +] T8, [track + 4x<8*32 * $dsize>];
+
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T6, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T7, [addr_zero];
+--:-:4:-:1 @!P6 LDS.U.[+ vsize() +] T8, [addr_zero];
+
+[+
+    # Emits the nine 128-bit stores of T0..T8 to writeS, waiting on the load
+    # barriers set above (masks 02, 04, 08 for barriers 2, 3, 4).
+    # With $convert_in set, each Tn is first widened in place: the four packed
+    # halves held in Tn0 and Tn1 are converted into Tn3, Tn2, Tn1, Tn0 in that
+    # order, so every source half is read before its register is overwritten.
+    # Barriers 5 and 6 (masks 10 and 20), and the freed load barriers, order
+    # each conversion group before the store of the same Tn.
+    our $convert_in;
+    return $convert_in ? q{
+
+02:-:-:-:1      F2F.F32.F16 T03, T01.H1;
+--:-:-:-:1      F2F.F32.F16 T02, T01.H0;
+--:-:-:-:1      F2F.F32.F16 T01, T00.H1;
+--:-:2:-:1      F2F.F32.F16 T00, T00.H0;
+
+--:-:-:-:1      F2F.F32.F16 T13, T11.H1;
+--:-:-:-:1      F2F.F32.F16 T12, T11.H0;
+--:-:-:-:1      F2F.F32.F16 T11, T10.H1;
+--:-:5:-:1      F2F.F32.F16 T10, T10.H0;
+
+--:-:-:-:1      F2F.F32.F16 T23, T21.H1;
+--:-:-:-:1      F2F.F32.F16 T22, T21.H0;
+--:-:-:-:1      F2F.F32.F16 T21, T20.H1;
+--:-:6:-:1      F2F.F32.F16 T20, T20.H0;
+
+02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;
+
+04:-:-:-:1      F2F.F32.F16 T33, T31.H1;
+--:-:-:-:1      F2F.F32.F16 T32, T31.H0;
+--:-:-:-:1      F2F.F32.F16 T31, T30.H1;
+--:-:3:-:1      F2F.F32.F16 T30, T30.H0;
+
+10:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;
+
+--:-:-:-:1      F2F.F32.F16 T43, T41.H1;
+--:-:-:-:1      F2F.F32.F16 T42, T41.H0;
+--:-:-:-:1      F2F.F32.F16 T41, T40.H1;
+--:-:5:-:1      F2F.F32.F16 T40, T40.H0;
+
+20:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;
+
+--:-:-:-:1      F2F.F32.F16 T53, T51.H1;
+--:-:-:-:1      F2F.F32.F16 T52, T51.H0;
+--:-:-:-:1      F2F.F32.F16 T51, T50.H1;
+--:-:6:-:1      F2F.F32.F16 T50, T50.H0;
+
+04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;
+
+08:-:-:-:1      F2F.F32.F16 T63, T61.H1;
+--:-:-:-:1      F2F.F32.F16 T62, T61.H0;
+--:-:-:-:1      F2F.F32.F16 T61, T60.H1;
+--:-:4:-:1      F2F.F32.F16 T60, T60.H0;
+
+10:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;
+
+--:-:-:-:1      F2F.F32.F16 T73, T71.H1;
+--:-:-:-:1      F2F.F32.F16 T72, T71.H0;
+--:-:-:-:1      F2F.F32.F16 T71, T70.H1;
+--:-:5:-:1      F2F.F32.F16 T70, T70.H0;
+
+20:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;
+
+--:-:-:-:1      F2F.F32.F16 T83, T81.H1;
+--:-:-:-:1      F2F.F32.F16 T82, T81.H0;
+--:-:-:-:1      F2F.F32.F16 T81, T80.H1;
+--:-:6:-:1      F2F.F32.F16 T80, T80.H0;
+
+08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
+10:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
+20:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;
+
+    } : q{
+02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;
+--:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;
+--:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;
+04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;
+--:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;
+--:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;
+08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
+--:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
+--:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;
+    };
++]
+
+# Advance track by partialC; the carry is folded into track1 after the barrier
+# (the IADDs in between do not use .CC).
+--:-:-:-:0      IADD   track0.CC, track0, partialC;
+
+# Synchronise after the first stores, then swap writeS to the other buffer and
+# negate swapBuf.
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeS, writeS, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD.X track1,    track1, RZ;
+
+# Prime operand set 0 for the loop; the last load signals barrier 1.
+--:-:-:-:1      LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:1:-:1      LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];
+
+# Prefetch the next T0..T8 when P5 (C > 2) holds.  The last load also sets read
+# barrier 6, which the first track update under LOAD_LOOP waits on (mask 20).
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T0, [track + 4x<0*32 * $dsize>];
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T1, [track + 4x<1*32 * $dsize>];
+--:-:2:-:1  @P5 LDG.E.[+ vsize() +] T2, [track + 4x<2*32 * $dsize>];
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T3, [track + 4x<3*32 * $dsize>];
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T4, [track + 4x<4*32 * $dsize>];
+--:-:3:-:1  @P5 LDG.E.[+ vsize() +] T5, [track + 4x<5*32 * $dsize>];
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T6, [track + 4x<6*32 * $dsize>];
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T7, [track + 4x<7*32 * $dsize>];
+--:6:4:-:1  @P5 LDG.E.[+ vsize() +] T8, [track + 4x<8*32 * $dsize>];
+
+--:-:-:-:5      BRA.U LOAD_LOOP;
+
+##############################################################
+
+# Setup for the compute threads (tid >= 128): zero all 64 accumulator
+# registers, derive readIs/readFs from tid128 = tid - 128, wait at the barrier,
+# then load operand set 0 (the last load signals barrier 1).
+COMPUTE_SETUP:
+
+<SCHEDULE_BLOCK>
+// Sixteen 128-bit loads from addr_zero into czero00, czero04, ... czero60.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+--:-:-:-:1      IADD tid128, tid, -128;
+
+// In the two formulas below "tid" stands for tid128.  Both results are then
+// scaled by 16 bytes and offset by the constant in the ISCADD.
+// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readFs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid16,  tid128, -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,   1;
+
+--:-:-:-:1      BFE.U32  readIs, tid128, 0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs, readIs, tid16;
+--:-:-:-:1      ISCADD   readIs, readIs, 4x<32*4>, 4;
+
+--:-:-:-:1      LOP.AND  tid_1,  tid128, 1;
+--:-:-:-:1      LOP.AND  readFs, tid128, 8;
+--:-:-:-:1      SHR.U32  readFs, readFs, 2;
+--:-:-:-:1      IADD3    readFs, readFs, tid16, tid_1;
+--:-:-:-:0      ISCADD   readFs, readFs, 4x<32*4 + 32*36*2>, 4;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>];
+--:-:1:-:2      LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>];
+
+COMPUTE_LOOP:
+[+
+    # Generates the body of the compute loop and returns it as assembly text.
+    # For each operand set j (0, 1) it emits 64 FFMAs, one per accumulator
+    # ccx<x>y<y> (x, y in 0..7):
+    #     ccx<x>y<y> += jc<j>Ix<x> * jc<j>Fy<y>
+    # Entries of %insert, keyed "j<j>c<c>", are emitted right after the c-th
+    # FFMA of set j.
+    my %insert = (
+
+        # Loop predicate and counter update: P0 = (C > 2), then C -= 2.
+        j0c33 => "--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;\n" .
+                 "--:-:-:-:1      IADD C, C, -2;\n",
+
+        # Barrier, then move both read pointers by swapBuf and negate it.  The
+        # wait mask 02 matches read barrier 2 set by the j0c31 load below.
+        j0c62 => "02:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readFs, readFs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD readIs, readIs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD swapBuf, RZ,   -swapBuf;\n",
+
+        # Loop back while P0 holds, otherwise leave through COMPUTE_FINISH.
+        j1c63 => "--:-:-:Y:5  \@P0 BRA.U COMPUTE_LOOP;\n" .
+                 "--:-:-:Y:5      BRA.U COMPUTE_FINISH;\n",
+    );
+    # Accumulator visit order: x advances in pairs (0, 2, 4, 6); within each
+    # pair the y bases are walked in alternating direction and every base is
+    # expanded by the four @swirl offsets.  Entries 0..31 cover x in 0..3 and
+    # entries 32..63 cover x in 4..7.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        my $odd    = $j;
+        my $nOdd   = 1 - $j;
+        # Loads issued during set 1 fetch data for the next iteration, so they
+        # are predicated on P0; loads issued during set 0 are unconditional.
+        my $rsPred = $j == 1 ? '@P0' : '   ';
+        # Only the set-0 copy of the c31 load sets read barrier 2.
+        my $bar    = $j == 0 ? '2' : '-';
+
+        # Prefetch of the other operand set (nOdd) while this one is consumed.
+        $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd;
+
+        # The Ix0 load comes last, after FFMA 31: jc1Ix<0-3> and jc0Ix<0-3> are
+        # mapped to the same registers, and FFMAs 0..31 are the ones that read
+        # Ix<0-3>.  It signals barrier 1, which the next set's first FFMA
+        # waits on.
+        $insert{"j${j}c31"} = sprintf "--:%s:1:-:1  %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd;
+
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            # Stall count 0 when the first line of the inserted text is one of
+            # the listed opcodes, otherwise 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            # Yield flag on every tenth FFMA, but only when its stall count is 1.
+            my $yield  = $c % 10 == 0 && $stall ? 'Y' : '-';
+
+            # The first FFMA of each set waits on barrier 1 (mask 01).
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+LOAD_LOOP:
+--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;
+20:-:-:-:1      IADD track0.CC, track0, 1x<32*36*2 * $dsize>;
+--:-:-:-:1      ISETP.GT.AND P1, PT, C, 4, PT;
+--:-:-:-:1      IADD C, C, -2;
+[+
+    our ($vsize, $dsize, $convert_in);
+    my %insert = (
+
+        j0c3 => "--:-:-:-:1      IADD.X track1, track1, RZ;\n",
+
+        j0c0  => "--:-:-:-:1      LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n",
+        j0c2  => "--:-:-:-:1      LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n",
+        j0c18 => "--:-:1:-:1      LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n",
+
+        j1c12 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n",
+        j1c14 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n",
+        j1c16 => "--:-:1:-:1  \@P0 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n",
+
+        $convert_in ? (
+
+            j0c1  => "02:-:-:-:1      F2F.F32.F16 T03, T01.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T02, T01.H0;\n",
+            j0c4  => "--:-:-:-:1      F2F.F32.F16 T01, T00.H1;\n" .
+                     "--:-:2:-:1      F2F.F32.F16 T00, T00.H0;\n",
+
+            j0c5  => "--:-:-:-:1      F2F.F32.F16 T13, T11.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T12, T11.H0;\n",
+            j0c6  => "--:-:-:-:1      F2F.F32.F16 T11, T10.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T10, T10.H0;\n",
+
+            j0c7  => "--:-:-:-:1      F2F.F32.F16 T23, T21.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T22, T21.H0;\n",
+            j0c8  => "--:-:-:-:1      F2F.F32.F16 T21, T20.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T20, T20.H0;\n",
+
+            j0c9  => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n",
+            j0c10 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
+            j0c11 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",
+
+            j0c13 => "02:-:-:-:1  \@P1 LDG.E.$vsize T0, [track + 4x<0*32 * $dsize>];\n",
+            j0c14 => "10:-:-:-:1  \@P1 LDG.E.$vsize T1, [track + 4x<1*32 * $dsize>];\n",
+            j0c15 => "20:-:2:-:1  \@P1 LDG.E.$vsize T2, [track + 4x<2*32 * $dsize>];\n",
+
+            j0c16 => "04:-:-:-:1      F2F.F32.F16 T33, T31.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T32, T31.H0;\n",
+            j0c17 => "--:-:-:-:1      F2F.F32.F16 T31, T30.H1;\n" .
+                     "--:-:3:-:1      F2F.F32.F16 T30, T30.H0;\n",
+
+            j0c19 => "--:-:-:-:1      F2F.F32.F16 T43, T41.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T42, T41.H0;\n",
+            j0c20 => "--:-:-:-:1      F2F.F32.F16 T41, T40.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T40, T40.H0;\n",
+
+            j0c21 => "--:-:-:-:1      F2F.F32.F16 T53, T51.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T52, T51.H0;\n",
+            j0c22 => "--:-:-:-:1      F2F.F32.F16 T51, T50.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T50, T50.H0;\n",
+
+            j0c23 => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
+            j0c24 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
+            j0c25 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",
+
+            j0c27 => "04:-:-:-:1  \@P1 LDG.E.$vsize T3, [track + 4x<3*32 * $dsize>];\n",
+            j0c28 => "10:-:-:-:1  \@P1 LDG.E.$vsize T4, [track + 4x<4*32 * $dsize>];\n",
+            j0c29 => "20:-:3:-:1  \@P1 LDG.E.$vsize T5, [track + 4x<5*32 * $dsize>];\n",
+
+            j0c30 => "08:-:-:-:1      F2F.F32.F16 T63, T61.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T62, T61.H0;\n",
+            j0c31 => "--:-:-:-:1      F2F.F32.F16 T61, T60.H1;\n" .
+                     "--:-:4:-:1      F2F.F32.F16 T60, T60.H0;\n",
+
+            j1c0  => "--:-:-:-:1      F2F.F32.F16 T73, T71.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T72, T71.H0;\n",
+            j1c1  => "--:-:-:-:1      F2F.F32.F16 T71, T70.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T70, T70.H0;\n",
+
+            j1c2  => "--:-:-:-:1      F2F.F32.F16 T83, T81.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T82, T81.H0;\n",
+            j1c3  => "--:-:-:-:1      F2F.F32.F16 T81, T80.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T80, T80.H0;\n",
+
+            j1c4  => "08:4:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
+            j1c5  => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
+            j1c6  => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",
+
+            j1c8  => "08:-:-:-:1  \@P1 LDG.E.$vsize T6, [track + 4x<6*32 * $dsize>];\n",
+            j1c9  => "10:-:-:-:1  \@P1 LDG.E.$vsize T7, [track + 4x<7*32 * $dsize>];\n",
+            j1c10 => "20:6:4:-:1  \@P1 LDG.E.$vsize T8, [track + 4x<8*32 * $dsize>];\n",
+
+        ) : (
+
+            j0c6  => "02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;\n",
+            j0c8  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
+            j0c10 => "--:2:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",
+
+            j0c12 => "02:-:-:-:1  \@P1 LDG.E.$vsize T0, [track + 4x<0*32 * $dsize>];\n",
+            j0c14 => "--:-:-:-:1  \@P1 LDG.E.$vsize T1, [track + 4x<1*32 * $dsize>];\n",
+            j0c16 => "--:-:2:-:1  \@P1 LDG.E.$vsize T2, [track + 4x<2*32 * $dsize>];\n",
+
+            j0c20 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
+            j0c22 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
+            j0c24 => "--:3:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",
+
+            j0c26 => "04:-:-:-:1  \@P1 LDG.E.$vsize T3, [track + 4x<3*32 * $dsize>];\n",
+            j0c28 => "--:-:-:-:1  \@P1 LDG.E.$vsize T4, [track + 4x<4*32 * $dsize>];\n",
+            j0c30 => "--:-:3:-:1  \@P1 LDG.E.$vsize T5, [track + 4x<5*32 * $dsize>];\n",
+
+            j1c0  => "08:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
+            j1c2  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
+            j1c4  => "--:4:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",
+
+            j1c6  => "08:-:-:-:1  \@P1 LDG.E.$vsize T6, [track + 4x<6*32 * $dsize>];\n",
+            j1c8  => "--:-:-:-:1  \@P1 LDG.E.$vsize T7, [track + 4x<7*32 * $dsize>];\n",
+            j1c10 => "--:6:4:-:1  \@P1 LDG.E.$vsize T8, [track + 4x<8*32 * $dsize>];\n",
+        ),
+
+        j1c11 => "--:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeS, writeS,  swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,    -swapBuf;\n",
+
+        j1c31 => "--:-:-:Y:5  \@P0 BRA.U LOAD_LOOP;\n",
+    );
+
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
+    {
+        my ($x, $y) = @$xy;
+        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $wait   = $c == 0 ? "01" : '--';
+
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:-:$stall";
+
+            $out .= sprintf "%s      FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+[-
+    # Multipliers spliced into the FMUL32I instructions that the generator
+    # blocks of the file included below emit.
+    our ($trans1, $trans2, $trans3) = ("0.343", "0.700", "0.490");
+-]
+
+<INCLUDE file="xconv_winograd_4x4_3x3_32x32_common.sass"/>
diff --git a/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_common.sass b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_common.sass
new file mode 100644
index 0000000..f2a06e6
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/xconv_winograd_4x4_3x3_32x32_common.sass
@@ -0,0 +1,807 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--:-:1:-:2      S2R Tid, SR_TID.X;
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha16, param_alpha;
+// Tid32_2 = (tid & -32) >> 2
+01:-:-:-:1      LOP.AND  Tid32_2,  Tid,    -32;
+--:-:-:-:1      SHR.U32  Tid32_2,  Tid32_2, 2;
+
+// readFs = ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND Tid1,   Tid,    1;
+01:-:-:-:1      LOP.AND readFs, Tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      IADD    readFs, readFs, Tid1;
+
+// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readFs << 2)
+--:-:-:-:1      BFE.U32 readIs, Tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, Tid32_2;
+--:-:-:-:1      ISCADD  readIs, readFs, readIs, 2;
+// Scale the two indices before combining them: readIs <<= 4, readFs <<= 3
+--:-:-:-:1      SHL readIs, readIs, 4;
+--:-:-:-:1      SHL readFs, readFs, 3;
+
+// write16Cs = readFs * 32*36 + readIs (both already scaled by the shifts above)
+--:-:-:-:1      XMAD write16Cs, readFs, 1x<32*36>, readIs;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y0, alpha16; // round 1: clx0..3 of y0 and y2, each scaled by alpha16
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y2, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y2, alpha16;
+--:-:-:-:4      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Round 2: y1 and y3. From here on each round has a BAR.SYNC before its stores as well as after.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y3, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y3, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Round 3: y4 and y6.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y6, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y6, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Round 4: y5 and y7.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y7, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y7, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      EXIT;
+
+COMPUTE_FINISH:
+
+--:-:1:-:2      S2R tid_128, SR_TID.X;
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+// Rebase the thread id: tid_128 = tid - 128 ("tid" in the formulas below is this value)
+01:-:-:-:1      IADD tid_128, tid_128, -128;
+// P4 = (tid_128 >= 256); it predicates the branches to the SKIP labels further down
+--:-:-:-:1      ISETP.GE.AND P4, PT, tid_128, 256, PT;
+
+// readFs2 = ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  Tid_1,   tid_128, 1;
+--:-:-:-:1      LOP.AND  readFs2, tid_128, 8;
+--:-:-:-:1      SHR.U32  readFs2, readFs2, 2;
+--:-:-:-:1      IADD     readFs2, readFs2, Tid_1;
+
+// readIs2 = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readFs2 << 2)
+--:-:-:-:1      LOP.AND  tid_16,   tid_128, -16;
+--:-:-:-:1      SHR.U32  tid_16,   tid_16,   1;
+--:-:-:-:1      BFE.U32  readIs2,  tid_128,  0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs2,  readIs2,  tid_16;
+--:-:-:-:1      ISCADD   readIs2,  readFs2, readIs2, 2;
+// readIs2 = (readIs2 << 4) + the immediate offset operand; readFs2 <<= 3
+--:-:-:-:1      ISCADD   readIs2, readIs2, 4x<32*4>, 4;
+--:-:-:-:1      SHL      readFs2, readFs2, 3;
+
+// writeCs = readFs2 * 32*36 + readIs2 (both already scaled above)
+--:-:-:-:0      XMAD writeCs, readFs2, 1x<32*36>, readIs2;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P4 BRA.U SKIP0;
+// Only threads with P4 clear run this section: fetch block indices, build the output offset and bounds predicates.
+--:-:2:-:1      LDS idxX, [addr_idx_X];
+--:-:3:-:1      LDS idxY, [addr_idx_Y];
+--:-:1:-:1      S2R idxN,  SR_CTAID.Z;
+--:-:4:-:1      LDS idxK, [addr_idx_K];
+<SCHEDULE_BLOCK>
+// tid_31 = tid_128 & 31, tid_32 = tid_128 >> 5, tid_64 = tid_128 >> 6
+--:-:-:-:1      LOP.AND tid_31, tid_128, 31;
+--:-:-:-:1      SHR.U32 tid_32, tid_128,  5;
+--:-:-:-:1      SHR.U32 tid_64, tid_128,  6;
+// When the bsum option is on: bsum_offset = idxY*gridQN + idxX*gridN + idxN
+[+
+    our $bsum; return $bsum ? q{
+03:-:-:-:1      XMAD      bsum_offset, idxX, param_gridN,   idxN;
+04:-:-:-:1      XMAD.LO2C bsum_offset, idxY, param_gridQN,  bsum_offset;
+    } : '';
++]
+
+--:-:-:-:1      MOV32I one, 1.0;
+
+// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16
+--:-:-:-:1      XMAD   readCs, tid_32, 1x<32*36>, tid_31;
+--:-:-:-:1      ISCADD readCs, tid_64, readCs, 4;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = idxN*32 + tid & maskN
+--:-:-:-:1      LOP.AND n, tid_31, param_maskN;
+01:-:-:-:1      ISCADD  n, idxN, n, 5;
+
+// Superblock offset
+// idxX <<= shiftX
+// idxY <<= shiftY
+02:-:-:-:1      SHL idxX, idxX, param_shiftX;
+04:-:-:-:1      SHL idxY, idxY, param_shiftY;
+
+// Get this threads offset within the superblock
+--:-:-:-:1      BFE.U32 q, tid_31, param_superX;
+--:-:-:-:1      BFE.U32 p, tid_31, param_superY;
+--:-:-:-:1      ISCADD q, q, idxX, 2;
+--:-:-:-:1      ISCADD p, p, idxY, 2;
+
+// k = idxK*32 + tid_32<<1
+--:-:-:-:1      SHL tid_32, tid_32,   1;
+08:-:-:-:1      ISCADD k, idxK, tid_32, 5;
+
+// Out = k*PQN + p*QN + q*N + n
+--:-:-:-:1      XMAD      offsetO, q, param_N,    n;
+--:-:-:-:1      XMAD.LO2C offsetO, p, param_QN,   offsetO;
+--:-:-:-:1      XMAD.LO2C offsetO, k, param_PQN,  offsetO;
+
+--:-:-:-:1      IADD z1, q, 1;
+--:-:-:-:1      IADD z2, q, 2;
+--:-:-:-:1      IADD z3, q, 3;
+
+--:-:-:-:1      ISETP.EQ.AND P5, PT, RZ, param_flags, PT; // ! no-op
+--:-:-:-:1      ISETP.LT.AND P6, PT, n, param_N, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, q,  param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_Q, P5;
+--:-:-:-:1      ISETP.GE.AND P0, PT, q,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;
+--:-:-:-:1      P2R mask_q, PR, RZ, 0x0f;
+// mask_q now carries the four q-range predicates; P0..P3 are reused for p, p+1, p+2, p+3 (ANDed with n < N)
+--:-:-:-:1      IADD z1, p, 1;
+--:-:-:-:1      IADD z2, p, 2;
+--:-:-:-:1      IADD z3, p, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, p,  param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_P, P6;
+--:-:-:-:1      ISETP.GE.AND P0, PT, p,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;
+// preds: mask_q is placed at bit 0 if P0, and inserted (4 bits wide) at bits 4, 8 and 12 if P1, P2, P3
+--:-:-:-:1      SEL preds, mask_q, RZ, P0;
+--:-:-:-:1  @P1 BFI preds, mask_q, 0x404, preds;
+--:-:-:-:1  @P2 BFI preds, mask_q, 0x408, preds;
+--:-:-:-:1  @P3 BFI preds, mask_q, 0x40c, preds;
+// P6 is redefined here as (tid_31 == 0)
+--:-:-:-:1      ISETP.EQ.AND P6, PT, tid_31, RZ, PT;
+</SCHEDULE_BLOCK>
+
+SKIP0:
+// Round 1: scale ccx0..7 of y0 and y2 by alpha, store them at writeCs with four STS.128, then BAR.SYNC.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y0, alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, ccx7y0, alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y2, alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, ccx3y2, alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y2, alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, ccx7y2, alpha;
+
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP1;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 1; // advance k by 1 and offsetO by param_PQN before the next call
+--:-:-:-:1      IADD offsetO, offsetO, param_PQN;
+
+SKIP1:
+// Round 2: ccx0..7 of y1 and y3. The scaling FMULs are interleaved by hand with the barrier and the stores.
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y1, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y1, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y1, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP2;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 15; // this step is 15, with offsetO advanced by param_PQN15
+--:-:-:-:1      IADD offsetO, offsetO, param_PQN15;
+
+SKIP2:
+// Round 3: ccx0..7 of y4 and y6.
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y4, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y4, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y4, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y6, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP3;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 1;
+--:-:-:-:1      IADD offsetO, offsetO, param_PQN;
+
+SKIP3:
+// Round 4: ccx0..7 of y5 and y7.
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y5, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y5, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y5, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP4;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP4:
+
+--:-:-:-:5      EXIT;
+
+OUTPUT_TRANSFORM:
+// Subroutine entered with CAL after each store round; it ends at the RET further down. P5 = (k < param_K).
+<SCHEDULE_BLOCK>
+11:-:-:-:1      ISETP.LT.AND P5, PT, k, param_K, PT;
+[+
+    our $bias;  # when set, emit a P5-guarded load of one value at param_S + (k << 2) into bias (zeroed when P5 is clear)
+    return $bias ? q{
+--:-:-:-:1      LEA      Sum0.CC, k, param_S[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    k, param_S[1], RZ, 2;
+
+--:-:-:-:1 @!P5 MOV bias, RZ;
+--:-:5:-:1  @P5 LDG.E.CI bias, [Sum];
+    } : '';
++]
+</SCHEDULE_BLOCK>
+
+[+
+    # Emit 18 LDS instructions filling registers m(j)(i) for i in 0..2 and
+    # j in 0..5, i outermost.  Loads with second index i carry i+1 in the
+    # third control-code field.
+    return join '', map {
+        my $col = $_;
+        my $bar = $col + 1;
+        map {
+            my $row = $_;
+            "--:-:$bar:-:1      LDS m$row$col, [readCs + 4x<($row*6+$col)*32>];\n"
+        } 0 .. 5;
+    } 0 .. 2;
++]
+
+<SCHEDULE_BLOCK>
+[+
+    my $out; our ($trans1, $trans2, $trans3);  # $trans1..3 become the FMUL32I immediates below
+    foreach my $i (0 .. 2)  # one ORDERED group per second index i: m0i..m5i in, w0i..w3i out
+    {
+        my $w = sprintf "%02x", 1 << $i;  # wait mask on the group's first FADD: bit i, i.e. barrier i+1 as set by the LDS of m?i
+        $out .= qq{
+<ORDERED>
+$w:-:-:-:1      FADD t0$i, m1$i,  m2$i;
+--:-:-:-:1      FADD t1$i, m1$i, -m2$i;
+--:-:-:-:1      FADD t2$i, m3$i, -m4$i;
+--:-:-:-:1      FADD t3$i, m3$i,  m4$i;
+--:-:-:-:1      FADD w0$i, t0$i,  m0$i;
+--:-:-:-:1      FMUL32I w3$i, t1$i, $trans1;
+--:-:-:-:1      FMUL32I w1$i, t1$i, $trans2;
+--:-:-:-:1      FMUL32I temp, t0$i, $trans3;
+--:-:-:-:1      FFMA w3$i, t2$i,  3.375, w3$i;
+--:-:-:-:1      FFMA w1$i, t2$i,  1.500, w1$i;
+--:-:-:-:1      FFMA w2$i, t3$i,  2.250, temp;
+--:-:-:-:1      FADD w0$i, w0$i,  t3$i;
+--:-:-:-:1      FADD w3$i, w3$i,  m5$i;
+</ORDERED>
+        };
+    }
+    foreach my $i (3 .. 5)  # then append the loads of m?3..m?5 to the same schedule block
+    {
+        foreach my $j (0 .. 5)
+        {
+            my $b = $i + 1;  # value written to the third control-code field of these loads (4..6)
+            $out .= "--:-:$b:-:1      LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n";
+        }
+    }
+    return $out;
++]
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    my $out; our ($trans1, $trans2, $trans3);  # $trans1..3 become the FMUL32I immediates below
+
+    foreach my $i (3 .. 5)  # same per-i group as for i = 0..2, now for m?3..m?5
+    {
+        my $w = sprintf "%02x", 1 << $i;  # wait mask on the group's first FADD: 08, 10, 20
+        $out .= qq{
+<ORDERED>
+$w:-:-:-:1      FADD t0$i, m1$i,  m2$i;
+--:-:-:-:1      FADD t1$i, m1$i, -m2$i;
+--:-:-:-:1      FADD t2$i, m3$i, -m4$i;
+--:-:-:-:1      FADD t3$i, m3$i,  m4$i;
+--:-:-:-:1      FADD w0$i, t0$i,  m0$i;
+--:-:-:-:1      FMUL32I w3$i, t1$i, $trans1;
+--:-:-:-:1      FMUL32I w1$i, t1$i, $trans2;
+--:-:-:-:1      FMUL32I temp, t0$i, $trans3;
+--:-:-:-:1      FFMA w3$i, t2$i,  3.375, w3$i;
+--:-:-:-:1      FFMA w1$i, t2$i,  1.500, w1$i;
+--:-:-:-:1      FFMA w2$i, t3$i,  2.250, temp;
+--:-:-:-:1      FADD w0$i, w0$i,  t3$i;
+--:-:-:-:1      FADD w3$i, w3$i,  m5$i;
+</ORDERED>
+        };
+    }
+    return $out;
++]
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f; // load P0..P3 from the low four bits of preds, or clear them when P5 is false
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds; // funnel shift with both halves = preds (presumably a rotate right by 4 - confirm)
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    my $out;
+    our ($convert_out, $bias, $relu, $trans1, $trans2, $trans3);  # NOTE(review): $convert_out is declared here but not referenced in this block
+    foreach my $i (0 .. 3)  # first index: w(i)0..w(i)5 in, s(i)0..s(i)3 out, same arithmetic as the w stage
+    {
+        $out .= qq{
+--:-:-:-:1      FADD r${i}0, w${i}1,  w${i}2;
+--:-:-:-:1      FADD r${i}1, w${i}1, -w${i}2;
+--:-:-:-:1      FADD r${i}2, w${i}3, -w${i}4;
+--:-:-:-:1      FADD r${i}3, w${i}3,  w${i}4;
+--:-:-:-:1      FADD s${i}0, r${i}0,  w${i}0;
+--:-:-:-:1      FMUL32I s${i}3, r${i}1, $trans1;
+--:-:-:-:1      FMUL32I s${i}1, r${i}1, $trans2;
+--:-:-:-:1      FMUL32I temp,   r${i}0, $trans3;
+--:-:-:-:1      FFMA s${i}3, r${i}2,  3.375, s${i}3;
+--:-:-:-:1      FFMA s${i}1, r${i}2,  1.500, s${i}1;
+--:-:-:-:1      FFMA s${i}2, r${i}3,  2.250, temp;
+--:-:-:-:1      FADD s${i}0, s${i}0,  r${i}3;
+--:-:-:-:1      FADD s${i}3, s${i}3,  w${i}5;
+        };
+        if ($bias)  # optional: add the bias register to the four s values of this i
+        {
+            $out .= qq{
+10:-:-:-:1      FADD s${i}0, s${i}0, bias;
+--:-:-:-:1      FADD s${i}1, s${i}1, bias;
+--:-:-:-:1      FADD s${i}2, s${i}2, bias;
+--:-:-:-:1      FADD s${i}3, s${i}3, bias;};
+        }
+        if ($relu)  # optional: FMNMX of each s value against RZ
+        {
+            $out .= qq{
+--:-:-:-:1      FMNMX s${i}0, s${i}0, RZ, !PT;
+--:-:-:-:1      FMNMX s${i}1, s${i}1, RZ, !PT;
+--:-:-:-:1      FMNMX s${i}2, s${i}2, RZ, !PT;
+--:-:-:-:1      FMNMX s${i}3, s${i}3, RZ, !PT;};
+        }
+    }
+    return $out;
++]
+</SCHEDULE_BLOCK>
+<SCHEDULE_BLOCK>
+[+
+    our $prelu; my $out = '';  # start from '' so a disabled option yields an empty string, not undef, like the ": ''" blocks in this file
+    if ($prelu)  # per the template comment: s = maximum(s, 0) + param_beta * minimum(0, s) for every s(i)(0..3)
+    {
+        foreach my $i (0 .. 3)  # b00..b03 and b10..b13 are reused as scratch on each pass
+        {
+            $out .= qq{
+// maximum(x, 0) + beta * minimum(0, x)
+--:-:-:-:1      FMNMX b00, s${i}0, RZ, !PT;
+--:-:-:-:1      FMNMX b01, s${i}1, RZ, !PT;
+--:-:-:-:1      FMNMX b02, s${i}2, RZ, !PT;
+--:-:-:-:1      FMNMX b03, s${i}3, RZ, !PT;
+
+--:-:-:-:1      FMNMX b10, s${i}0, RZ, PT;
+--:-:-:-:1      FMNMX b11, s${i}1, RZ, PT;
+--:-:-:-:1      FMNMX b12, s${i}2, RZ, PT;
+--:-:-:-:1      FMNMX b13, s${i}3, RZ, PT;
+
+--:-:-:-:1      FFMA s${i}0, b10, param_beta, b00;
+--:-:-:-:1      FFMA s${i}1, b11, param_beta, b01;
+--:-:-:-:1      FFMA s${i}2, b12, param_beta, b02;
+--:-:-:-:1      FFMA s${i}3, b13, param_beta, b03;
+            };
+        }
+    }
+    return $out;
++]
+</SCHEDULE_BLOCK>
+[+
+    our ($beta, $brelu, $bprelu, $dtype, $dsize, $dshift, $convert_out, $Q, $N);
+    my $out = '';  # returned unchanged ('' rather than undef) when none of the three options is set
+    if ($beta || $brelu || $bprelu)  # each of these options reads values from param_X into b00..b33
+    {
+        my $preds = $beta ? q{
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+        } : '';
+
+        $out .= qq{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LEA      Out0.CC, offsetO, param_X[0],     $dshift;
+--:-:-:-:1      LEA.HI.X Out1,    offsetO, param_X[1], RZ, $dshift;
+
+--:-:-:-:1  \@P0 LDG.E.CG.$dtype b00, [Out + ${dsize}x<0*$Q*$N + 0*$N>];
+--:-:-:-:1  \@P1 LDG.E.CG.$dtype b01, [Out + ${dsize}x<0*$Q*$N + 1*$N>];
+--:-:-:-:1  \@P2 LDG.E.CG.$dtype b02, [Out + ${dsize}x<0*$Q*$N + 2*$N>];
+--:-:1:-:1  \@P3 LDG.E.CG.$dtype b03, [Out + ${dsize}x<0*$Q*$N + 3*$N>];
+--:-:-:-:1 \@!P0 MOV b00, RZ;
+--:-:-:-:1 \@!P1 MOV b01, RZ;
+--:-:-:-:1 \@!P2 MOV b02, RZ;
+--:-:-:-:1 \@!P3 MOV b03, RZ;
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+
+--:-:-:-:1  \@P0 LDG.E.CG.$dtype b10, [Out + ${dsize}x<1*$Q*$N + 0*$N>];
+--:-:-:-:1  \@P1 LDG.E.CG.$dtype b11, [Out + ${dsize}x<1*$Q*$N + 1*$N>];
+--:-:-:-:1  \@P2 LDG.E.CG.$dtype b12, [Out + ${dsize}x<1*$Q*$N + 2*$N>];
+--:-:2:-:1  \@P3 LDG.E.CG.$dtype b13, [Out + ${dsize}x<1*$Q*$N + 3*$N>];
+--:-:-:-:1 \@!P0 MOV b10, RZ;
+--:-:-:-:1 \@!P1 MOV b11, RZ;
+--:-:-:-:1 \@!P2 MOV b12, RZ;
+--:-:-:-:1 \@!P3 MOV b13, RZ;
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+
+--:-:-:-:1  \@P0 LDG.E.CG.$dtype b20, [Out + ${dsize}x<2*$Q*$N + 0*$N>];
+--:-:-:-:1  \@P1 LDG.E.CG.$dtype b21, [Out + ${dsize}x<2*$Q*$N + 1*$N>];
+--:-:-:-:1  \@P2 LDG.E.CG.$dtype b22, [Out + ${dsize}x<2*$Q*$N + 2*$N>];
+--:-:3:-:1  \@P3 LDG.E.CG.$dtype b23, [Out + ${dsize}x<2*$Q*$N + 3*$N>];
+--:-:-:-:1 \@!P0 MOV b20, RZ;
+--:-:-:-:1 \@!P1 MOV b21, RZ;
+--:-:-:-:1 \@!P2 MOV b22, RZ;
+--:-:-:-:1 \@!P3 MOV b23, RZ;
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.L.U64 preds, preds, 12, preds;
+
+--:-:-:-:1  \@P0 LDG.E.CG.$dtype b30, [Out + ${dsize}x<3*$Q*$N + 0*$N>];
+--:-:-:-:1  \@P1 LDG.E.CG.$dtype b31, [Out + ${dsize}x<3*$Q*$N + 1*$N>];
+--:-:-:-:1  \@P2 LDG.E.CG.$dtype b32, [Out + ${dsize}x<3*$Q*$N + 2*$N>];
+--:-:4:-:1  \@P3 LDG.E.CG.$dtype b33, [Out + ${dsize}x<3*$Q*$N + 3*$N>];
+--:-:-:-:1 \@!P0 MOV b30, RZ;
+--:-:-:-:1 \@!P1 MOV b31, RZ;
+--:-:-:-:1 \@!P2 MOV b32, RZ;
+--:-:-:-:1 \@!P3 MOV b33, RZ;$preds
+</SCHEDULE_BLOCK>};
+
+        if ($convert_out)  # the loaded values are then widened in place with F2F.F32.F16, one wait mask per row
+        {
+            $out .= q{
+01:-:-:-:1      F2F.F32.F16 b00, b00;
+--:-:-:-:1      F2F.F32.F16 b01, b01;
+--:-:-:-:1      F2F.F32.F16 b02, b02;
+--:-:1:-:1      F2F.F32.F16 b03, b03;
+02:-:-:-:1      F2F.F32.F16 b10, b10;
+--:-:-:-:1      F2F.F32.F16 b11, b11;
+--:-:-:-:1      F2F.F32.F16 b12, b12;
+--:-:2:-:1      F2F.F32.F16 b13, b13;
+04:-:-:-:1      F2F.F32.F16 b20, b20;
+--:-:-:-:1      F2F.F32.F16 b21, b21;
+--:-:-:-:1      F2F.F32.F16 b22, b22;
+--:-:3:-:1      F2F.F32.F16 b23, b23;
+08:-:-:-:1      F2F.F32.F16 b30, b30;
+--:-:-:-:1      F2F.F32.F16 b31, b31;
+--:-:-:-:1      F2F.F32.F16 b32, b32;
+--:-:4:-:1      F2F.F32.F16 b33, b33;};
+        }
+    }
+    return $out;
++]
+
+<SCHEDULE_BLOCK>
+[+ # With $beta set, emit s = b * param_beta + s for all sixteen outputs; otherwise emit nothing.
+    our $beta; return '' unless $beta; return q{
+01:-:-:-:1      FFMA s00, b00, param_beta, s00;
+--:-:-:-:1      FFMA s01, b01, param_beta, s01;
+--:-:-:-:1      FFMA s02, b02, param_beta, s02;
+--:-:-:-:1      FFMA s03, b03, param_beta, s03;
+02:-:-:-:1      FFMA s10, b10, param_beta, s10;
+--:-:-:-:1      FFMA s11, b11, param_beta, s11;
+--:-:-:-:1      FFMA s12, b12, param_beta, s12;
+--:-:-:-:1      FFMA s13, b13, param_beta, s13;
+04:-:-:-:1      FFMA s20, b20, param_beta, s20;
+--:-:-:-:1      FFMA s21, b21, param_beta, s21;
+--:-:-:-:1      FFMA s22, b22, param_beta, s22;
+--:-:-:-:1      FFMA s23, b23, param_beta, s23;
+08:-:-:-:1      FFMA s30, b30, param_beta, s30;
+--:-:-:-:1      FFMA s31, b31, param_beta, s31;
+--:-:-:-:1      FFMA s32, b32, param_beta, s32;
+--:-:-:-:1      FFMA s33, b33, param_beta, s33;};
++]
+[+
+    our ($brelu, $bprelu); my $out = '';  # '' rather than undef when neither option is set, like the ": ''" blocks in this file
+    if ($brelu || $bprelu)  # $brelu takes precedence when both are set (see the ternary below)
+    {
+        foreach my $i (0 .. 3)  # one group per first index i, operating on s(i)(0..3) and b(i)(0..3)
+        {
+            my $w = sprintf "%02x", 1 << $i;  # wait mask on the group's first FSETP: 01, 02, 04, 08
+            $out .= $brelu ? qq{
+//delta *= (x > 0)
+$w:-:-:-:1      FSETP.GT.AND P0, PT, b${i}0, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b${i}1, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b${i}2, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b${i}3, RZ, PT;
+--:-:-:-:1 \@!P0 MOV s${i}0, RZ;
+--:-:-:-:1 \@!P1 MOV s${i}1, RZ;
+--:-:-:-:1 \@!P2 MOV s${i}2, RZ;
+--:-:-:-:1 \@!P3 MOV s${i}3, RZ;
+            } : qq{
+//delta *= ((x > 0) + slope * (x < 0))
+$w:-:-:-:1      FSETP.GT.AND P0, PT, b${i}0, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b${i}1, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b${i}2, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b${i}3, RZ, PT;
+--:-:-:-:1      SEL xx0, one, RZ, P0;
+--:-:-:-:1      SEL xx1, one, RZ, P1;
+--:-:-:-:1      SEL xx2, one, RZ, P2;
+--:-:-:-:1      SEL xx3, one, RZ, P3;
+--:-:-:-:1      FSETP.LT.AND P0, PT, b${i}0, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P1, PT, b${i}1, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, b${i}2, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P3, PT, b${i}3, RZ, PT;
+--:-:-:-:1      SEL b${i}0, one, RZ, P0;
+--:-:-:-:1      SEL b${i}1, one, RZ, P1;
+--:-:-:-:1      SEL b${i}2, one, RZ, P2;
+--:-:-:-:1      SEL b${i}3, one, RZ, P3;
+--:-:-:-:1      FFMA b${i}0, b${i}0, param_beta, xx0;
+--:-:-:-:1      FFMA b${i}1, b${i}1, param_beta, xx1;
+--:-:-:-:1      FFMA b${i}2, b${i}2, param_beta, xx2;
+--:-:-:-:1      FFMA b${i}3, b${i}3, param_beta, xx3;
+--:-:-:-:1      FMUL s${i}0, s${i}0, b${i}0;
+--:-:-:-:1      FMUL s${i}1, s${i}1, b${i}1;
+--:-:-:-:1      FMUL s${i}2, s${i}2, b${i}2;
+--:-:-:-:1      FMUL s${i}3, s${i}3, b${i}3;
+            };
+        }
+        $out .= q{
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:5 @!P5 R2P PR, RZ, 0x0f;
+--:-:-:-:5      SHF.R.U64 preds, preds, 4, preds;
+};
+    }
+    return $out;
++]
+</SCHEDULE_BLOCK>
+[+
+    our $bsum; my $out = '';  # '' rather than undef when $bsum is off, like the ": ''" blocks in this file
+    if ($bsum)  # build the Sum address from k and bsum_offset, then add up the predicated s values
+    {
+        $out = q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      XMAD.LO2C bias, k, param_gridPQN, bsum_offset;
+--:-:-:-:1      LEA      Sum0.CC, bias, param_S[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    bias, param_S[1], RZ, 2;
+--:-:-:-:1      MOV  sum0, RZ;
+--:-:-:-:1      MOV  sum1, RZ;
+--:-:-:-:1      MOV  sum2, RZ;
+--:-:-:-:1      MOV  sum3, RZ;};
+        foreach my $i (0 .. 3)  # row i is added under P0..P3, then the predicates for the next row are loaded
+        {
+            my ($dir, $amt) = $i == 2 ? ('L','12') : ('R','4');  # third pass shifts left by 12, the others right by 4
+            $out .= qq{
+--:-:-:-:1  \@P0 FADD sum0, sum0, s${i}0;
+--:-:-:-:1  \@P1 FADD sum1, sum1, s${i}1;
+--:-:-:-:1  \@P2 FADD sum2, sum2, s${i}2;
+--:-:-:-:1  \@P3 FADD sum3, sum3, s${i}3;
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.$dir.U64 preds, preds, $amt, preds;};
+        }
+        $out .= q{
+--:-:-:-:1      FADD sum0, sum0, sum1;
+--:-:-:-:1      FADD sum2, sum2, sum3;
+--:-:-:-:1      FADD sum0, sum0, sum2;
+</SCHEDULE_BLOCK>};
+    }
+    return $out;
++]
+[+ # With $convert_out set, emit an in-place F2F.F16.F32 for each of the sixteen s registers; otherwise nothing.
+    our $convert_out; return '' unless $convert_out; return q{
+--:-:-:-:1      F2F.F16.F32 s00, s00;
+--:-:-:-:1      F2F.F16.F32 s01, s01;
+--:-:-:-:1      F2F.F16.F32 s02, s02;
+--:-:1:-:1      F2F.F16.F32 s03, s03;
+--:-:-:-:1      F2F.F16.F32 s10, s10;
+--:-:-:-:1      F2F.F16.F32 s11, s11;
+--:-:-:-:1      F2F.F16.F32 s12, s12;
+--:-:2:-:1      F2F.F16.F32 s13, s13;
+--:-:-:-:1      F2F.F16.F32 s20, s20;
+--:-:-:-:1      F2F.F16.F32 s21, s21;
+--:-:-:-:1      F2F.F16.F32 s22, s22;
+--:-:3:-:1      F2F.F16.F32 s23, s23;
+--:-:-:-:1      F2F.F16.F32 s30, s30;
+--:-:-:-:1      F2F.F16.F32 s31, s31;
+--:-:-:-:1      F2F.F16.F32 s32, s32;
+--:-:4:-:1      F2F.F16.F32 s33, s33;} : '';
++]
+
+[+
+    our ($bsum, $dtype, $dsize, $dshift, $Q, $N);  # $bsum picks the hand-scheduled variant that interleaves the stores with a SHFL.BFLY/FADD reduction of sum0 and a final store to Sum; otherwise the stores go in an ordered schedule block
+    return $bsum ? qq{
+--:-:-:Y:6      LEA      Out0.CC, offsetO, param_O[0],     $dshift;
+--:-:-:-:0      LEA.HI.X Out1,    offsetO, param_O[1], RZ, $dshift;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  1, 0x1f;
+01:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 0*$N>], s00;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 1*$N>], s01;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 2*$N>], s02;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 3*$N>], s03;
+--:-:-:-:2  \@P5 R2P PR, preds, 0x0f;
+--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f;
+
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:-:-:0      SHF.R.U64 preds, preds, 4, preds;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 2, 0x1f;
+
+02:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 0*$N>], s10;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 1*$N>], s11;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 2*$N>], s12;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 3*$N>], s13;
+--:-:-:-:2  \@P5 R2P PR, preds, 0x0f;
+--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f;
+
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:-:-:0      SHF.R.U64 preds, preds, 4, preds;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 4, 0x1f;
+
+04:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 0*$N>], s20;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 1*$N>], s21;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 2*$N>], s22;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 3*$N>], s23;
+--:-:-:-:2  \@P5 R2P PR, preds, 0x0f;
+--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f;
+
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:-:-:0      SHF.L.U64 preds, preds, 12, preds;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 8, 0x1f;
+
+08:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 0*$N>], s30;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 1*$N>], s31;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 2*$N>], s32;
+--:1:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 3*$N>], s33;
+
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:-:-:0      PSETP.AND.AND P5, PT, P5, P6, PT; // k < K && tid31 == 0
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 16, 0x1f;
+10:-:-:-:2      FADD sum0, sum1, sum0;
+--:5:-:-:1  \@P5 STG.E.CG [Sum], sum0;
+    } : qq{
+<SCHEDULE_BLOCK>
+<ORDERED>
+--:-:-:-:1      LEA      Out0.CC, offsetO, param_O[0],     $dshift;
+--:-:-:-:1      LEA.HI.X Out1,    offsetO, param_O[1], RZ, $dshift;
+
+01:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 0*$N>], s00;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 1*$N>], s01;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 2*$N>], s02;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 3*$N>], s03;
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+
+02:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 0*$N>], s10;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 1*$N>], s11;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 2*$N>], s12;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 3*$N>], s13;
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+
+04:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 0*$N>], s20;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 1*$N>], s21;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 2*$N>], s22;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 3*$N>], s23;
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.L.U64 preds, preds, 12, preds;
+
+08:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 0*$N>], s30;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 1*$N>], s31;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 2*$N>], s32;
+--:1:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 3*$N>], s33;
+</ORDERED>
+</SCHEDULE_BLOCK>
+    };
++]
+
+--:-:-:-:5      RET; // end of OUTPUT_TRANSFORM: return to the instruction after the CAL
diff --git a/Kernel/Convolution/Maxwell/xconv_xprop_common.sass b/Kernel/Convolution/Maxwell/xconv_xprop_common.sass
new file mode 100644
index 0000000..110dc4d
--- /dev/null
+++ b/Kernel/Convolution/Maxwell/xconv_xprop_common.sass
@@ -0,0 +1,841 @@
+
+
+[-
+    # Option flags: the subs in this block only test them for truth. None is assigned here.
+    our ($beta, $bias, $relu, $prelu, $brelu, $bprelu, $bsum);
+
+    # Set externally: $prefix is read immediately below, the others when the subs run.
+    our ($prefix, $prop, $shareI, $shareF, $stepI, $stepF, $remapI, $remapF);
+
+    # $addr_shift is 2 for prefix 's' and 1 otherwise; $half is true only for prefix 'h'.
+    our ($addr_shift, $half) = ($prefix eq 's' ? 2 : 1, $prefix eq 'h');
+
+    sub params  # Returns the kernel-parameter map: 65 "name : c[0x0][offset]" lines, offsets 0x140..0x240 in steps of 4.
+    {
+        return <<'EOF';  # single-quoted heredoc: returned verbatim, nothing is interpolated
+    param_Sum[0]       : c[0x0][0x140]
+    param_Sum[1]       : c[0x0][0x144]
+    param_X[0]         : c[0x0][0x148]
+    param_X[1]         : c[0x0][0x14c]
+    param_O[0]         : c[0x0][0x150]
+    param_O[1]         : c[0x0][0x154]
+    param_I[0]         : c[0x0][0x158]
+    param_I[1]         : c[0x0][0x15c]
+    param_F[0]         : c[0x0][0x160]
+    param_F[1]         : c[0x0][0x164]
+    param_alpha        : c[0x0][0x168]
+    param_beta         : c[0x0][0x16c]
+    param_flags        : c[0x0][0x170]
+    param_N            : c[0x0][0x174]
+    param_K            : c[0x0][0x178]
+    param_D            : c[0x0][0x17c]
+    param_H            : c[0x0][0x180]
+    param_W            : c[0x0][0x184]
+    param_WN           : c[0x0][0x188]
+    param_HWN          : c[0x0][0x18c]
+    param_DHWN         : c[0x0][0x190]
+    param_C            : c[0x0][0x194]
+    param_KRST         : c[0x0][0x198]
+    param_RST          : c[0x0][0x19c]
+    param_RS           : c[0x0][0x1a0]
+    param_T            : c[0x0][0x1a4]
+    param_R            : c[0x0][0x1a8]
+    param_S            : c[0x0][0x1ac]
+    param_magic_RS     : c[0x0][0x1b0]
+    param_shift_RS     : c[0x0][0x1b4]
+    param_magic_S      : c[0x0][0x1b8]
+    param_shift_S      : c[0x0][0x1bc]
+    param_pad_d        : c[0x0][0x1c0]
+    param_pad_h        : c[0x0][0x1c4]
+    param_pad_w        : c[0x0][0x1c8]
+    param_str_d        : c[0x0][0x1cc]
+    param_str_h        : c[0x0][0x1d0]
+    param_str_w        : c[0x0][0x1d4]
+    param_dil_d        : c[0x0][0x1d8]
+    param_dil_h        : c[0x0][0x1dc]
+    param_dil_w        : c[0x0][0x1e0]
+    param_P2           : c[0x0][0x1e4]
+    param_Q            : c[0x0][0x1e8]
+    param_PQk          : c[0x0][0x1ec]
+    param_Qk           : c[0x0][0x1f0]
+    param_k            : c[0x0][0x1f4]
+    param_magic_PQk    : c[0x0][0x1f8]
+    param_shift_PQk    : c[0x0][0x1fc]
+    param_magic_Qk     : c[0x0][0x200]
+    param_shift_Qk     : c[0x0][0x204]
+    param_magic_k      : c[0x0][0x208]
+    param_shift_k      : c[0x0][0x20c]
+    param_QN           : c[0x0][0x210]
+    param_PQN          : c[0x0][0x214]
+    param_MPQN         : c[0x0][0x218]
+    param_gridN        : c[0x0][0x21c]
+    param_gridQN       : c[0x0][0x220]
+    param_gridPQN      : c[0x0][0x224]
+    param_gridMPQN     : c[0x0][0x228]
+    param_magic_str_d  : c[0x0][0x22c]
+    param_shift_str_d  : c[0x0][0x230]
+    param_magic_str_h  : c[0x0][0x234]
+    param_shift_str_h  : c[0x0][0x238]
+    param_magic_str_w  : c[0x0][0x23c]
+    param_shift_str_w  : c[0x0][0x240]
+EOF
+    }
+
+    sub get_mpqk  # Returns the SASS that splits idx_MPQk into idx_M, idx_P, idx_Q and folds idx_k into idx_K.
+    {  # Takes no arguments and reads no Perl variables; the text is fixed.
+        return <<'EOF';  # single-quoted heredoc: returned verbatim
+// idx_M = idx_MPQk / blk_PQk
+--:-:-:-:1      MOV  magic_PQk, param_magic_PQk;
+--:-:-:-:1      ISETP.NE.AND P1, PT,   magic_PQk, 1, PT;
+02:-:-:-:1  @P1 XMAD     div1, idx_MPQk,    magic_PQk,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, idx_MPQk,    magic_PQk.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, idx_MPQk.H1, magic_PQk.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, idx_MPQk.H1, magic_PQk,    div1;
+--:-:-:-:1  @P1 IADD3.RS idx_M, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  idx_M, idx_M,    param_shift_PQk;
+--:-:-:-:1 @!P1 SHR.U32  idx_M, idx_MPQk, param_shift_PQk;
+
+// idx_PQk = idx_MPQk % blk_PQk
+--:-:-:-:1      IADD neg_PQk, RZ, -param_PQk;
+--:-:-:-:1      XMAD.LO2 idx_PQk, neg_PQk, idx_M, idx_MPQk;
+
+// idx_P2 = idx_PQk / blk_Qk
+--:-:-:-:1      MOV  magic_Qk, param_magic_Qk;
+--:-:-:-:1      ISETP.NE.AND P2, PT,  magic_Qk, 1, PT;
+--:-:-:-:1  @P2 XMAD     div1, idx_PQk,    magic_Qk,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, idx_PQk,    magic_Qk.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, idx_PQk.H1, magic_Qk.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, idx_PQk.H1, magic_Qk,    div1;
+--:-:-:-:1  @P2 IADD3.RS idx_P2, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  idx_P2, idx_P2,  param_shift_Qk;
+--:-:-:-:1 @!P2 SHR.U32  idx_P2, idx_PQk, param_shift_Qk;
+
+// idx_Qk = idx_PQk % blk_Qk
+--:-:-:-:1      IADD neg_Qk, RZ, -param_Qk;
+--:-:-:-:1      XMAD.LO2 idx_Qk, neg_Qk, idx_P2, idx_PQk;
+
+// idx_Q2  = idx_Qk / k
+--:-:-:-:1      XMAD.LO2C idx_Q2, idx_Qk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32   idx_Q2, idx_Q2, param_shift_k;
+// idx_k = idx_Qk % k
+--:-:-:-:1      IADD neg_k, RZ, -param_k;
+--:-:-:-:1      XMAD.S16.U16  idx_k, neg_k, idx_Q2, idx_Qk;
+
+// idx_K = idx_K * blk_k + idx_k
+04:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+// Implement a square wave block id remapping (for all but last row (if odd number of rows))
+// idx_P = idx_P2 * 2
+// idx_Q = idx_Q2
+// if idx_P2 != gridP2:
+//     idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1)
+//     idx_Q  = idx_Q2 >> 1
+--:-:-:-:1      ISETP.NE.AND P1, PT, idx_P2, param_P2, PT;
+--:-:-:-:1      SHL idx_P, idx_P2, 1;
+--:-:-:-:1  @P1 LOP.AND q1, idx_Q2, 1;
+--:-:-:-:1  @P1 BFE.U32 q2, idx_Q2, 0x101; // 1 bit at position 1
+--:-:-:-:1  @P1 LOP.XOR q1, q1, q2;
+--:-:-:-:1  @P1 IADD idx_P, idx_P, q1;
+--:-:-:-:1  @P1 SHR.U32 idx_Q, idx_Q2, 1;
+--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2;
+
+// Scan backwards on odd rows
+// if idx_P2 & 1:
+//     idx_Q = Q - idx_Q - 1
+--:-:-:-:1      LOP.AND.NZ P2, RZ, idx_P2, 1;
+--:-:-:-:1      MOV negOne, -1;
+--:-:-:-:1  @P2 IADD3 idx_Q, -idx_Q, param_Q, negOne;
+
+EOF
+    }
+
+    sub load_zeros  # Returns one STS.128 of RZ to [addr_zero] followed by 16 LDS.U.128 loads into czero00, czero04 .. czero60.
+    {
+        my @loads = map { sprintf "--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", 4 * $_ } 0 .. 15;
+        return join '', "--:-:-:-:1      STS.128 [addr_zero], RZ;\n", @loads;
+    }
+
+    sub begin_lut  # Returns the fixed SASS prologue of the lookup-table (LUT) build; takes no arguments.
+    {  # The text branches to END_SETUP on P0 and opens a <SCHEDULE_BLOCK> that it does not close itself.
+        return <<'EOF';  # single-quoted heredoc: returned verbatim
+--:-:-:-:5  @P0 BRA.U END_SETUP;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      STS.128 [addr_mpqk], mpqk;
+
+--:-:-:-:1      MOV rst,        tid;
+--:-:-:-:1      MOV lutStore2,  RZ;
+--:-:-:-:1      MOV lutSize,    RZ;
+--:-:-:-:1      MOV warp_count, 32;
+
+--:-:-:-:1      IADD    mask_shr, -tid, 32;
+--:-:-:-:1      SHR.U32 dep_thd_mask, negOne, mask_shr;
+EOF
+    }
+
+    sub end_lut  # Returns the SASS that finishes the LUT loop, defines END_SETUP and sets up the first slice fetch.
+    {  # The heredoc is a sprintf format: %1$s becomes $addr_shift, and a literal percent sign must be written %%.
+        return sprintf <<'EOF', $addr_shift;
+<ORDERED>
+// Get a mask of all valid slices in the warp
+--:-:-:-:1      VOTE.ANY ballot, PT, P1;
+// Count the total valid slices
+--:-:2:-:1      POPC warp_slices, ballot;
+// Prepare lutStore for this and next loop
+--:-:-:-:1  @P1 MOV    lutStore, lutStore2;
+02:-:-:-:1      ISCADD lutStore2, warp_slices, lutStore2, 3;
+// Count all the valid slices below this threadid
+--:-:-:-:1  @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
+--:-:3:-:1  @P1 POPC dep_thd_cnt, dep_thd_bits;
+// use the rst increment to space the barrier sync
+--:-:-:-:1      IADD rst, rst, 32;
+// Update the lutStore address from this count
+04:-:-:-:1  @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
+// Store both slice offsets in the lut
+--:1:-:-:1  @P1 STS.64 [lutStore + addr_lut], sliceIF;
+</ORDERED>
+// Keep track of the total size of the lut
+--:-:-:-:1      IADD lutSize, lutSize, warp_slices;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P0 BRA.U LUT_LOOP;
+
+// Share the lut size with the other warp
+--:1:-:-:2      STS [addr_szLut], lutSize;
+
+END_SETUP:
+
+01:-:-:-:5      BAR.SYNC 0;
+
+// Grab the caclulated lut size and get it's reciprical
+// Get the total reduction depth
+--:-:1:-:2      LDS lutSize, [addr_szLut];
+01:-:-:-:0      XMAD endCRST, lutSize, param_C, RZ;
+--:-:1:-:2      I2F.F32.S32 lutSizeRcp, lutSize;
+01:-:1:-:1      MUFU.RCP lutSizeRcp, lutSizeRcp;
+
+<SCHEDULE_BLOCK>
+// lutSize != 0
+--:-:-:-:1      LOP.AND.NZ P0, RZ, lutSize, -1;
+// posCRST = endCRST - tidY - 1
+--:-:-:-:1      IADD3 posCRST, endCRST, -1, -tidY;
+// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 8 then make a full 8 line fetch.
+--:-:-:-:1      LOP.AND.Z P1, partial, endCRST, 7;
+--:-:-:-:1  @P1 MOV partial, 8;
+// channel = posCRST / lutSize
+// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it
+--:-:2:-:1      I2F.F32.S32 posCRSTf, posCRST;
+03:-:-:-:1      FMUL channel, posCRSTf, lutSizeRcp;
+--:-:-:-:1      FFMA channel, channel, 5.9604644775390625e-08, channel;
+--:-:2:-:1      F2I.S32.F32.TRUNC channel, channel;
+// lutOffset = (posCRST %% lutSize) * 8
+02:-:-:-:1      VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;
+--:-:-:-:1      SHL lutOffset, lutOffset, 3;
+// P1 = tidY < partial &&
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY, partial, P0;
+// offsetIC = channel * DHWN
+// offsetFC = channel * K
+--:-:-:-:1      XMAD.LO2C offsetIc, channel, param_DHWN, RZ;
+--:-:-:-:1      XMAD      offsetFc, channel, param_KRST, RZ;
+// posCRST -= partial
+--:-:-:-:1      IADD posCRST, posCRST, -partial;
+--:-:1:-:2  @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];
+</SCHEDULE_BLOCK>
+
+// trackI = offsetIN + offsetIC + sliceI + param_I
+// trackF = offsetFK + offsetFC + sliceF + param_F
+01:-:-:-:1  @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;
+--:-:-:-:5  @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;
+--:-:-:-:6  @P1 LEA      trackF0.CC, offsetF, param_F[0],     %1$s;
+--:-:-:-:1  @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, %1$s;
+--:-:-:-:6  @P1 LEA      trackI0.CC, offsetI, param_I[0],     %1$s;
+--:-:-:-:0  @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, %1$s;
+EOF
+    }
+
+    sub fprop_lut  # Returns the forward-prop LUT SASS: begin_lut() text, then the template below, then end_lut() text.
+    {  # The template closes the <SCHEDULE_BLOCK> opened by begin_lut(), defines LUT_LOOP, and opens a new block that end_lut() closes.
+        return begin_lut() . <<'EOF' . end_lut();  # heredoc is verbatim (no interpolation)
+// mt = m * w - pad_d
+// pr = p * u - pad_h
+// qs = q * v - pad_w
+--:-:-:-:1      XMAD qs, q,   param_str_w, RZ;
+--:-:-:-:1      XMAD pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD mt, m,   param_str_d, RZ;
+--:-:-:-:1      IADD qs, qs, -param_pad_w;
+--:-:-:-:1      IADD pr, pr, -param_pad_h;
+--:-:-:-:1      IADD mt, mt, -param_pad_d;
+</SCHEDULE_BLOCK>
+
+LUT_LOOP:
+
+<SCHEDULE_BLOCK>
+// warp synchronous loop while warp_count < RST
+--:-:-:-:1      ISETP.LT.AND P0, PT, warp_count, param_RST, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, rst, param_RST, PT;
+
+--:-:-:-:1      IADD warp_count, warp_count, 32;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// x = qs + (s * dil_w)
+// y = pr + (r * dil_h)
+// z = mt + (t * dil_d)
+--:-:-:-:1      XMAD x, s, param_dil_w, qs;
+--:-:-:-:1      XMAD y, r, param_dil_h, pr;
+--:-:-:-:1      XMAD z, t, param_dil_d, mt;
+--:-:-:-:1      ISETP.GE.AND  P4, PT, x, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P5, PT, y, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P6, PT, z, RZ, P6;
+--:-:-:-:1      ISETP.LT.AND  P4, PT, x, param_W, P4;
+--:-:-:-:1      ISETP.LT.AND  P5, PT, y, param_H, P5;
+--:-:-:-:1      ISETP.LT.AND  P6, PT, z, param_D, P6;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P4, P5, P6;
+
+<ORDERED>
+// sliceI = z*HWN + y*WN + x*N
+01:-:-:-:1      XMAD      sliceI, x, param_N,   RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y, param_WN,  sliceI;
+--:-:-:-:1      XMAD.LO2C sliceI, z, param_HWN, sliceI;
+// sliceF = rst * K
+--:-:-:-:1      XMAD sliceF, rst, param_K, RZ;
+</ORDERED>
+EOF
+    }
+
+    sub bprop_lut  # Returns the backward-prop LUT SASS: begin_lut() text, then the template below, then end_lut() text.
+    {  # Unlike fprop_lut, the template divides x/y/z by the strides and keeps a slice only when each remainder is zero.
+        return begin_lut() . <<'EOF' . end_lut();  # heredoc is verbatim (no interpolation)
+--:-:-:-:1      MOV str_d, param_str_d;
+--:-:-:-:1      MOV str_h, param_str_h;
+--:-:-:-:1      MOV str_w, param_str_w;
+// qs = q - pad_w
+// pr = p - pad_h
+// mt = m - pad_d
+--:-:-:-:1      IADD qs, q, -param_pad_w;
+--:-:-:-:1      IADD pr, p, -param_pad_h;
+--:-:-:-:1      IADD mt, m, -param_pad_d;
+</SCHEDULE_BLOCK>
+
+LUT_LOOP:
+
+<SCHEDULE_BLOCK>
+// warp synchronous loop while warp_count < RST
+--:-:-:-:1      ISETP.LT.AND P0, PT, warp_count, param_RST, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, rst, param_RST, PT;
+--:-:-:-:1      IADD warp_count, warp_count, 32;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// x = qs + (s * dil_w)
+// y = pr + (r * dil_h)
+// z = mt + (t * dil_d)
+--:-:-:-:1      XMAD x, s, param_dil_w, qs;
+--:-:-:-:1      XMAD y, r, param_dil_h, pr;
+--:-:-:-:1      XMAD z, t, param_dil_d, mt;
+--:-:-:-:1      ISETP.GE.AND  P4, PT, x, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P5, PT, y, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P6, PT, z, RZ, P6;
+// x_prime = x / str_w
+// x       = x % str_w
+--:-:-:-:1      XMAD    x_prime, x, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x_prime, x_prime, param_shift_str_w;
+--:-:-:-:1      VMAD.U16.U16 x, -x_prime, str_w, x;
+// y_prime = y / str_h
+// y       = y % str_h
+--:-:-:-:1      XMAD    y_prime, y, param_magic_str_h, RZ;
+--:-:-:-:1      SHR.U32 y_prime, y_prime, param_shift_str_h;
+--:-:-:-:1      VMAD.U16.U16 y, -y_prime, str_h, y;
+// z_prime = z / str_d
+// z       = z % str_d
+--:-:-:-:1      XMAD    z_prime, z, param_magic_str_d, RZ;
+--:-:-:-:1      SHR.U32 z_prime, z_prime, param_shift_str_d;
+--:-:-:-:1      VMAD.U16.U16 z, -z_prime, str_d, z;
+
+--:-:-:-:1      ISETP.EQ.AND  P4, PT, x, RZ, P4;
+--:-:-:-:1      ISETP.EQ.AND  P5, PT, y, RZ, P5;
+--:-:-:-:1      ISETP.EQ.AND  P6, PT, z, RZ, P6;
+--:-:-:-:1      ISETP.LT.AND  P4, PT, x_prime, param_W, P4;
+--:-:-:-:1      ISETP.LT.AND  P5, PT, y_prime, param_H, P5;
+--:-:-:-:1      ISETP.LT.AND  P6, PT, z_prime, param_D, P6;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P4, P5, P6;
+
+// sliceI = z_prime*HWN + y_prime*WN + x_prime*N
+01:-:-:-:1      XMAD      sliceI, x_prime, param_N,   RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y_prime, param_WN,  sliceI;
+--:-:-:-:1      XMAD.LO2C sliceI, z_prime, param_HWN, sliceI;
+// sliceF = rst_prime * K
+01:-:-:-:1      XMAD sliceF, rst, param_K, RZ;
+EOF
+    }
+
+    sub load_lut  # Returns fprop_lut() when $prop is 'f', bprop_lut() for any other value.
+    {
+        return ($prop eq 'f' ? \&fprop_lut : \&bprop_lut)->();
+    }
+
+    sub loop_setup  # Returns the loop-entry SASS: buffer swap, first four shared loads, and the next slice fetch.
+    {  # sprintf arguments: %1 $shareI, %2 $shareF, %3 $stepI, %4 $stepF, %5 $addr_shift, %6 the swap text chosen below.
+        my $swap;
+        if ($shareI == $shareF)  # equal share sizes: flip writeS with a single XOR
+        {
+            $swap = <<'EOF';
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<szShareF + szShareI>;
+EOF
+        }
+        else  # otherwise step writeIs/writeFs by swapBuf and then negate swapBuf
+        {
+            $swap = <<'EOF';
+--:-:-:-:1      IADD writeIs, writeIs, swapBuf;
+--:-:-:-:1      IADD writeFs, writeFs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ,     -swapBuf;
+EOF
+        }
+        return sprintf <<'EOF', $shareI, $shareF, $stepI, $stepF, $addr_shift, $swap;  # literal percent signs below are written %%
+
+--:-:-:-:0      ISETP.GE.AND P1, PT, posCRST, RZ, PT;
+--:-:2:-:1      I2F.F32.S32 posCRSTf, posCRST;
+
+01:-:-:-:5      BAR.SYNC 0;
+%6$s
+
+--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*%1$-3s + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*%2$-3s + 00>];
+--:-:-:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*%1$-3s + %3$s>];
+--:-:1:-:2      LDS.U.128 j0Fy4, [readFs + 4x<0*%2$-3s + %4$s>];
+
+<SCHEDULE_BLOCK>
+// channel = posCRST / lutSize
+02:-:-:-:1  @P1 FMUL channel, posCRSTf, lutSizeRcp;
+--:-:-:-:1  @P1 FFMA channel, channel, 5.9604644775390625e-08, channel;
+--:-:2:-:1  @P1 F2I.S32.F32.TRUNC channel, channel;
+// lutOffset = (posCRST %% lutSize) * 8
+02:-:-:-:1  @P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;
+--:-:-:-:1  @P1 SHL lutOffset, lutOffset, 3;
+// offsetIC = channel * DHWN
+// offsetFC = channel * K
+--:-:-:-:1  @P1 XMAD.LO2C offsetIc, channel, param_DHWN, RZ;
+--:-:-:-:1  @P1 XMAD      offsetFc, channel, param_KRST, RZ;
+
+--:-:-:-:1      IADD posCRST, posCRST, -8;
+--:-:2:-:2  @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];
+</SCHEDULE_BLOCK>
+
+// trackI = offsetIN + offsetIC + sliceI + param_I
+// trackF = offsetFK + offsetFC + sliceF + param_F
+02:-:-:-:1  @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;
+--:-:-:-:5  @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;
+--:-:-:-:6  @P1 LEA      trackF0.CC, offsetF, param_F[0],     %5$s;
+--:-:-:-:1  @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, %5$s;
+--:-:-:-:6  @P1 LEA      trackI0.CC, offsetI, param_I[0],     %5$s;
+--:-:-:-:0  @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, %5$s;
+EOF
+    }
+
+    sub main_loop  # Returns 8 unrolled steps of 64 FFMAs each, with text from %insert appended after the matching FFMA.
+    {
+        our %insert;
+        # FFMA order over cx0..7 / cy0..7: four offsets per (column pair, row) with the row list reversed per column pair.
+        my @quad = ([0,2],[1,2],[1,0],[0,0]);
+        my @rows = (0,1,4,5);
+        my @order;
+        for my $col (0,2,4,6)
+        {
+            for my $row (@rows)
+            {
+                push @order, map { [$col + $_->[0], $row + $_->[1]] } @quad;
+            }
+            @rows = reverse @rows;
+        }
+        my $text;
+        my $lds = "--:-:%s:-:1  %s LDS.U.128 j%d%s, [read%ss + 4x<%d*%-3s + %s>];\n";
+        for my $j (0 .. 7)
+        {
+            my $cur  = $j & 1;                   # j0*/j1* operand set read by this step's FFMAs
+            my $next = $cur ? 0 : 1;             # operand set the inserted loads write
+            my $line = ($j + 1) % 8;             # multiplier of the share stride in the load address
+            my $pred = $j == 7 ? '@P0' : '   ';  # only the loads of step 7 carry a predicate
+            # Loads are attached to FFMA 0, 2, 4 and 6 of every step; the last one sets barrier 1.
+            my @slots = ([0,'-','Ix0','I',$shareI,'00'],   [2,'-','Fy0','F',$shareF,'00'],
+                         [4,'-','Ix4','I',$shareI,$stepI], [6,1,'Fy4','F',$shareF,$stepF]);
+            foreach my $s (@slots)
+            {
+                my ($c, $bar, $reg, $mat, $share, $step) = @$s;
+                $insert{"j${j}c$c"} = sprintf $lds, $bar, $pred, $next, $reg, $mat, $line, $share, $step;
+            }
+            for my $c (0 .. 63)
+            {
+                my ($x, $y) = @{ $order[$c] };
+                my $extra   = $insert{"j${j}c$c"} || '';
+                # The stall field is 0 when the appended text matches one of these opcodes, else 1.
+                my $stall   = $extra =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+                my $yield   = ($c == 32 && $stall) ? 'Y' : '-';
+                my $wait    = $c ? '--' : '01';
+                $text .= "$wait:-:-:$yield:$stall"
+                       . sprintf("      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n", $x,$y, $cur,$x, $cur,$y, $x,$y)
+                       . $extra;
+            }
+        }
+        return $text;
+    }
+
+
+    sub output_setup  # Returns the SASS emitted before the stores; the three arguments are only used in the $bsum section.
+    {
+        my ($tid_mask, $warp_sh, $bsum_sh) = @_;
+        my @parts;
+        # bsum_offset computation and the P5 test (tidOX AND $tid_mask == 0); emitted only when $bsum is true.
+        push @parts, qq{
+02:-:-:-:1      SHR.U32   bsum_offset, tidOX, $bsum_sh;
+04:-:-:-:1      ISCADD    bsum_offset, idx_N, bsum_offset,   $warp_sh;
+01:-:-:-:1      XMAD      bsum_offset, idx_Q, param_gridN,   bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum_offset, idx_P, param_gridQN,  bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum_offset, idx_M, param_gridPQN, bsum_offset;
+
+--:-:-:-:1      LOP.AND.Z P5, RZ, tidOX, $tid_mask;
+        } if $bsum;
+        # out_offset, alpha/one and the P2/P0 predicates are always emitted.
+        push @parts, qq{
+// out_offset = m*PQN + p*QN + q*N + n
+01:-:-:-:1      XMAD      out_offset, q, param_N,    n;
+--:-:-:-:1      XMAD.LO2C out_offset, p, param_QN,   out_offset;
+--:-:-:-:1      XMAD.LO2C out_offset, m, param_PQN,  out_offset;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV32I one, 1.0;
+
+--:-:-:-:1      ISETP.EQ.AND P2, PT, RZ, param_flags, PT; // no output
+--:-:-:-:1      ISETP.LT.AND P0, PT, n, param_N, P2;
+        };
+        # P1 predicate; when $half is false, n is first advanced by $stepI.
+        push @parts, $half ? q{
+--:-:-:-:1      ISETP.LT.AND P1, PT, n, param_N, P2;
+        } : qq{
+--:-:-:-:1      IADD n, n, $stepI;
+--:-:-:-:1      ISETP.LT.AND P1, PT, n, param_N, P2;
+        };
+        return join '', @parts;
+    }
+
+    sub output  # Returns the output-stage SASS: BAR.SYNC, eight alpha-scale + CAL STORE_O groups, EXIT, then the STORE_O routine.
+    {  # STORE_O is assembled from sections selected by $beta/$bias/$relu/$prelu/$brelu/$bprelu/$bsum/$half.
+        my $out = q{
+--:-:-:-:5      BAR.SYNC 0;
+        };
+
+        foreach my $y (0..7)  # one group per accumulator column cx*y$y
+        {
+            my $incK  = $y == 4 && !$remapF ? $stepF-3 : 1;  # k step before group 4 is $stepF-3 unless $remapF is set
+            my $stepK = $y ? "\n--:-:-:-:1      IADD k, k, $incK;" : "";  # no k increment before the first group
+
+            $out .= qq{$stepK
+--:-:-:-:1      FMUL cs0, cx0y$y, alpha;
+--:-:-:-:1      FMUL cs1, cx1y$y, alpha;
+--:-:-:-:1      FMUL cs2, cx2y$y, alpha;
+--:-:-:-:1      FMUL cs3, cx3y$y, alpha;
+--:-:-:-:1      FMUL cs4, cx4y$y, alpha;
+--:-:-:-:1      FMUL cs5, cx5y$y, alpha;
+--:-:-:-:1      FMUL cs6, cx6y$y, alpha;
+--:-:-:-:0      FMUL cs7, cx7y$y, alpha;
+--:-:-:-:5      CAL STORE_O;
+            };
+        }
+        $out .= q{
+
+--:-:-:-:5      EXIT;
+
+STORE_O:
+
+<SCHEDULE_BLOCK>
+30:-:-:-:1      XMAD offset, k, param_MPQN, out_offset;
+--:-:-:-:1      XMAD.PSL offset, k, param_MPQN.H1, offset;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n < N
+--:-:-:-:1      ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n < N
+        };
+
+        if ($beta || $brelu || $bprelu)  # these options read existing data through param_X into b0..b7
+        {
+            $out .= qq{
+--:-:-:-:1      LEA      Out0.CC, offset, param_X[0],     $addr_shift;
+--:-:-:-:1      LEA.HI.X Out1,    offset, param_X[1], RZ, $addr_shift;
+            };
+            $out .= $half ? q{
+--:-:5:-:2  @P2 LDG.E.128 b0, [Out];
+            } : q{
+--:-:5:-:1  @P2 LDG.E.128 b0, [Out + 4x<00>];
+--:-:6:-:1  @P3 LDG.E.128 b4, [Out + 4x<$stepI>];
+            };  # q{} does not interpolate: '$stepI' is emitted literally - presumably resolved later inside 4x<...>; confirm
+        }
+
+        $out .= q{
+--:-:-:-:1      LEA      Sum0.CC, k, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    k, param_Sum[1], RZ, 2;
+
+--:-:6:-:1  @P2 LDG.E.CI b0, [Sum];
+--:-:-:-:1 @!P2 MOV b0, RZ;
+        } if $bias;  # bias value is loaded from param_Sum at index k
+
+        $out .= q{
+<ORDERED>
+--:-:-:-:1      STS.128 [writeCs + 4x<00>], cs0;
+--:-:-:-:1      STS.128 [writeCs + 4x<$remapI ? 4 : $stepI>], cs4;
+--:-:1:-:1  @P2 LDS.U.128 out0, [readCs + 4x<00>];
+--:-:2:-:1  @P3 LDS.U.128 out4, [readCs + 4x<$half ? 4 : $stepI>];
+</ORDERED>
+</SCHEDULE_BLOCK>
+<SCHEDULE_BLOCK>
+        };  # the Perl expressions inside 4x<...> are emitted literally by q{} (not evaluated here)
+
+        $out .= q{
+21:-:-:-:1      FADD out0, out0, b0;
+--:-:-:-:1      FADD out1, out1, b0;
+--:-:-:-:1      FADD out2, out2, b0;
+--:-:-:-:1      FADD out3, out3, b0;
+02:-:-:-:1      FADD out4, out4, b0;
+--:-:-:-:1      FADD out5, out5, b0;
+--:-:-:-:1      FADD out6, out6, b0;
+--:-:-:-:1      FADD out7, out7, b0;
+        } if $bias;  # the same b0 is added to all eight outputs
+
+        $out .= q{
+01:-:-:-:1      FMNMX out0, out0, RZ, !PT;
+--:-:-:-:1      FMNMX out1, out1, RZ, !PT;
+--:-:-:-:1      FMNMX out2, out2, RZ, !PT;
+--:-:-:-:1      FMNMX out3, out3, RZ, !PT;
+02:-:-:-:1      FMNMX out4, out4, RZ, !PT;
+--:-:-:-:1      FMNMX out5, out5, RZ, !PT;
+--:-:-:-:1      FMNMX out6, out6, RZ, !PT;
+--:-:-:-:1      FMNMX out7, out7, RZ, !PT;
+        } if $relu;
+
+        $out .= q{
+// maximum(x, 0) + slope * minimum(0, x)
+01:-:-:-:1      FMNMX b0, out0, RZ, !PT;
+--:-:-:-:1      FMNMX b1, out1, RZ, !PT;
+--:-:-:-:1      FMNMX b2, out2, RZ, !PT;
+--:-:-:-:1      FMNMX b3, out3, RZ, !PT;
+02:-:-:-:1      FMNMX b4, out4, RZ, !PT;
+--:-:-:-:1      FMNMX b5, out5, RZ, !PT;
+--:-:-:-:1      FMNMX b6, out6, RZ, !PT;
+--:-:-:-:1      FMNMX b7, out7, RZ, !PT;
+
+--:-:-:-:1      FMNMX x0, out0, RZ, PT;
+--:-:-:-:1      FMNMX x1, out1, RZ, PT;
+--:-:-:-:1      FMNMX x2, out2, RZ, PT;
+--:-:-:-:1      FMNMX x3, out3, RZ, PT;
+--:-:-:-:1      FMNMX x4, out4, RZ, PT;
+--:-:-:-:1      FMNMX x5, out5, RZ, PT;
+--:-:-:-:1      FMNMX x6, out6, RZ, PT;
+--:-:-:-:1      FMNMX x7, out7, RZ, PT;
+
+--:-:-:-:1      FFMA out0, x0, param_beta, b0;
+--:-:-:-:1      FFMA out1, x1, param_beta, b1;
+--:-:-:-:1      FFMA out2, x2, param_beta, b2;
+--:-:-:-:1      FFMA out3, x3, param_beta, b3;
+--:-:-:-:1      FFMA out4, x4, param_beta, b4;
+--:-:-:-:1      FFMA out5, x5, param_beta, b5;
+--:-:-:-:1      FFMA out6, x6, param_beta, b6;
+--:-:-:-:1      FFMA out7, x7, param_beta, b7;
+        } if $prelu;  # param_beta is used as the slope in this section
+
+        $out .= q{
+</SCHEDULE_BLOCK>
+        };
+
+        $out .= q{
+13:-:-:-:1  @P2 F2F.F32.F16 b7, b3.H1;
+--:-:-:-:1  @P2 F2F.F32.F16 b6, b3.H0;
+--:-:-:-:1  @P2 F2F.F32.F16 b5, b2.H1;
+--:-:-:-:1  @P2 F2F.F32.F16 b4, b2.H0;
+--:-:-:-:1  @P2 F2F.F32.F16 b3, b1.H1;
+--:-:-:-:1  @P2 F2F.F32.F16 b2, b1.H0;
+--:-:-:-:1  @P2 F2F.F32.F16 b1, b0.H1;
+--:-:5:-:2  @P2 F2F.F32.F16 b0, b0.H0;
+        } if $half && ($beta || $brelu || $bprelu);  # widen the loaded halves from b3 down to b0 so no source is overwritten early
+
+        $out .= q{
+<SCHEDULE_BLOCK>
+        };
+
+        $out .= q{
+11:-:-:-:1  @P2 FFMA out0, b0, param_beta, out0;
+--:-:-:-:1  @P2 FFMA out1, b1, param_beta, out1;
+--:-:-:-:1  @P2 FFMA out2, b2, param_beta, out2;
+--:-:-:-:1  @P2 FFMA out3, b3, param_beta, out3;
+22:-:-:-:1  @P3 FFMA out4, b4, param_beta, out4;
+--:-:-:-:1  @P3 FFMA out5, b5, param_beta, out5;
+--:-:-:-:1  @P3 FFMA out6, b6, param_beta, out6;
+--:-:-:-:1  @P3 FFMA out7, b7, param_beta, out7;
+        } if $beta;
+
+        $out .= q{
+//delta *= (x > 0)
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+11:-:-:-:1      FSETP.GT.AND P0, PT, b0, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b1, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b2, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b3, RZ, PT;
+--:-:-:-:1 @!P0 MOV out0, RZ;
+--:-:-:-:1 @!P1 MOV out1, RZ;
+--:-:-:-:1 @!P2 MOV out2, RZ;
+--:-:-:-:1 @!P3 MOV out3, RZ;
+22:-:-:-:1      FSETP.GT.AND P0, PT, b4, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b5, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b6, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b7, RZ, PT;
+--:-:-:-:1 @!P0 MOV out4, RZ;
+--:-:-:-:1 @!P1 MOV out5, RZ;
+--:-:-:-:1 @!P2 MOV out6, RZ;
+--:-:-:-:1 @!P3 MOV out7, RZ;
+--:-:-:-:5      R2P PR, preds, 0x0f;
+        } if $brelu;  # P0..P3 are saved with P2R and restored with R2P around this section
+
+        $out .= q{
+//delta *= ((x > 0) + slope * (x < 0))
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+11:-:-:-:1      FSETP.GT.AND P0, PT, b0, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b1, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b2, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b3, RZ, PT;
+--:-:-:-:1      SEL x0, one, RZ, P0;
+--:-:-:-:1      SEL x1, one, RZ, P1;
+--:-:-:-:1      SEL x2, one, RZ, P2;
+--:-:-:-:1      SEL x3, one, RZ, P3;
+--:-:-:-:1      FSETP.LT.AND P0, PT, b0, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P1, PT, b1, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, b2, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P3, PT, b3, RZ, PT;
+--:-:-:-:1      SEL b0, one, RZ, P0;
+--:-:-:-:1      SEL b1, one, RZ, P1;
+--:-:-:-:1      SEL b2, one, RZ, P2;
+--:-:-:-:1      SEL b3, one, RZ, P3;
+--:-:-:-:1      FFMA b0, b0, param_beta, x0;
+--:-:-:-:1      FFMA b1, b1, param_beta, x1;
+--:-:-:-:1      FFMA b2, b2, param_beta, x2;
+--:-:-:-:1      FFMA b3, b3, param_beta, x3;
+--:-:-:-:1      FMUL out0, out0, b0;
+--:-:-:-:1      FMUL out1, out1, b1;
+--:-:-:-:1      FMUL out2, out2, b2;
+--:-:-:-:1      FMUL out3, out3, b3;
+22:-:-:-:1      FSETP.GT.AND P0, PT, b4, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b5, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b6, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b7, RZ, PT;
+--:-:-:-:1      SEL x4, one, RZ, P0;
+--:-:-:-:1      SEL x5, one, RZ, P1;
+--:-:-:-:1      SEL x6, one, RZ, P2;
+--:-:-:-:1      SEL x7, one, RZ, P3;
+--:-:-:-:1      FSETP.LT.AND P0, PT, b4, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P1, PT, b5, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, b6, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P3, PT, b7, RZ, PT;
+--:-:-:-:1      SEL b4, one, RZ, P0;
+--:-:-:-:1      SEL b5, one, RZ, P1;
+--:-:-:-:1      SEL b6, one, RZ, P2;
+--:-:-:-:1      SEL b7, one, RZ, P3;
+--:-:-:-:1      R2P PR, preds, 0x0f;
+--:-:-:-:1      FFMA b4, b4, param_beta, x4;
+--:-:-:-:1      FFMA b5, b5, param_beta, x5;
+--:-:-:-:1      FFMA b6, b6, param_beta, x6;
+--:-:-:-:1      FFMA b7, b7, param_beta, x7;
+--:-:-:-:1      FMUL out4, out4, b4;
+--:-:-:-:1      FMUL out5, out5, b5;
+--:-:-:-:1      FMUL out6, out6, b6;
+--:-:-:-:1      FMUL out7, out7, b7;
+        } if $bprelu;  # param_beta is used as the slope; P0..P3 are saved and restored as in the $brelu section
+
+        $out .= q{
+--:-:-:-:1 @!P2 MOV  sum0, RZ;
+--:-:-:-:1 @!P3 MOV  sum2, RZ;
+01:-:-:-:1  @P2 FADD sum0, out0, out1;
+--:-:-:-:1  @P2 FADD sum1, out2, out3;
+02:-:-:-:1  @P3 FADD sum2, out4, out5;
+--:-:-:-:1  @P3 FADD sum3, out6, out7;
+--:-:-:-:1  @P2 FADD sum0, sum0, sum1;
+--:-:-:-:1  @P3 FADD sum2, sum2, sum3;
+--:-:-:-:1      FADD sum0, sum0, sum2;
+        } if $bsum;  # per-thread sum of the eight outputs, with masked-out halves contributing zero
+
+        $out .= q{
+<ORDERED>
+01:-:-:-:1  @P2 F2F.F16.F32 out0, out0;
+--:-:-:-:1  @P2 F2F.F16.F32 out1, out1;
+--:-:-:-:1  @P2 F2F.F16.F32 out2, out2;
+--:-:1:-:1  @P2 F2F.F16.F32 out3, out3;
+02:-:-:-:1  @P2 F2F.F16.F32 out4, out4;
+--:-:-:-:1  @P2 F2F.F16.F32 out5, out5;
+--:-:-:-:1  @P2 F2F.F16.F32 out6, out6;
+--:-:2:-:1  @P2 F2F.F16.F32 out7, out7;
+</ORDERED>
+        } if $half;
+
+        $out .= q{
+</SCHEDULE_BLOCK>
+        };
+
+        $out .= $half ? qq{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LEA      Out0.CC, offset, param_O[0],     $addr_shift;
+--:-:-:-:1      LEA.HI.X Out1,    offset, param_O[1], RZ, $addr_shift;
+
+01:-:-:-:1  \@P2 BFI c0, out1, 0x1010, out0;
+--:-:-:-:1  \@P2 BFI c1, out3, 0x1010, out2;
+02:-:-:-:1  \@P2 BFI c2, out5, 0x1010, out4;
+--:-:-:-:1  \@P2 BFI c3, out7, 0x1010, out6;
+
+--:5:-:-:1  \@P2 STG.E.CG.128 [Out], c0;
+</SCHEDULE_BLOCK>
+        } : qq{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LEA      Out0.CC, offset, param_O[0],     $addr_shift;
+--:-:-:-:1      LEA.HI.X Out1,    offset, param_O[1], RZ, $addr_shift;
+
+01:-:-:-:1  \@P2 STG.E.CG.128 [Out + 4x<00>], out0;
+02:5:-:-:1  \@P3 STG.E.CG.128 [Out + 4x<$stepI>], out4;
+</SCHEDULE_BLOCK>
+        };  # qq{} here: $addr_shift and $stepI are interpolated now, and '@' is escaped to stay literal
+
+        $out .= q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      XMAD.LO2C offset, k, param_gridMPQN, bsum_offset;
+--:-:-:-:1      LEA      Sum0.CC, offset, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    offset, param_Sum[1], RZ, 2;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, k, param_K, P5; // k < K && tid31 == 0
+
+--:-:2:-:2      SHFL.BFLY PT, sum1, sum0, 1,  0x1f;
+02:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:2:-:2      SHFL.BFLY PT, sum1, sum0, 2,  0x1f;
+02:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:2:-:2      SHFL.BFLY PT, sum1, sum0, 4,  0x1f;
+02:-:-:-:2      FADD sum0, sum1, sum0;
+
+--:6:-:-:1  @P6 STG.E.CG [Sum], sum0;
+</SCHEDULE_BLOCK>
+        } if $bsum;  # three butterfly shuffles combine sum0 before the P6-guarded store
+
+        return $out . q{
+--:-:-:-:5      RET;
+        };  # explicit return: the result no longer depends on the value of a trailing '.=' statement
+    }
+
+-]
diff --git a/Kernel/Convolution/Pascal/hconv_bprop_C1_N64.sass b/Kernel/Convolution/Pascal/hconv_bprop_C1_N64.sass
new file mode 100644
index 0000000..fb00d82
--- /dev/null
+++ b/Kernel/Convolution/Pascal/hconv_bprop_C1_N64.sass
@@ -0,0 +1,663 @@
+# Kernel: hconv_bprop_C32_N64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    # Pick the 16-bit input flavour: signed integer when $int16 is true, F16 otherwise.
+    our $int16;
+    our ($convert, $dtype) = $int16 ? ('I2F.F32.S16', 'S16')
+                                    : ('F2F.F32.F16', 'U16');
+    sub convert { $convert }   # conversion opcode spliced into the template
+    sub dtype   { $dtype }     # type suffix spliced into the LDG.E.CI loads
+-]
+# Symbolic names for the shared lookup-table offset and for the parameter words in constant bank 0.
+<CONSTANT_MAPPING>
+    addr_lut : 4x<64*4>
+
+    param_I[0]         : c[0x0][0x140]
+    param_I[1]         : c[0x0][0x144]
+    param_E[0]         : c[0x0][0x148]
+    param_E[1]         : c[0x0][0x14c]
+    param_F[0]         : c[0x0][0x150]
+    param_F[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_N            : c[0x0][0x15c]
+    param_K            : c[0x0][0x160]
+    param_D            : c[0x0][0x164]
+    param_H            : c[0x0][0x168]
+    param_W            : c[0x0][0x16c]
+    param_WN           : c[0x0][0x170]
+    param_HWN          : c[0x0][0x174]
+    param_DHWN         : c[0x0][0x178]
+    param_C            : c[0x0][0x17c]
+    param_CRST         : c[0x0][0x180]
+    param_RST          : c[0x0][0x184]
+    param_magic_RST    : c[0x0][0x188]
+    param_shift_RST    : c[0x0][0x18c]
+    param_RS           : c[0x0][0x190]
+    param_magic_RS     : c[0x0][0x194]
+    param_shift_RS     : c[0x0][0x198]
+    param_S            : c[0x0][0x19c]
+    param_magic_S      : c[0x0][0x1a0]
+    param_shift_S      : c[0x0][0x1a4]
+    param_pad_d        : c[0x0][0x1a8]
+    param_pad_h        : c[0x0][0x1ac]
+    param_pad_w        : c[0x0][0x1b0]
+    param_str_d        : c[0x0][0x1b4]
+    param_str_h        : c[0x0][0x1b8]
+    param_str_w        : c[0x0][0x1bc]
+    param_Q            : c[0x0][0x1c0]
+    param_PQ           : c[0x0][0x1c4]
+    param_QN           : c[0x0][0x1c8]
+    param_PQN          : c[0x0][0x1cc]
+    param_MPQN         : c[0x0][0x1d0]
+    param_magic_Q      : c[0x0][0x1d4]
+    param_shift_Q      : c[0x0][0x1d8]
+    param_magic_PQ     : c[0x0][0x1dc]
+    param_shift_PQ     : c[0x0][0x1e0]
+    param_CRST8        : c[0x0][0x1e4]
+    param_MPQN8        : c[0x0][0x1e8]
+</CONSTANT_MAPPING>
+# Register allocation. Several ranges overlap (e.g. 64-79 is listed twice); presumably those names are live in different phases (TODO confirm before reusing one).
+<REGISTER_MAPPING>
+
+    0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+      64-67 ~ tid, blkE, blkF, blkMPQ
+
+     68-119 ~ k<0|4>, tidFX, tidEX, tid1, tid7, m, p, q, crst, n, tf<0|4>, te, te<0|4>, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+      64-79 : j0Ex<0-7>, j0Fy<0-7>
+      80-95 : j1Ex<0-7>, j1Fy<0-7>
+
+     96-103 : load0F<0-3>, load4F<0-3>
+     96-103 : store0F<0-3>, store4F<0-3>
+
+    104-107 : load0E<0-3>
+    104-107 : store0E<0-3>
+    112-115 : store0E<4-7>
+
+    108-111 : load4E<0-3>
+    108-111 : store4E<0-3>
+    112-115 : store4E<4-7>
+
+    116-119 : track0F<0-1>, track4F<0-1>
+    120-123 : track0E<0-1>, track4E<0-1>
+
+    124-127 ~ writeEs, writeFs, swapBuf, K
+    128-132 ~ readEs, readFs, mt, pr, qs
+
+     68-71  ~ lutStore, sliceI
+     72-132 ~ warp_cnt, rst, rs, t, r, s, x, y, z, x0, xW, y0, yH, z0, zD
+
+     72-93  : c<0-7>, cs<0-3>, trackI<0-1>, track00I<0-1>, track04I<0-1>, track08I<0-1>, track12I<0-1>
+     94-127 ~ crst<00|04|08|12>, c<00|04|08|12>, lut<00|04|08|12>, chan<00|04|08|12>, img<00|04|08|12>, writeCs, readCs, RST, DHWN1, alpha, nn, tid31
+
+</REGISTER_MAPPING>
+// Prologue: read thread/block ids, zero the accumulators, and derive all global/shared addresses.
+--:-:1:-:1      S2R tid,    SR_TID.X;
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X;
+--:-:3:-:1      S2R blkF,   SR_CTAID.Y;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+// tidFX  = (tid & 7) << 2
+// tidEX  = (tid & 7) << 3
+// k      = tid >> 3
+01:-:-:-:1      LOP.AND tid7,  tid,  7;
+--:-:-:-:1      SHL     tidFX, tid7, 2;
+--:-:-:-:1      SHL     tidEX, tid7, 3;
+--:-:-:-:1      SHR.U32 k0,    tid,  3;
+--:-:-:-:1      IADD    k4,    k0,   4;
+
+--:-:-:-:1      MOV K, param_K;
+
+--:-:-:-:1      STS.128 [RZ], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [RZ];\n", $_ * 4), 0..15;
+</CODE>
+
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_PQ;
+
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT;
+
+// m = blkMPQ / PQ (magic multiply + shift; shift only when magicPQ == 1)
+02:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ % PQ
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = pq / Q (same scheme with magicQ)
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = pq % Q
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+
+// mt = m * str_d - pad_d
+// pr = p * str_h - pad_h
+// qs = q * str_w - pad_w
+--:-:-:-:1      XMAD mt, m,   param_str_d, RZ;
+--:-:-:-:1      XMAD pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD qs, q,   param_str_w, RZ;
+--:-:-:-:1      IADD mt, mt, -param_pad_d;
+--:-:-:-:1      IADD pr, pr, -param_pad_h;
+--:-:-:-:1      IADD qs, qs, -param_pad_w;
+
+// crst = blkF*32 + tidFX
+// n    = blkE*64 + tidEX
+04:-:-:-:1      ISCADD crst, blkF, tidFX, 5;
+08:-:-:-:1      ISCADD n,    blkE, tidEX, 6;
+
+// tf = k*CRST + crst; trackF = F + tf*2 (16-bit elements)
+--:-:-:-:1      XMAD     tf0, k0, param_CRST, crst;
+--:-:-:-:1      XMAD     tf4, k4, param_CRST, crst;
+--:-:-:-:1      LEA      track0F0.CC, tf0, param_F[0],     1;
+--:-:-:-:1      LEA.HI.X track0F1,    tf0, param_F[1], RZ, 1;
+--:-:-:-:1      LEA      track4F0.CC, tf4, param_F[0],     1;
+--:-:-:-:1      LEA.HI.X track4F1,    tf4, param_F[1], RZ, 1;
+
+// te = k*MPQN + m*PQN + p*QN + q*N + n; trackE = E + te*2
+--:-:-:-:1      XMAD      te,  q,  param_N,    n;
+--:-:-:-:1      XMAD.LO2C te,  p,  param_QN,   te;
+--:-:-:-:1      XMAD.LO2C te,  m,  param_PQN,  te;
+--:-:-:-:1      XMAD.LO2C te0, k0, param_MPQN, te;
+--:-:-:-:1      XMAD.LO2C te4, k4, param_MPQN, te;
+--:-:-:-:1      LEA       track0E0.CC, te0, param_E[0],     1;
+--:-:-:-:1      LEA.HI.X  track0E1,    te0, param_E[1], RZ, 1;
+--:-:-:-:1      LEA       track4E0.CC, te4, param_E[0],     1;
+--:-:-:-:1      LEA.HI.X  track4E1,    te4, param_E[1], RZ, 1;
+
+// P1 = crst < CRST
+// P2 = n    < N
+// (no P3 here: each thread's single 128-bit E load covers 8 consecutive n)
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst, param_CRST, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, n,    param_N,    PT;
+
+// Remap the EX dim to avoid bank conflicts when storing to shared
+// We can unmap this in the output
+
+// writeFs = (32*k + tidFX) * 4
+--:-:-:-:1      ISCADD  writeFs, k0, tidFX, 5;
+--:-:-:-:1      SHL     writeFs, writeFs,   2;
+// writeEs = (64*k + tidFX) * 4 + 32*8*4 (tidFX here not a bug; E sits after the F tile)
+--:-:-:-:1      ISCADD  writeEs, k0, tidFX, 6;
+--:-:-:-:1      ISCADD  writeEs, writeEs, 4x<32*8>, 2;
+
+// readFs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readEs = (((tid >> 1) & 7) << 4) + 32*8*4
+--:-:-:-:1      BFE.U32 readEs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<32*8>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<32*8 + 64*8>;
+</SCHEDULE_BLOCK>
+// Pipeline fill: fetch the first K slice of F and E, widen it to F32 and stage it in shared memory.
+--:-:-:-:0      IADD K, K, -8;
+
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F0, [track0F + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F1, [track0F + 2x<1>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F2, [track0F + 2x<2>];
+--:-:1:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F3, [track0F + 2x<3>];
+
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F0, [track4F + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F1, [track4F + 2x<1>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F2, [track4F + 2x<2>];
+--:-:2:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F3, [track4F + 2x<3>];
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, K, RZ, P1;
+
+--:-:3:-:1  @P2 LDG.E.128 load0E0, [track0E];
+--:-:4:-:1  @P2 LDG.E.128 load4E0, [track4E];
+
+--:-:-:-:0      ISETP.GT.AND P2, PT, K, RZ, P2;
+// Convert each loaded slice to F32, advance its global pointer (CRST8 / MPQN8 step), store it to shared.
+01:-:-:-:1      [+ convert() +] store0F0, load0F0;
+--:-:-:-:1      [+ convert() +] store0F1, load0F1;
+--:-:-:-:1      [+ convert() +] store0F2, load0F2;
+--:-:1:-:1      [+ convert() +] store0F3, load0F3;
+--:-:-:-:6      IADD   track0F0.CC, track0F0, param_CRST8;
+--:-:-:-:0      IADD.X track0F1,    track0F1, RZ;
+01:-:-:-:1      STS.128 [writeFs + 4x<0*32>], store0F;
+
+02:-:-:-:1      [+ convert() +] store4F0, load4F0;
+--:-:-:-:1      [+ convert() +] store4F1, load4F1;
+--:-:-:-:1      [+ convert() +] store4F2, load4F2;
+--:-:2:-:1      [+ convert() +] store4F3, load4F3;
+--:-:-:-:6      IADD   track4F0.CC, track4F0, param_CRST8;
+--:-:-:-:0      IADD.X track4F1,    track4F1, RZ;
+02:-:-:-:1      STS.128 [writeFs + 4x<4*32>], store4F;
+
+04:-:-:-:1      [+ convert() +] store0E7, load0E3.H1;
+--:-:-:-:1      [+ convert() +] store0E6, load0E3.H0;
+--:-:-:-:1      [+ convert() +] store0E5, load0E2.H1;
+--:-:1:-:1      [+ convert() +] store0E4, load0E2.H0;
+--:-:-:-:1      [+ convert() +] store0E3, load0E1.H1;
+--:-:-:-:1      [+ convert() +] store0E2, load0E1.H0;
+--:-:-:-:1      [+ convert() +] store0E1, load0E0.H1;
+--:-:2:-:1      [+ convert() +] store0E0, load0E0.H0;
+--:-:-:-:6      IADD   track0E0.CC, track0E0, param_MPQN8;
+--:-:-:-:0      IADD.X track0E1,    track0E1, RZ;
+01:-:-:-:1      STS.128 [writeEs + 4x<0*64 + 32>], store0E4;
+02:1:-:-:2      STS.128 [writeEs + 4x<0*64 +  0>], store0E0;
+
+09:-:-:-:1      [+ convert() +] store4E7, load4E3.H1;
+--:-:-:-:1      [+ convert() +] store4E6, load4E3.H0;
+--:-:-:-:1      [+ convert() +] store4E5, load4E2.H1;
+--:-:1:-:1      [+ convert() +] store4E4, load4E2.H0;
+--:-:-:-:1      [+ convert() +] store4E3, load4E1.H1;
+--:-:-:-:1      [+ convert() +] store4E2, load4E1.H0;
+--:-:-:-:1      [+ convert() +] store4E1, load4E0.H1;
+--:-:2:-:1      [+ convert() +] store4E0, load4E0.H0;
+--:-:-:-:6      IADD   track4E0.CC, track4E0, param_MPQN8;
+--:-:-:-:0      IADD.X track4E1,    track4E1, RZ;
+01:-:-:-:1      STS.128 [writeEs + 4x<4*64 + 32>], store4E4;
+02:1:-:-:2      STS.128 [writeEs + 4x<4*64 +  0>], store4E0;
+
+// Point the write addresses at the other shared buffer (swapBuf then flips sign) and fetch the second slice.
+01:-:-:-:1      IADD writeEs, writeEs, swapBuf;
+--:-:-:-:1      IADD writeFs, writeFs, swapBuf;
+--:-:-:-:2      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD K, K, -8;
+// First shared reads feeding the j0 registers of the main loop.
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*32 + 16>];
+
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F0, [track0F + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F1, [track0F + 2x<1>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F2, [track0F + 2x<2>];
+--:-:2:-:1  @P1 LDG.E.CI.[+ dtype() +] load0F3, [track0F + 2x<3>];
+
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F0, [track4F + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F1, [track4F + 2x<1>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F2, [track4F + 2x<2>];
+--:-:3:-:1  @P1 LDG.E.CI.[+ dtype() +] load4F3, [track4F + 2x<3>];
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, K, RZ, P1;
+
+--:-:4:-:1  @P2 LDG.E.128 load0E0, [track0E];
+--:-:5:-:1  @P2 LDG.E.128 load4E0, [track4E];
+
+--:-:-:-:2      ISETP.GT.AND P2, PT, K, RZ, P2;
+
+NEXT_8K:
+--:-:-:-:1      ISETP.GT.AND P0, PT, K, -8, PT;
+// P0 (K > -8) gates the stores, the buffer swap and the loop branch generated below.
+[+
+    our $convert;
+    our $dtype;
+    my %insert =
+    (
+        j0c8  => "--:-:-:-:1      IADD K, K, -8;\n",
+        # track0F: convert and stage the slice fetched on the previous pass, then fetch the next one.
+        j0c12 => "02:-:-:-:1  \@P0 $convert store0F0, load0F0;\n",
+        j0c16 => "--:-:-:-:1  \@P0 $convert store0F1, load0F1;\n",
+        j0c20 => "--:-:-:-:1  \@P0 $convert store0F2, load0F2;\n",
+        j0c24 => "--:-:2:-:1  \@P0 $convert store0F3, load0F3;\n",
+        j0c26 => "--:-:-:-:1  \@P0 IADD   track0F0.CC, track0F0, param_CRST8;\n",
+        j0c31 => "--:-:-:-:1  \@P0 IADD.X track0F1,    track0F1, RZ;\n",
+        j0c38 => "02:2:-:-:1  \@P0 STS.128 [writeFs + 4x<0*32>], store0F;\n",
+        j1c8  => "02:-:-:-:1  \@P1 LDG.E.CI.$dtype load0F0, [track0F + 2x<0>];\n",
+        j1c10 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype load0F1, [track0F + 2x<1>];\n",
+        j1c12 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype load0F2, [track0F + 2x<2>];\n",
+        j1c14 => "--:-:2:-:1  \@P1 LDG.E.CI.$dtype load0F3, [track0F + 2x<3>];\n",
+        # track4F: same convert / store / refetch sequence, placed in the j2 and j3 steps.
+        j2c12 => "04:-:-:-:1  \@P0 $convert store4F0, load4F0;\n",
+        j2c16 => "--:-:-:-:1  \@P0 $convert store4F1, load4F1;\n",
+        j2c20 => "--:-:-:-:1  \@P0 $convert store4F2, load4F2;\n",
+        j2c24 => "--:-:3:-:1  \@P0 $convert store4F3, load4F3;\n",
+        j2c26 => "--:-:-:-:1  \@P0 IADD   track4F0.CC, track4F0, param_CRST8;\n",
+        j2c31 => "--:-:-:-:1  \@P0 IADD.X track4F1,    track4F1, RZ;\n",
+        j2c38 => "04:3:-:-:1  \@P0 STS.128 [writeFs + 4x<4*32>], store4F;\n",
+        j3c8  => "04:-:-:-:1  \@P1 LDG.E.CI.$dtype load4F0, [track4F + 2x<0>];\n",
+        j3c10 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype load4F1, [track4F + 2x<1>];\n",
+        j3c12 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype load4F2, [track4F + 2x<2>];\n",
+        j3c14 => "--:-:3:-:1  \@P1 LDG.E.CI.$dtype load4F3, [track4F + 2x<3>];\n",
+        # track0E: widen the eight 16-bit lanes of the 128-bit load, store both halves, refetch under P2.
+        j4c12 => "08:-:-:-:1  \@P0 $convert store0E7, load0E3.H1;\n",
+        j4c16 => "--:-:-:-:1  \@P0 $convert store0E6, load0E3.H0;\n",
+        j4c20 => "--:-:-:-:1  \@P0 $convert store0E5, load0E2.H1;\n",
+        j4c24 => "--:-:6:-:1  \@P0 $convert store0E4, load0E2.H0;\n",
+        j4c28 => "--:-:-:-:1  \@P0 $convert store0E3, load0E1.H1;\n",
+        j4c32 => "--:-:-:-:1  \@P0 $convert store0E2, load0E1.H0;\n",
+        j4c36 => "--:-:-:-:1  \@P0 $convert store0E1, load0E0.H1;\n",
+        j4c40 => "--:-:4:-:1  \@P0 $convert store0E0, load0E0.H0;\n",
+        j4c42 => "20:-:-:-:1  \@P0 STS.128 [writeEs + 4x<0*64 + 32>], store0E4;\n",
+        j4c44 => "--:-:-:-:1  \@P0 IADD   track0E0.CC, track0E0, param_MPQN8;\n",
+        j4c49 => "--:-:-:-:1  \@P0 IADD.X track0E1,    track0E1, RZ;\n",
+        j4c56 => "08:4:-:-:1  \@P0 STS.128 [writeEs + 4x<0*64 +  0>], store0E0;\n",
+        j5c8  => "08:-:4:-:1  \@P2 LDG.E.128 load0E0, [track0E];\n",
+        # track4E: same sequence as track0E, placed in the j5 and j6 steps.
+        j5c12 => "10:-:-:-:1  \@P0 $convert store4E7, load4E3.H1;\n",
+        j5c16 => "--:-:-:-:1  \@P0 $convert store4E6, load4E3.H0;\n",
+        j5c20 => "--:-:-:-:1  \@P0 $convert store4E5, load4E2.H1;\n",
+        j5c24 => "--:-:6:-:1  \@P0 $convert store4E4, load4E2.H0;\n",
+        j5c28 => "--:-:-:-:1  \@P0 $convert store4E3, load4E1.H1;\n",
+        j5c32 => "--:-:-:-:1  \@P0 $convert store4E2, load4E1.H0;\n",
+        j5c36 => "--:-:-:-:1  \@P0 $convert store4E1, load4E0.H1;\n",
+        j5c40 => "--:-:5:-:1  \@P0 $convert store4E0, load4E0.H0;\n",
+        j5c42 => "20:-:-:-:1  \@P0 STS.128 [writeEs + 4x<4*64 + 32>], store4E4;\n",
+        j5c44 => "--:-:-:-:1  \@P0 IADD   track4E0.CC, track4E0, param_MPQN8;\n",
+        j5c49 => "--:-:-:-:1  \@P0 IADD.X track4E1,    track4E1, RZ;\n",
+        j5c56 => "10:5:-:-:1  \@P0 STS.128 [writeEs + 4x<4*64 +  0>], store4E0;\n",
+        j6c8  => "10:-:5:-:1  \@P2 LDG.E.128 load4E0, [track4E];\n",
+        # Flip the read and write pointers between the two shared buffers.
+        j6c63 => "20:-:-:-:1  \@P0 IADD readEs,  readEs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeEs, writeEs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Carry each load predicate forward: keep its bounds term (P1: crst, P2: n) and also require K > 0.
+        j7c8  => "--:-:-:-:1      ISETP.GT.AND P1, PT, K, RZ, P1;\n",
+        j7c10 => "--:-:-:-:1      ISETP.GT.AND P2, PT, K, RZ, P2;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U NEXT_8K;\n",
+    );
+    # FFMA visit order: 2x2 register blocks walked in a swirl, with the y sweep reversed on alternate x pairs.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+        my $barrier  = $j == 6 ? '6' : '-';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:%s:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*32 + 16>];\n", $barrier, $rsPred, $nOdd, $rsOffset;
+        # 64 FFMAs per j; whatever is registered for this (j, c) slot is emitted right after its FFMA.
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /LDS|I2F|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+// Second phase: reload thread/block ids and fill the shared lookup table of image offsets, one entry per rst.
+--:-:-:-:0      MOV warp_cnt, 32;
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkF, SR_CTAID.Y;
+--:-:3:-:1      S2R blkE, SR_CTAID.Z;
+01:-:-:-:6      MOV rst,  tid;
+
+LUT_LOOP:
+
+<SCHEDULE_BLOCK>
+// warp synchronous loop while warp_cnt < RST (c=0)
+--:-:-:-:1      ISETP.LT.AND P0, PT, warp_cnt, param_RST, PT;
+--:-:-:-:1      IADD warp_cnt, warp_cnt, 32;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// x = qs + s
+// y = pr + r
+// z = mt + t
+--:-:-:-:1      IADD z, mt, t;
+--:-:-:-:1      IADD y, pr, r;
+--:-:-:-:1      IADD x, qs, s;
+// i = (z*HWN + y*WN + x*N) * 2 (byte offset; elements are 16 bits wide)
+20:-:-:-:1      XMAD.LO2C sliceI, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y, param_WN,  sliceI;
+--:-:-:-:1      XMAD      sliceI, x, param_N,   sliceI;
+--:-:-:-:1      SHL       sliceI, sliceI, 1;
+// Bounds check x, y and z, and make i negative if outside
+--:-:-:-:1      ISET.LT.AND x0, x, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW, x,  param_W, PT;
+--:-:-:-:1      ISET.LT.AND y0, y, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH, y,  param_H, PT;
+--:-:-:-:1      ISET.LT.AND z0, z, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD, z,  param_D, PT;
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, x0, xW, 0xfe;
+<ORDERED>
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, y0, yH, 0xfe;
+--:-:-:-:1      SHL lutStore, rst, 2;
+--:-:-:-:1      IADD rst, rst, 32;
+</ORDERED>
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, z0, zD, 0xfe;
+// Store i imgOffset into the shared lookup table
+--:6:-:-:1      STS [lutStore + addr_lut], sliceI;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P0 BRA.U LUT_LOOP;
+
+// Output setup: derive the writeCs/readCs staging addresses, the crst counters and the I base pointer used by STORE_C.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV RST,       param_RST;
+--:-:-:-:1      MOV DHWN1,     param_DHWN;
+--:-:-:-:1      SHL DHWN1,     DHWN1, 1;
+
+--:-:-:-:1      LOP.AND readEs, readEs, 0x7f;
+--:-:-:-:1      LOP.AND readFs, readFs, 0x3f;
+
+// Expand back out to undo our bank conflict avoiding stride
+--:-:-:-:1      SHL readEs, readEs, 1;
+
+// writeCs = ((readFs / 4) * 64 + readEs) / 2;
+--:-:-:-:1      ISCADD  writeCs, readFs, readEs, 4;
+--:-:-:-:1      SHR.U32 writeCs, writeCs, 1;
+
+// readCs = (tid & 31) << 2;
+--:-:-:-:1      LOP.AND tid31,  tid,   31;
+--:-:-:-:1      SHL     readCs, tid31, 2;
+
+// nn = blkE*64 + tid31 << 1;
+--:-:-:-:1      SHL tid31, tid31, 1;
+04:-:-:-:1      ISCADD nn, blkE, tid31, 6;
+
+// crst = blkF*32
+02:-:-:-:1      SHL  crst00, blkF,   5;
+--:-:-:-:1      IADD crst04, crst00, 4;
+--:-:-:-:1      IADD crst08, crst00, 8;
+--:-:-:-:1      IADD crst12, crst00, 12;
+
+--:-:-:-:1      LEA      trackI0.CC, nn, param_I[0],     1;
+--:-:-:-:1      LEA.HI.X trackI1,    nn, param_I[1], RZ, 1;
+
+// P5 = nn < N
+--:-:-:-:1      ISETP.LT.AND P5, PT, nn, param_N, PT;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+    # Epilogue generator: for each of the 8 accumulator rows y, scale the row by
+    # alpha into c0-c7 and call STORE_C.
+    my @counters = qw(crst00 crst04 crst08 crst12);
+    my @asm;
+
+    for my $row (0 .. 7)
+    {
+        # After four STORE_C calls (each adds 1) the counters are 4 ahead;
+        # adding 12 puts them 16 past their start for rows 4-7.
+        push @asm, map("--:-:-:-:1      IADD $_, $_, 12;\n", @counters)
+            if $row == 4;
+
+        for my $col (0 .. 7)
+        {
+            # The eighth FMUL keeps its original stall count of 0.
+            my $stall = $col == 7 ? 0 : 1;
+            push @asm, "--:-:-:-:$stall      FMUL c$col, cx${col}y$row, alpha;\n";
+        }
+
+        push @asm, "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+
+    # Concatenate in emission order.
+    return join '', @asm;
+
+</CODE>
+// All eight rows have been handed to STORE_C by the generated code above; the routine below is entered only via CAL.
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// STORE_C: round c0-c7 to F16, pack pairs, exchange them through shared memory, then RED-add into I.
+// Round nearest
+--:-:-:-:1      F2F.F16.F32 c0, c0;
+--:-:1:-:1      F2F.F16.F32 c1, c1;
+--:-:-:-:1      F2F.F16.F32 c2, c2;
+--:-:2:-:1      F2F.F16.F32 c3, c3;
+--:-:-:-:1      F2F.F16.F32 c4, c4;
+--:-:3:-:1      F2F.F16.F32 c5, c5;
+--:-:-:-:1      F2F.F16.F32 c6, c6;
+--:-:4:-:1      F2F.F16.F32 c7, c7;
+
+// Pack 2 16 bit values into 32 bit words
+11:-:-:-:2      BFI cs0, c1, 0x1010, c0;
+02:-:-:-:2      BFI cs1, c3, 0x1010, c2;
+24:-:-:-:2      BFI cs2, c5, 0x1010, c4;
+08:-:-:-:0      BFI cs3, c7, 0x1010, c6;
+
+// Undo the stride in the X dim (items spaced by 32 are actually spaced 4)
+--:-:-:-:4      STS.64 [writeCs+2x<0>], cs0;
+--:-:-:-:1      STS.64 [writeCs+2x<4>], cs2;
+--:-:-:-:1      LDS cs0, [readCs + 2x<0*64>];
+--:-:-:-:1      LDS cs1, [readCs + 2x<1*64>];
+--:-:-:-:1      LDS cs2, [readCs + 2x<2*64>];
+--:-:-:-:1      LDS cs3, [readCs + 2x<3*64>];
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5;
+// c = crst / RST (magic multiply, then shift)
+--:-:-:-:1      XMAD.LO2C c00, crst00, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c04, crst04, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c08, crst08, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c12, crst12, param_magic_RST, RZ;
+
+--:-:-:-:1      SHR.U32 c00, c00, param_shift_RST;
+--:-:-:-:1      SHR.U32 c04, c04, param_shift_RST;
+--:-:-:-:1      SHR.U32 c08, c08, param_shift_RST;
+--:-:-:-:1      SHR.U32 c12, c12, param_shift_RST;
+// lut = (crst - c*RST) * 4, i.e. the byte index of (crst % RST) in the lookup table
+--:-:-:-:1      VMAD.U16.U16 lut00, -c00, RST, crst00;
+--:-:-:-:1      VMAD.U16.U16 lut04, -c04, RST, crst04;
+--:-:-:-:1      VMAD.U16.U16 lut08, -c08, RST, crst08;
+--:-:-:-:1      VMAD.U16.U16 lut12, -c12, RST, crst12;
+
+--:-:-:-:1      SHL lut00, lut00, 2;
+--:-:-:-:1      SHL lut04, lut04, 2;
+--:-:-:-:1      SHL lut08, lut08, 2;
+--:-:-:-:1      SHL lut12, lut12, 2;
+// chan = c * DHWN1, where DHWN1 is DHWN * 2 (a byte offset)
+--:-:-:-:1      XMAD.LO2 chan00, DHWN1, c00, RZ;
+--:-:-:-:1      XMAD.LO2 chan04, DHWN1, c04, RZ;
+--:-:-:-:1      XMAD.LO2 chan08, DHWN1, c08, RZ;
+--:-:-:-:1      XMAD.LO2 chan12, DHWN1, c12, RZ;
+// step each crst counter by one for the next call
+--:-:-:-:1      IADD crst00, crst00, 1;
+--:-:-:-:1      IADD crst04, crst04, 1;
+--:-:-:-:1      IADD crst08, crst08, 1;
+--:-:-:-:1      IADD crst12, crst12, 1;
+// img = image offset stored by LUT_LOOP for this rst (negative when out of bounds)
+--:-:1:-:1  @P0 LDS img00, [lut00 + addr_lut];
+--:-:2:-:1  @P1 LDS img04, [lut04 + addr_lut];
+--:-:3:-:1  @P2 LDS img08, [lut08 + addr_lut];
+--:-:4:-:1  @P3 LDS img12, [lut12 + addr_lut];
+
+</SCHEDULE_BLOCK>
+// Address = trackI + img + chan; each predicate additionally drops entries whose img is negative.
+01:-:-:-:1      IADD3  track00I0.CC, trackI0, img00, chan00;
+--:-:-:-:5      ISETP.GE.AND P0, PT, img00, RZ, P0;
+--:-:-:-:1      IADD.X track00I1,    trackI1, RZ;
+
+02:-:-:-:1      IADD3  track04I0.CC, trackI0, img04, chan04;
+--:-:-:-:5      ISETP.GE.AND P1, PT, img04, RZ, P1;
+--:-:-:-:1      IADD.X track04I1,    trackI1, RZ;
+
+04:-:-:-:1      IADD3  track08I0.CC, trackI0, img08, chan08;
+--:-:-:-:5      ISETP.GE.AND P2, PT, img08, RZ, P2;
+--:-:-:-:1      IADD.X track08I1,    trackI1, RZ;
+
+08:-:-:-:1      IADD3  track12I0.CC, trackI0, img12, chan12;
+--:-:-:-:5      ISETP.GE.AND P3, PT, img12, RZ, P3;
+--:-:-:-:0      IADD.X track12I1,    trackI1, RZ;
+
+--:-:-:-:2  @P0 RED.E.ADD.F16x2.FTZ.RN [track00I], cs0;
+--:5:-:-:2  @P1 RED.E.ADD.F16x2.FTZ.RN [track04I], cs1;
+--:-:-:-:4  @P2 RED.E.ADD.F16x2.FTZ.RN [track08I], cs2;
+--:6:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [track12I], cs3;
+
+--:-:-:-:5      RET;
+
diff --git a/Kernel/Convolution/Pascal/hconv_updat_C128_K128.sass b/Kernel/Convolution/Pascal/hconv_updat_C128_K128.sass
new file mode 100644
index 0000000..d6c9c15
--- /dev/null
+++ b/Kernel/Convolution/Pascal/hconv_updat_C128_K128.sass
@@ -0,0 +1,775 @@
+# Kernel: hconv_updat_C128_K128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    # Choose the 16-bit -> F32 conversion opcode: I2F when $int16 is true, else F2F.
+    our ($int16, $convert);
+    $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';
+    sub convert { $convert }    # accessor used by the inline convert() templates
+-]
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<(128*16 + 32)*4>        // zero source: setup stores RZ here (STS.128); starts after 4 * szBuf
+    addr_blkIE : 4x<(128*16 + 32)*4 + 4>    // blkI/blkE pair, written with STS.64 and reloaded at NEXT_PQ
+    addr_q     : 4x<(128*16 + 32)*4 + 6>    // starting q of this block, reloaded when the kernel advances p
+    szBuf      : (128*16 + 32)              // size of one I or E staging buffer (the +32 looks like padding - confirm)
+    // param_* values below live in constant bank 0 at consecutive 4-byte offsets from 0x140.
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_offset_K     : c[0x0][0x15c]
+    param_N            : c[0x0][0x160]
+    param_K            : c[0x0][0x164]
+    param_D            : c[0x0][0x168]
+    param_H            : c[0x0][0x16c]
+    param_W            : c[0x0][0x170]
+    param_WN           : c[0x0][0x174]
+    param_HWN          : c[0x0][0x178]
+    param_DHWN         : c[0x0][0x17c]
+    param_C            : c[0x0][0x180]
+    param_CRST         : c[0x0][0x184]
+    param_RST          : c[0x0][0x188]
+    param_magic_RST    : c[0x0][0x18c]
+    param_shift_RST    : c[0x0][0x190]
+    param_RS           : c[0x0][0x194]
+    param_magic_RS     : c[0x0][0x198]
+    param_shift_RS     : c[0x0][0x19c]
+    param_S            : c[0x0][0x1a0]
+    param_magic_S      : c[0x0][0x1a4]
+    param_shift_S      : c[0x0][0x1a8]
+    param_pad_d        : c[0x0][0x1ac]
+    param_pad_h        : c[0x0][0x1b0]
+    param_pad_w        : c[0x0][0x1b4]
+    param_str_d        : c[0x0][0x1b8]
+    param_str_h        : c[0x0][0x1bc]
+    param_str_w        : c[0x0][0x1c0]
+    param_dil_d        : c[0x0][0x1c4]
+    param_dil_h        : c[0x0][0x1c8]
+    param_dil_w        : c[0x0][0x1cc]
+    param_P            : c[0x0][0x1d0]
+    param_Q            : c[0x0][0x1d4]
+    param_PQ           : c[0x0][0x1d8]
+    param_QN           : c[0x0][0x1dc]
+    param_PQN          : c[0x0][0x1e0]
+    param_MPQN         : c[0x0][0x1e4]
+    param_magic_Q      : c[0x0][0x1e8]
+    param_shift_Q      : c[0x0][0x1ec]
+    param_magic_PQ     : c[0x0][0x1f0]
+    param_shift_PQ     : c[0x0][0x1f4]
+    param_grid_P       : c[0x0][0x1f8]
+    param_grid_Q       : c[0x0][0x1fc]
+    param_grid_PQ      : c[0x0][0x200]
+    param_CRSTK        : c[0x0][0x204]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // Registers 0-63 are the accumulators; czero<NN> and cx<x>y<y> are two names for the same registers.
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+    // Registers 64-95 are reused by phase: setup scalars, NEXT_PQ scalars, then the j0/j1 FFMA operands.
+    64-65   : one
+    64-65   : blkIE<0-1>
+    64-68   : blkI, blkE, tid, tidX, tidY
+    69-95   ~ blkMPQ, tid1, tid7, tid128, shiftX, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+    69-95   ~ c, z, y, x, k, te, mt, pr, qs, r, s, t, rs, rst, crst, ti, xw, xW, yh, yH, zd, zD, cC, nextP, nextQ, Q
+
+    64-79   : j0Ex<0-7>, j0Iy<0-7>
+    80-95   : j1Ex<0-7>, j1Iy<0-7>
+    // Staging for global loads: load*<0-3> and store*<0-3> alias, and storeI<4-7>/storeE<4-7> share 112-115.
+     96-99  : loadI<0-3>
+     96-99  : storeI<0-3>
+    100-103 : loadI<4-7>
+    112-115 : storeI<4-7>
+
+    104-107 : loadE<0-3>
+    104-107 : storeE<0-3>
+    108-111 : loadE<4-7>
+    112-115 : storeE<4-7>
+
+    116-119 : trackI<0-1>, trackE<0-1>
+
+    120-124 ~ writeS, loopN, m, p, q
+    125-127 ~ readIs, readEs, swapBuf
+    // Epilogue names (FINISH / STORE_C); they overlap the loop registers above.
+     72-87  : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
+    88-124  ~ writeCs, readCs, K1, K60, crst<00|04|08|12>, alpha, K, K4, tid31, tid96, kk, tf, t128, blk_MPQ, CRSTK, xmad_determ
+
+
+</REGISTER_MAPPING>
+
+--:-:-:-:0      MOV one, 1; // Prologue: pick which grid axis carries blkMPQ / blkI / blkE
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT; // P0 = (RST == 1)
+--:-:-:-:5  @P0 BRA.U CTAID1;
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X; // RST != 1: MPQ on x, I on y, E on z
+--:-:3:-:1      S2R blkI,   SR_CTAID.Y;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
+--:-:-:-:5      BRA.U END_CTAID1;
+CTAID1:
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.Z; // RST == 1: I on x, E on y, MPQ on z
+--:-:3:-:1      S2R blkI,   SR_CTAID.X;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Y;
+END_CTAID1:
+
+<SCHEDULE_BLOCK>
+// tidX   = tid >> 1
+// tidY   = (tid & 1) << 3
+// shiftX = (tid & 1) << 4
+01:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      SHR.U32 tidX,   tid,  1;
+--:-:-:-:1      SHL     tidY,   tid1, 3;
+--:-:-:-:1      SHL     shiftX, tid1, 4;
+// Park blkI/blkE in shared memory; NEXT_PQ reloads them from addr_blkIE.
+0c:-:-:-:1      STS.64 [addr_blkIE], blkIE;
+// Zero the accumulators: store RZ at addr_zero, then load it back into czero00..czero63.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; # 16 loads, czero00, czero04, ... czero60
++]
+
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_grid_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_grid_PQ;
+// A magic value of 1 selects the shift-only path (@!P1 / @!P2) in the divisions below.
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT;
+
+// m = blkMPQ / PQ   (multiply by magic_PQ, then shift by shift_PQ)
+02:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ % PQ, computed as blkMPQ - m * grid_PQ
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = pq / Q   (multiply by magic_Q, then shift by shift_Q)
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = pq % Q, computed as pq - p * grid_Q
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+// Keep the starting q in shared memory; the main loop reloads it from addr_q when p advances.
+--:-:-:-:1      STS [addr_q], q;
+
+// writeS = (tidY*128 + tidX + shiftX) * 4 + 4x<szBuf * 2>
+--:-:-:-:1      ISCADD writeS, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeS, writeS, shiftX;
+--:-:-:-:1      ISCADD writeS, writeS, 4x<szBuf * 2>, 2;
+
+// readIs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readIs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      LOP.OR  readIs, readIs, tid1;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// readEs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4x<szBuf>
+--:-:-:-:1      LOP.AND tid128, tid,    128;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readEs, tid128, 4;
+--:-:-:-:1      LOP.OR  readEs, readEs, tid7;
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<szBuf>, 4;
+// swapBuf starts at minus the 4x<szBuf * 2> offset already added to writeS; its sign flips on every swap.
+--:-:-:-:1      MOV32I swapBuf, -4x<szBuf * 2>;
+
+--:-:-:-:1      MOV loopN, RZ;
+
+// Flag for first load branch (P0 = true)
+--:-:-:-:1      PSETP.AND.AND P0, PT, PT, PT, PT;
+
+</SCHEDULE_BLOCK>
+
+NEXT_PQ:
+// Rebuild the I/E global pointers and the bounds predicates for the current (m, p, q).
+--:-:2:-:1      S2R tid, SR_TID.X;
+--:-:3:-:1      LDS.U.64 blkIE, [addr_blkIE];
+
+<SCHEDULE_BLOCK>
+// Zigzag q but only if grid_P < P: on odd p use Q = param_Q - 1 - q, else Q = q
+--:-:-:-:1      LOP.AND.NZ P1, RZ, p, 1;
+--:-:-:-:1      MOV Q, param_grid_P;
+--:-:-:-:1      ISETP.LT.AND P1, PT, Q, param_P, P1;
+--:-:-:-:1      MOV Q, -1;
+--:-:-:-:1  @P1 IADD3 Q, -q, param_Q, Q;
+--:-:-:-:1 @!P1 MOV Q, q;
+// tidX   = tid >> 1
+// tidY   = (tid & 1) << 3
+--:-:-:-:1      LOP.AND tidY,   tid,  1;
+02:-:-:-:1      SHR.U32 tidX,   tid,  1;
+--:-:-:-:1      SHL     tidY,   tidY, 3;
+// crst = blkI*128 + tidX
+04:-:-:-:1      ISCADD crst, blkI, tidX, 7;
+// k = blkE*128 + tidX + offset_K
+04:-:-:-:1      ISCADD k, blkE, tidX, 7;
+--:-:-:-:1      IADD   k, k, param_offset_K;
+
+// c   = crst / RST
+// rst = crst % RST
+--:-:-:-:1      XMAD.LO2C c, crst, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32   c, c, param_shift_RST;
+--:-:-:-:1      XMAD rst, c, param_RST, RZ;
+--:-:-:-:1      IADD rst, -rst, crst;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// x = Q * str_w - pad_w + (s * dil_w)   (Q is the zigzagged q from above)
+// y = p * str_h - pad_h + (r * dil_h)
+// z = m * str_d - pad_d + (t * dil_d)
+--:-:-:-:1      XMAD  qs, Q,  param_str_w, RZ;
+--:-:-:-:1      XMAD  pr, p,  param_str_h, RZ;
+--:-:-:-:1      XMAD  mt, m,  param_str_d, RZ;
+--:-:-:-:1      XMAD  x,  s,  param_dil_w, qs;
+--:-:-:-:1      XMAD  y,  r,  param_dil_h, pr;
+--:-:-:-:1      XMAD  z,  t,  param_dil_d, mt;
+--:-:-:-:1      IADD  x,  x, -param_pad_w;
+--:-:-:-:1      IADD  y,  y, -param_pad_h;
+--:-:-:-:1      IADD  z,  z, -param_pad_d;
+// ti = c*DHWN + z*HWN + y*WN + x*N + tidY;  trackI = param_I + ti*2
+--:-:-:-:1      XMAD      ti, x, param_N,    tidY;
+--:-:-:-:1      XMAD.LO2C ti, y, param_WN,   ti;
+--:-:-:-:1      XMAD.LO2C ti, z, param_HWN,  ti;
+--:-:-:-:1      XMAD.LO2C ti, c, param_DHWN, ti;
+--:-:-:-:1      LEA      trackI0.CC, ti, param_I[0],     1;
+--:-:-:-:1      LEA.HI.X trackI1,    ti, param_I[1], RZ, 1;
+// te = k*MPQN + m*PQN + p*QN + Q*N + tidY;  trackE = param_E + te*2
+--:-:-:-:1      XMAD      te, Q, param_N,    tidY;
+--:-:-:-:1      XMAD.LO2C te, p, param_QN,   te;
+--:-:-:-:1      XMAD.LO2C te, m, param_PQN,  te;
+--:-:-:-:1      XMAD.LO2C te, k, param_MPQN, te;
+--:-:-:-:1      LEA      trackE0.CC, te, param_E[0],     1;
+--:-:-:-:1      LEA.HI.X trackE1,    te, param_E[1], RZ, 1;
+// Bounds check x,y,z,c for each I track.
+// If out of bounds, this will set the track address to -1 (low word trackI0 only; LUT 0xfe = a|b|c)
+--:-:-:-:1      ISET.GE.AND cC, c, param_C, PT;
+--:-:-:-:1      ISET.LT.AND zd, z, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD, z, param_D, PT;
+--:-:-:-:1      ISET.LT.AND yh, y, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH, y, param_H, PT;
+--:-:-:-:1      ISET.LT.AND xw, x, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW, x, param_W, PT;
+--:-:-:-:1      LOP.OR   trackI0, trackI0, cC;
+--:-:-:-:1      LOP3.LUT trackI0, trackI0, zd, zD, 0xfe;
+--:-:-:-:1      LOP3.LUT trackI0, trackI0, yh, yH, 0xfe;
+--:-:-:-:1      LOP3.LUT trackI0, trackI0, xw, xW, 0xfe;
+
+01:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+// P2: I address in bounds, P4: k < K, P5 / P6: another q / p step remains for this block.
+--:-:-:-:0      ISETP.NE.AND P2, PT, trackI0, -1, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, k, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, nextP, param_P, PT;
+--:-:-:-:1      IADD loopN, loopN, param_N;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:5  @P0 BRA.U FIRST_LOAD; // P0 is true only on the first pass (set in setup, cleared at FIRST_LOAD)
+
+INIT_LOOP:
+// Prime the j0 operand registers from row 0 of the E and I shared buffers.
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*128 + 64>];
+--:-:1:-:2      LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>];
+
+NEXT_16N:
+
+[+
+    # Main loop body generator: 16 unroll steps (j) of 64 FFMAs (c) each, with extra instructions woven in.
+    our $convert;
+    my %insert =    # key "j<J>c<C>": text emitted right after FFMA number C of unroll step J
+    (
+        j0c8   => "--:-:-:-:1      IADD loopN, loopN, -16;\n",
+
+        # p0 = (N & 16) == 0
+        # p1 = N >= 32 && p0
+        j0c14   => "--:-:-:-:1      LOP.AND.NZ P0, RZ, loopN, 16;\n",
+        j0c28   => "--:-:-:-:1      ISETP.GE.AND P1, PT, loopN, 32, P0;\n",
+        # Unpack eight 16-bit I values: @P0 takes them from loadI4-7, @!P0 from loadI0-3.
+        j2c34   => "--:-:-:-:1  \@P0 $convert storeI7, loadI7.H1;\n",
+        j2c38   => "--:-:-:-:1  \@P0 $convert storeI6, loadI7.H0;\n",
+        j2c42   => "--:-:-:-:1  \@P0 $convert storeI5, loadI6.H1;\n",
+        j2c46   => "--:-:-:-:1  \@P0 $convert storeI4, loadI6.H0;\n",
+        j2c50   => "--:-:-:-:1  \@P0 $convert storeI3, loadI5.H1;\n",
+        j2c54   => "--:-:-:-:1  \@P0 $convert storeI2, loadI5.H0;\n",
+        j2c58   => "--:-:-:-:1  \@P0 $convert storeI1, loadI4.H1;\n",
+        j2c62   => "--:-:-:-:1  \@P0 $convert storeI0, loadI4.H0;\n",
+
+        j3c34   => "02:-:-:-:1 \@!P0 $convert storeI7, loadI3.H1;\n",
+        j3c38   => "--:-:-:-:1 \@!P0 $convert storeI6, loadI3.H0;\n",
+        j3c42   => "--:-:-:-:1 \@!P0 $convert storeI5, loadI2.H1;\n",
+        j3c46   => "--:-:5:-:1 \@!P0 $convert storeI4, loadI2.H0;\n",
+        j3c50   => "--:-:-:-:1 \@!P0 $convert storeI3, loadI1.H1;\n",
+        j3c54   => "--:-:-:-:1 \@!P0 $convert storeI2, loadI1.H0;\n",
+        j3c58   => "--:-:-:-:1 \@!P0 $convert storeI1, loadI0.H1;\n",
+        j3c62   => "--:-:2:-:1 \@!P0 $convert storeI0, loadI0.H0;\n",
+        # Store the converted I values to shared, then issue the next I load (zeros from addr_zero when out of bounds).
+        j4c8    => "10:-:-:-:1      STS [writeS + 4x<7*128>], storeI7;\n",
+        j4c10   => "--:-:-:-:1      STS [writeS + 4x<6*128>], storeI6;\n",
+        j4c12   => "--:-:-:-:1      STS [writeS + 4x<5*128>], storeI5;\n",
+        j4c14   => "--:-:-:-:1      STS [writeS + 4x<4*128>], storeI4;\n",
+        j4c16   => "02:-:-:-:1      STS [writeS + 4x<3*128>], storeI3;\n",
+        j4c18   => "--:-:-:-:1      STS [writeS + 4x<2*128>], storeI2;\n",
+        j4c20   => "--:-:-:-:1      STS [writeS + 4x<1*128>], storeI1;\n",
+        j4c22   => "--:2:-:-:1      STS [writeS + 4x<0*128>], storeI0;\n",
+
+        j4c24   => "--:-:-:-:1      ISETP.NE.AND P2, PT, trackI0, -1, P1;\n",
+        j4c26   => "--:-:-:-:1      ISETP.EQ.AND P3, PT, trackI0, -1, P1;\n",
+
+        j5c8    => "02:-:-:-:1  \@P2 LDG.E.CI.128 loadI0, [trackI + 2x< 0>];\n",
+        j5c10   => "--:5:2:-:1  \@P2 LDG.E.CI.128 loadI4, [trackI + 2x<16>];\n",
+
+        j6c8    => "--:-:-:-:1  \@P3 LDS.U.128 loadI0, [addr_zero];\n",
+        j7c8    => "--:-:-:-:1  \@P3 LDS.U.128 loadI4, [addr_zero];\n",
+
+        j7c57   => "10:-:-:-:1  \@P2 IADD   trackI0.CC, trackI0, 2x<32>;\n",
+        j7c63   => "--:-:-:-:1  \@P2 IADD.X trackI1,    trackI1, RZ;\n",
+
+        # Same convert / store / reload sequence for E, in unroll steps j10-j15.
+        j10c34  => "--:-:-:-:1  \@P0 $convert storeE7, loadE7.H1;\n",
+        j10c38  => "--:-:-:-:1  \@P0 $convert storeE6, loadE7.H0;\n",
+        j10c42  => "--:-:-:-:1  \@P0 $convert storeE5, loadE6.H1;\n",
+        j10c46  => "--:-:-:-:1  \@P0 $convert storeE4, loadE6.H0;\n",
+        j10c50  => "--:-:-:-:1  \@P0 $convert storeE3, loadE5.H1;\n",
+        j10c54  => "--:-:-:-:1  \@P0 $convert storeE2, loadE5.H0;\n",
+        j10c58  => "--:-:-:-:1  \@P0 $convert storeE1, loadE4.H1;\n",
+        j10c62  => "--:-:-:-:1  \@P0 $convert storeE0, loadE4.H0;\n",
+
+        j11c34  => "04:-:-:-:1 \@!P0 $convert storeE7, loadE3.H1;\n",
+        j11c38  => "--:-:-:-:1 \@!P0 $convert storeE6, loadE3.H0;\n",
+        j11c42  => "--:-:-:-:1 \@!P0 $convert storeE5, loadE2.H1;\n",
+        j11c46  => "--:-:5:-:1 \@!P0 $convert storeE4, loadE2.H0;\n",
+        j11c50  => "--:-:-:-:1 \@!P0 $convert storeE3, loadE1.H1;\n",
+        j11c54  => "--:-:-:-:1 \@!P0 $convert storeE2, loadE1.H0;\n",
+        j11c58  => "--:-:-:-:1 \@!P0 $convert storeE1, loadE0.H1;\n",
+        j11c62  => "--:-:3:-:1 \@!P0 $convert storeE0, loadE0.H0;\n",
+
+        j12c8   => "10:-:-:-:1      STS [writeS + 4x<7*128 + szBuf>], storeE7;\n",
+        j12c10  => "--:-:-:-:1      STS [writeS + 4x<6*128 + szBuf>], storeE6;\n",
+        j12c12  => "--:-:-:-:1      STS [writeS + 4x<5*128 + szBuf>], storeE5;\n",
+        j12c14  => "--:-:-:-:1      STS [writeS + 4x<4*128 + szBuf>], storeE4;\n",
+        j12c16  => "04:-:-:-:1      STS [writeS + 4x<3*128 + szBuf>], storeE3;\n",
+        j12c18  => "--:-:-:-:1      STS [writeS + 4x<2*128 + szBuf>], storeE2;\n",
+        j12c20  => "--:-:-:-:1      STS [writeS + 4x<1*128 + szBuf>], storeE1;\n",
+        j12c22  => "--:3:-:-:1      STS [writeS + 4x<0*128 + szBuf>], storeE0;\n",
+
+        j12c24  => "--:-:-:-:1      PSETP.AND.AND P2, PT, P1, P4, PT;\n",
+
+        j13c8   => "04:-:-:-:1  \@P2 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];\n",
+        j13c10  => "--:5:3:-:1  \@P2 LDG.E.CI.128 loadE4, [trackE + 2x<16>];\n",
+
+        j15c57  => "10:-:-:-:1  \@P2 IADD   trackE0.CC, trackE0, 2x<32>;\n",
+        j15c62  => "--:-:-:-:1  \@P2 IADD.X trackE1,    trackE1, RZ;\n",
+
+        # p0 = N >= 16 and not (N == 32 and (p or q))
+        j14c8   => "--:-:-:-:1      ISETP.EQ.AND  P0, PT, loopN, 32, PT;\n",
+        j14c10  => "--:-:-:-:1      ISETP.GE.AND  P1, PT, loopN, 16, PT;\n",
+        j14c22  => "--:-:-:-:1      PSETP.OR.AND  P0, PT, P5, P6, P0;\n",
+        j14c35  => "--:-:-:-:1      PSETP.AND.AND P0, PT, !P0, P1, PT;\n",
+        # Barrier, then swap the shared buffers (read pointers and writeS trade halves).
+        j14c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                   "20:-:-:-:1      IADD readEs, readEs, -swapBuf;\n" .
+                   "--:-:-:-:1      IADD readIs, readIs, -swapBuf;\n" .
+                   "--:-:-:-:1      IADD writeS, writeS,  swapBuf;\n" .
+                   "--:-:-:-:1      IADD swapBuf, RZ,    -swapBuf;\n",
+        # Loop exit: continue this N, step q, step p (restoring q from addr_q), or finish.
+        j15c63  => "--:-:-:Y:5  \@P0 BRA.U NEXT_16N;\n" .
+                   "--:-:-:-:0  \@P5 IADD q, q, param_grid_Q;\n" .
+                   "01:-:-:Y:5  \@P5 BRA.U NEXT_PQ;\n" .
+                   "--:-:1:-:1  \@P6 LDS q, [addr_q];\n" .
+                   "--:-:-:-:0  \@P6 IADD p, p, param_grid_P;\n" .
+                   "--:-:-:Y:5  \@P6 BRA.U NEXT_PQ;\n" .
+                   "--:-:-:Y:5      BRA.U FINISH;\n",
+    );
+    # FFMA issue order over the 8x8 accumulator tile: 2x2 swirls, with the y walk reversed for every other x pair.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    my $out;
+    foreach my $j (0 .. 15)
+    {
+        my $odd      = $j & 1;                      # operand register set (j0 / j1) read by this step's FFMAs
+        my $nOdd     = 1 - $odd;                    # the other set, loaded here for the next step
+        my $rsOffset = ($j + 1) & 15;               # shared-memory row of the next step (wraps to 0 after 15)
+        my $rsPred   = $j == 15 ? '@P0' : '   ';    # the wrap-around load is only needed when the loop continues
+        my $shift    = $rsOffset < 8 ? 0 : 1;       # rows 8-15 get an extra 16 added inside 4x<>
+        my $barrier  = $j == 14 ? '6' : '-';        # step 14 sets barrier 6 in the second control field
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c6"} = sprintf "--:%s:1:-:1  %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # The FFMA gets stall 0 when a memory / convert / barrier / branch instruction follows it.
+            my $stall  = $ins =~ /LDS|I2I|F2F|I2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+            # Yield flag on FFMA 32 of every step, unless that FFMA has stall 0.
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+            # The first FFMA of a step waits on barrier 1, which the operand LDS at c6 sets.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
++]
+
+FIRST_LOAD:
+// First pass only: issue the first I and E loads, convert them and store them to shared (no FFMA work).
+--:-:-:-:8      PSETP.AND.AND P0, PT, PT, PT, !PT; // P0 = false, so NEXT_PQ does not branch here again
+
+// P1 = (loopN == 32) and (P5 or P6); the OR is applied by the PSETP further down
+--:-:-:-:0      ISETP.EQ.AND  P1, PT, loopN, 32, PT;
+
+--:-:-:-:1  @P2 LDG.E.CI.128 loadI0, [trackI + 2x< 0>];
+--:-:1:-:1  @P2 LDG.E.CI.128 loadI4, [trackI + 2x<16>];
+--:-:-:-:1 @!P2 LDS.U.128    loadI0, [addr_zero];
+--:-:5:-:1 @!P2 LDS.U.128    loadI4, [addr_zero];
+
+--:-:-:-:1  @P4 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];
+--:-:2:-:1  @P4 LDG.E.CI.128 loadE4, [trackE + 2x<16>];
+--:-:-:-:1 @!P4 LDS.U.128    loadE0, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128    loadE4, [addr_zero];
+// Unpack loadI0-3 (two 16-bit halves per register) into storeI0-7.
+11:-:-:-:1      [+ convert() +] storeI7, loadI3.H1;
+--:-:-:-:1      [+ convert() +] storeI6, loadI3.H0;
+--:-:-:-:1      [+ convert() +] storeI5, loadI2.H1;
+--:-:1:-:1      [+ convert() +] storeI4, loadI2.H0;
+--:-:-:-:1      [+ convert() +] storeI3, loadI1.H1;
+--:-:-:-:1      [+ convert() +] storeI2, loadI1.H0;
+--:-:-:-:1      [+ convert() +] storeI1, loadI0.H1;
+--:-:5:-:1      [+ convert() +] storeI0, loadI0.H0;
+
+--:-:-:-:1      PSETP.OR.AND  P1, PT, P5, P6, P1;
+
+--:-:-:-:0  @P2 IADD   trackI0.CC, trackI0, 2x<32>;
+
+01:-:-:-:1      STS [writeS + 4x<7*128>], storeI7;
+--:-:-:-:1      STS [writeS + 4x<6*128>], storeI6;
+--:-:-:-:1      STS [writeS + 4x<5*128>], storeI5;
+--:-:-:-:1      STS [writeS + 4x<4*128>], storeI4;
+10:-:-:-:1      STS [writeS + 4x<3*128>], storeI3;
+--:-:-:-:1      STS [writeS + 4x<2*128>], storeI2;
+--:-:-:-:1      STS [writeS + 4x<1*128>], storeI1;
+--:1:-:-:2      STS [writeS + 4x<0*128>], storeI0;
+
+--:-:-:-:0  @P2 IADD.X trackI1,    trackI1, RZ;
+// Same unpack for E: loadE0-3 into storeE0-7.
+23:-:-:-:1      [+ convert() +] storeE7, loadE3.H1;
+--:-:-:-:1      [+ convert() +] storeE6, loadE3.H0;
+--:-:-:-:1      [+ convert() +] storeE5, loadE2.H1;
+--:-:2:-:1      [+ convert() +] storeE4, loadE2.H0;
+--:-:-:-:1      [+ convert() +] storeE3, loadE1.H1;
+--:-:-:-:1      [+ convert() +] storeE2, loadE1.H0;
+--:-:-:-:1      [+ convert() +] storeE1, loadE0.H1;
+--:-:6:-:1      [+ convert() +] storeE0, loadE0.H0;
+// Keep P5 / P6 only when P1 holds (loopN == 32 and another q or p step remains).
+--:-:-:-:2      PSETP.AND.AND P5, PT, P5, P1, PT;
+--:-:-:-:1      PSETP.AND.AND P6, PT, P6, P1, PT;
+--:-:-:-:0  @P4 IADD   trackE0.CC, trackE0, 2x<32>;
+
+02:-:-:-:1      STS [writeS + 4x<7*128 + szBuf>], storeE7;
+--:-:-:-:1      STS [writeS + 4x<6*128 + szBuf>], storeE6;
+--:-:-:-:1      STS [writeS + 4x<5*128 + szBuf>], storeE5;
+--:-:-:-:1      STS [writeS + 4x<4*128 + szBuf>], storeE4;
+20:-:-:-:1      STS [writeS + 4x<3*128 + szBuf>], storeE3;
+--:-:-:-:1      STS [writeS + 4x<2*128 + szBuf>], storeE2;
+--:-:-:-:1      STS [writeS + 4x<1*128 + szBuf>], storeE1;
+--:1:-:-:1      STS [writeS + 4x<0*128 + szBuf>], storeE0;
+
+--:-:-:-:1  @P4 IADD.X trackE1,    trackE1, RZ;
+// Swap buffers: the read pointers move to the half just written, writeS moves to the other half.
+--:-:-:-:1      IADD readEs, readEs, -swapBuf;
+--:-:-:-:0      IADD readIs, readIs, -swapBuf;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeS, writeS,  swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ,    -swapBuf;
+
+--:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+
+--:-:-:-:0  @P5 IADD q, q, param_grid_Q;
+--:-:-:Y:5  @P5 BRA.U NEXT_PQ;
+--:-:-:-:0  @P6 IADD p, p, param_grid_P;
+--:-:-:Y:5  @P6 BRA.U NEXT_PQ;
+// Neither branch taken: recompute P5 / P6 (they were masked by P1 above) and enter the loop.
+--:-:-:-:2      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
+--:-:-:-:0      ISETP.LT.AND P6, PT, nextP, param_P, PT;
+
+--:-:-:Y:5      BRA.U INIT_LOOP;
+
+
+FINISH:
+// Epilogue setup: re-read the CTA ids (same axis selection as the prologue) and build the F output pointers.
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
+--:-:-:-:5  @P0 BRA.U CTAID2;
+--:-:2:-:1      S2R blkI,    SR_CTAID.Y;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Z;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.X;
+--:-:-:-:5      BRA.U END_CTAID2;
+CTAID2:
+--:-:2:-:1      S2R blkI,    SR_CTAID.X;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Y;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.Z;
+END_CTAID2:
+
+<SCHEDULE_BLOCK>
+// Strip the buffer offsets: remove 4x<szBuf> from readEs and, if swapBuf > 0, the pending swap offset.
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readEs,  readEs, -4x<szBuf>;
+--:-:-:-:1  @P0 IADD readIs,  readIs, -swapBuf;
+--:-:-:-:1  @P0 IADD readEs,  readEs, -swapBuf;
+
+// writeCs = (readIs / 4) * 128 + readEs;
+--:-:-:-:1      ISCADD  writeCs, readIs, readEs, 5;
+
+--:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      LOP.AND tid96,  tid,  96;
+01:-:-:-:1      LOP.AND t128,   tid, 128;
+
+// kk = tid31 | (t128 >> 2);
+--:-:-:-:1      SHR.U32  kk, t128, 2;
+--:-:-:-:1      LOP.OR   kk, tid31,  kk;
+
+// readCs = ((tid96 << 4) | kk) << 2;
+--:-:-:-:1      SHL      readCs, tid96,  4;
+--:-:-:-:1      LOP.OR   readCs, readCs, kk;
+--:-:-:-:1      SHL      readCs, readCs, 2;
+
+// kk += blkE*128 + offset_K;
+04:-:-:-:1      ISCADD kk, blkE, kk, 7;
+--:-:-:-:1      IADD   kk, kk, param_offset_K;
+
+// crst = blkI*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 crst00, tid96, 1;
+02:-:-:-:1      ISCADD  crst00, blkI, crst00, 7;
+--:-:-:-:1      IADD    crst04, crst00,  4;
+--:-:-:-:1      IADD    crst08, crst00,  8;
+--:-:-:-:1      IADD    crst12, crst00,  12;
+
+// K1 = 4*K, K4 = 16*K, K60 = 256*K - 16*K = 240*K
+--:-:-:-:1      MOV K, param_K;
+--:-:-:-:1      SHL K1, K, 2;
+--:-:-:-:1      SHL K4, K, 4;
+--:-:-:-:1      ISCADD K60, K, -K4, 8;
+
+// tf = crst00*K + kk (plus blk_MPQ*CRSTK when $determ is set); track00F = param_F + tf*4
+--:-:-:-:1      VMAD.U16.U16 tf, crst00, K, kk;
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+--:-:-:-:1      MOV CRSTK, param_CRSTK;
+08:-:-:-:1      XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ;
+        };
+    }
+    return '';
++]
+--:-:-:-:1      LEA      track00F0.CC, tf, param_F[0],     0x2;
+--:-:-:-:1      LEA.HI.X track00F1,    tf, param_F[1], RZ, 0x2;
+
+// P5 = kk < K, P6 = kk + 64 < K
+--:-:-:-:1      ISETP.LT.AND P5, PT, kk, param_K, PT;
+--:-:-:-:1      IADD kk, kk, 64;
+--:-:-:-:1      ISETP.LT.AND P6, PT, kk, param_K, PT;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+</SCHEDULE_BLOCK>
+// track04F / track08F / track12F follow track00F in steps of K4 (16*K) bytes.
+--:-:-:-:6      IADD   track04F0.CC, track00F0, K4;
+--:-:-:-:1      IADD.X track04F1,    track00F1, RZ;
+--:-:-:-:6      IADD   track08F0.CC, track04F0, K4;
+--:-:-:-:1      IADD.X track08F1,    track04F1, RZ;
+--:-:-:-:6      IADD   track12F0.CC, track08F0, K4;
+--:-:-:-:0      IADD.X track12F1,    track08F1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+[+
+    # Epilogue driver: one pass per accumulator row (0..7).  Each pass scales
+    # cx0y<row>..cx7y<row> by alpha into f0..f7 and then calls STORE_C.
+    # Emitted once, ahead of row 4: every track pointer advances by K60 and
+    # every crst counter by 60.
+    my $advance =
+        "--:-:-:-:5      IADD   track00F0.CC, track00F0, K60;\n" .
+        "--:-:-:-:1      IADD   crst00,       crst00,     60;\n" .
+        "--:-:-:-:1      IADD.X track00F1,    track00F1,  RZ;\n" .
+        "--:-:-:-:5      IADD   track04F0.CC, track04F0, K60;\n" .
+        "--:-:-:-:1      IADD   crst04,       crst04,     60;\n" .
+        "--:-:-:-:1      IADD.X track04F1,    track04F1,  RZ;\n" .
+        "--:-:-:-:5      IADD   track08F0.CC, track08F0, K60;\n" .
+        "--:-:-:-:1      IADD   crst08,       crst08,     60;\n" .
+        "--:-:-:-:1      IADD.X track08F1,    track08F1,  RZ;\n" .
+        "--:-:-:-:5      IADD   track12F0.CC, track12F0, K60;\n" .
+        "--:-:-:-:1      IADD   crst12,       crst12,     60;\n" .
+        "--:-:-:-:1      IADD.X track12F1,    track12F1,  RZ;\n\n";
+
+    # Every %d receives the row index; only the f7 line carries stall 0.
+    my $scale =
+        "--:-:-:-:1      FMUL f0, cx0y%d, alpha;\n" .
+        "--:-:-:-:1      FMUL f1, cx1y%d, alpha;\n" .
+        "--:-:-:-:1      FMUL f2, cx2y%d, alpha;\n" .
+        "--:-:-:-:1      FMUL f3, cx3y%d, alpha;\n" .
+        "--:-:-:-:1      FMUL f4, cx4y%d, alpha;\n" .
+        "--:-:-:-:1      FMUL f5, cx5y%d, alpha;\n" .
+        "--:-:-:-:1      FMUL f6, cx6y%d, alpha;\n" .
+        "--:-:-:-:0      FMUL f7, cx7y%d, alpha;\n";
+
+    return join '', map {
+        my $lead = $_ == 4 ? $advance : '';
+        $lead . sprintf($scale, ($_) x 8) . "--:-:-:-:5      CAL STORE_C;\n\n"
+    } 0 .. 7;
++]
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Writes one row of results (f0-f7): four track pointers, each at offset 0 and at offset 4x<64>.
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K
+--:-:-:-:1      IADD         crst00, crst00, 1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K
+--:-:-:-:1      IADD         crst04, crst04, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K
+--:-:-:-:1      IADD         crst08, crst08, 1;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K
+--:-:-:-:0      IADD         crst12, crst12, 1;
+
+// Warp shuffle through shared memory (STS at writeCs, LDS at readCs) to drop the awkward readIs/readEs mapping
+--:-:-:-:1      STS.128 [writeCs+4x<00>], f0;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], f4;
+
+--:-:1:-:1      LDS f0, [readCs + 4x<0*128 + 00>];
+--:-:2:-:1      LDS f2, [readCs + 4x<1*128 + 00>];
+--:-:3:-:1      LDS f4, [readCs + 4x<2*128 + 00>];
+--:-:4:-:a      LDS f6, [readCs + 4x<3*128 + 00>];
+// First column: write f0/f2/f4/f6, then fold P6 (kk + 64 < K) into P0-P3 for the second column.
+[+
+    our $determ; # true: plain stores (STG.E.CG); false: reductions (RED.E.ADD.F32)
+    if ($determ)
+    {
+        return q{
+01:-:-:-:1  @P0 STG.E.CG [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 STG.E.CG [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 STG.E.CG [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 STG.E.CG [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
+    else
+    {
+        return q{
+01:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
++]
+// Second column: f1/f3/f5/f7 go to the same track pointers at offset 4x<64>.
+--:-:1:-:1      LDS f1, [readCs + 4x<0*128 + 64>];
+--:-:2:-:1      LDS f3, [readCs + 4x<1*128 + 64>];
+--:-:3:-:1      LDS f5, [readCs + 4x<2*128 + 64>];
+--:-:4:-:a      LDS f7, [readCs + 4x<3*128 + 64>];
+
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+01:1:-:-:1  @P0 STG.E.CG [track00F + 4x<64>], f1;
+02:2:-:-:1  @P1 STG.E.CG [track04F + 4x<64>], f3;
+04:3:-:-:1  @P2 STG.E.CG [track08F + 4x<64>], f5;
+08:4:-:-:1  @P3 STG.E.CG [track12F + 4x<64>], f7;
+        };
+    }
+    else
+    {
+        return q{
+01:1:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<64>], f1;
+02:2:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<64>], f3;
+04:3:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<64>], f5;
+08:4:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<64>], f7;
+        };
+    }
++]
+// Step every track pointer by K1 (4*K bytes) so the next call writes the following crst row.
+01:-:-:-:6      IADD   track00F0.CC, track00F0, K1;
+--:-:-:-:1      IADD.X track00F1,    track00F1, RZ;
+02:-:-:-:6      IADD   track04F0.CC, track04F0, K1;
+--:-:-:-:1      IADD.X track04F1,    track04F1, RZ;
+04:-:-:-:6      IADD   track08F0.CC, track08F0, K1;
+--:-:-:-:1      IADD.X track08F1,    track08F1, RZ;
+08:-:-:-:6      IADD   track12F0.CC, track12F0, K1;
+--:-:-:-:0      IADD.X track12F1,    track12F1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/Convolution/Pascal/hconv_updat_C128_K64.sass b/Kernel/Convolution/Pascal/hconv_updat_C128_K64.sass
new file mode 100644
index 0000000..a40fcb8
--- /dev/null
+++ b/Kernel/Convolution/Pascal/hconv_updat_C128_K64.sass
@@ -0,0 +1,860 @@
+# Kernel: hconv_updat_C128_K64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    # Select the opcode used to convert 16-bit inputs to fp32: the signed
+    # integer form when $int16 is set, the half-float form otherwise.
+    our $int16;
+    our $convert = 'F2F.F32.F16';
+    $convert = 'I2F.F32.S16' if $int16;
+
+    # Accessor used by the inline templates further down to splice in the opcode.
+    sub convert
+    {
+        return $convert;
+    }
+-]
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*2 + (64*16 + 32)*2>
+    szShareI  : (128*16 + 32)
+    szShareE  : (64*16  + 32)
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_offset_K     : c[0x0][0x15c]
+    param_N            : c[0x0][0x160]
+    param_K            : c[0x0][0x164]
+    param_D            : c[0x0][0x168]
+    param_H            : c[0x0][0x16c]
+    param_W            : c[0x0][0x170]
+    param_WN           : c[0x0][0x174]
+    param_HWN          : c[0x0][0x178]
+    param_DHWN         : c[0x0][0x17c]
+    param_C            : c[0x0][0x180]
+    param_CRST         : c[0x0][0x184]
+    param_RST          : c[0x0][0x188]
+    param_magic_RST    : c[0x0][0x18c]
+    param_shift_RST    : c[0x0][0x190]
+    param_RS           : c[0x0][0x194]
+    param_magic_RS     : c[0x0][0x198]
+    param_shift_RS     : c[0x0][0x19c]
+    param_S            : c[0x0][0x1a0]
+    param_magic_S      : c[0x0][0x1a4]
+    param_shift_S      : c[0x0][0x1a8]
+    param_pad_d        : c[0x0][0x1ac]
+    param_pad_h        : c[0x0][0x1b0]
+    param_pad_w        : c[0x0][0x1b4]
+    param_str_d        : c[0x0][0x1b8]
+    param_str_h        : c[0x0][0x1bc]
+    param_str_w        : c[0x0][0x1c0]
+    param_dil_d        : c[0x0][0x1c4]
+    param_dil_h        : c[0x0][0x1c8]
+    param_dil_w        : c[0x0][0x1cc]
+    param_P            : c[0x0][0x1d0]
+    param_Q            : c[0x0][0x1d4]
+    param_PQ           : c[0x0][0x1d8]
+    param_QN           : c[0x0][0x1dc]
+    param_PQN          : c[0x0][0x1e0]
+    param_MPQN         : c[0x0][0x1e4]
+    param_magic_Q      : c[0x0][0x1e8]
+    param_shift_Q      : c[0x0][0x1ec]
+    param_magic_PQ     : c[0x0][0x1f0]
+    param_shift_PQ     : c[0x0][0x1f4]
+    param_grid_P       : c[0x0][0x1f8]
+    param_grid_Q       : c[0x0][0x1fc]
+    param_grid_PQ      : c[0x0][0x200]
+    param_CRSTK        : c[0x0][0x204]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-67   ~ tid, blkI, blkE, one
+    68-99   ~ blkMPQ, tidX, tid1, shiftX, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+    64-72   ~ c<0-1>, z<0-1>, y<0-1>, x<0-1>, Q
+    73-99   ~ mt, pr, qs, r<0-1>, s<0-1>, t<0-1>, rst<0-1>, rs<0-1>
+    73-99   ~ te, ti<0-1>, xw<0-1>, xW<0-1>, yh<0-1>, yH<0-1>, zd<0-1>, zD<0-1>, cC<0-1>, nextP, nextQ
+
+    64-79   : j0Ex<0-7>, j0Iy<0-7>
+    80-95   : j1Ex<0-7>, j1Iy<0-7>
+
+    100-131 : load0I<0-7>,  load1I<0-7>,  loadE<0-7>, storeX<0-7>
+    132-137 : track0I<0-1>, track1I<0-1>, trackE<0-1>
+
+    138-164 ~ writeIs, writeEs, loopN, m, p, q, qq, k, crst<0-1>, tidY
+    165-167 ~ readIs, readEs, swapBuf
+
+     68-83  : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
+     84-164 ~ K, K4, K1, K60, tid31, tid96, kk, tf, writeCs, readCs, crst<00|04|08|12>, alpha, blk_MPQ, CRSTK, xmad_determ
+
+</REGISTER_MAPPING>
+
+// Read the thread id and the three block indices (blkMPQ, blkI, blkE).
+// Which CTAID axis feeds which index depends on param_RST:
+//   param_RST == 1 : blkMPQ = CTAID.Z, blkI = CTAID.X, blkE = CTAID.Y  (CTAID1 path)
+//   otherwise      : blkMPQ = CTAID.X, blkI = CTAID.Y, blkE = CTAID.Z
+// Both paths signal the same barriers (tid:1, blkMPQ:2, blkI:3, blkE:4) so the
+// code after END_CTAID1 can wait on them uniformly.
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
+--:-:-:-:5  @P0 BRA.U CTAID1;
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X;
+--:-:3:-:1      S2R blkI,   SR_CTAID.Y;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
+--:-:-:-:5      BRA.U END_CTAID1;
+CTAID1:
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.Z;
+--:-:3:-:1      S2R blkI,   SR_CTAID.X;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Y;
+END_CTAID1:
+
+<SCHEDULE_BLOCK>
+// One-time setup: derive per-thread indices from tid, clear the accumulators,
+// split blkMPQ into (m, p, q), and compute the shared-memory read/write
+// offsets plus the starting crst / k indices for this thread.
+
+// tidX   = tid >> 1
+// tidY   = (tid & 1) << 3
+// shiftX = (tid & 1) << 4
+01:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      SHR.U32 tidX,   tid,  1;
+--:-:-:-:1      SHL     tidY,   tid1, 3;
+--:-:-:-:1      SHL     shiftX, tid1, 4;
+
+// Store zeros at addr_zero, then load them back into czero00-63 (registers
+// 0-63, the same registers mapped as the cx<0-7>y<0-7> accumulators).
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;
+</CODE>
+
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_grid_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_grid_PQ;
+
+// A magic value of 1 selects the shift-only division path below.
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT;
+
+// m = blkMPQ / PQ
+02:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ % PQ
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = pq / Q
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = pq % Q  (qq keeps the starting q so it can be restored when p advances)
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+--:-:-:-:1      MOV qq, q;
+
+// writeIs = (tidY*128 + tidX + shiftX) * 4 + 4x<szShareI + szShareE>
+--:-:-:-:1      ISCADD writeIs, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeIs, writeIs, shiftX;
+--:-:-:-:1      ISCADD writeIs, writeIs, 4x<szShareI + szShareE>, 2;
+
+// writeEs = (tidY*64 + tidX + shiftX) * 4 + 4x<szShareI*2 + szShareE>
+--:-:-:-:1      ISCADD writeEs, tidY, tidX, 6;
+--:-:-:-:1      IADD   writeEs, writeEs, shiftX;
+--:-:-:-:1      ISCADD writeEs, writeEs, 4x<szShareI*2 + szShareE>, 2;
+
+// readIs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND readIs, tid,   -16;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      LOP.OR  readIs, readIs, tid1;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+// readEs = (((tid >> 1) & 7) << 4) + 4x<szShareI>;
+--:-:-:-:1      BFE.U32 readEs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<szShareI>, 4;
+
+// swapBuf is added to the write offsets and subtracted from the read offsets
+// at each buffer swap, then negated.
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareI + szShareE>;
+
+// crst0 = blkI*128 + tidX, crst1 = crst0 + 64
+04:-:-:-:1      ISCADD crst0, blkI, tidX, 7;
+--:-:-:-:1      IADD   crst1, crst0, 64;
+
+// k = blkE*64 + tidX + offset_K
+08:-:-:-:1      ISCADD k, blkE, tidX, 6;
+--:-:-:-:1      IADD   k, k, param_offset_K;
+
+--:-:-:-:1      MOV loopN, RZ;
+
+// Flag for first load branch
+--:-:-:-:1      PSETP.AND.AND P0, PT, PT, PT, PT;
+</SCHEDULE_BLOCK>
+
+NEXT_PQ:
+
+<SCHEDULE_BLOCK>
+// Decompose the two crst indices handled by this thread (crst0, crst1) into
+// (c, t, r, s) and combine them with the current output position (m, p, Q)
+// to get the input coordinates (z, y, x) for each of the two I tracks.
+
+// Zigzag q but only if grid_P < P
+// Q = (p is odd && grid_P < P) ? param_Q - 1 - q : q
+--:-:-:-:1      LOP.AND.NZ P1, RZ, p, 1;
+--:-:-:-:1      MOV Q, param_grid_P;
+--:-:-:-:1      ISETP.LT.AND P1, PT, Q, param_P, P1;
+--:-:-:-:1      MOV Q, -1;
+--:-:-:-:1  @P1 IADD3 Q, -q, param_Q, Q;
+--:-:-:-:1 @!P1 MOV Q, q;
+// c   = crst / RST
+// rst = crst % RST
+--:-:-:-:1      XMAD.LO2C  c0, crst0, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32    c0, c0, param_shift_RST;
+--:-:-:-:1      XMAD rst0, c0, param_RST, RZ;
+--:-:-:-:1      IADD rst0, -rst0, crst0;
+--:-:-:-:1      XMAD.LO2C  c1, crst1, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32    c1, c1, param_shift_RST;
+--:-:-:-:1      XMAD rst1, c1, param_RST, RZ;
+--:-:-:-:1      IADD rst1, -rst1, crst1;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C  t0, rst0, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32    t0, t0, param_shift_RS;
+--:-:-:-:1      XMAD  rs0, t0, param_RS, RZ;
+--:-:-:-:1      IADD  rs0, -rs0, rst0;
+--:-:-:-:1      XMAD.LO2C  t1, rst1, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32    t1, t1, param_shift_RS;
+--:-:-:-:1      XMAD  rs1, t1, param_RS, RZ;
+--:-:-:-:1      IADD  rs1, -rs1, rst1;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C  r0, rs0, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32    r0, r0, param_shift_S;
+--:-:-:-:1      XMAD   s0, r0, param_S, RZ;
+--:-:-:-:1      IADD   s0, -s0, rs0;
+--:-:-:-:1      XMAD.LO2C  r1, rs1, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32    r1, r1, param_shift_S;
+--:-:-:-:1      XMAD   s1, r1, param_S, RZ;
+--:-:-:-:1      IADD   s1, -s1, rs1;
+// z = m * str_d + t * dil_d - pad_d
+// y = p * str_h + r * dil_h - pad_h
+// x = Q * str_w + s * dil_w - pad_w
+// Track 0 must write z0/y0/x0 and track 1 z1/y1/x1; both tracks scale the
+// filter offset by the dilation and share the strided base (mt, pr, qs).
+--:-:-:-:1      XMAD  mt, m,   param_str_d, RZ;
+--:-:-:-:1      XMAD  pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD  qs, Q,   param_str_w, RZ;
+--:-:-:-:1      XMAD  z1, t1,  param_dil_d, mt;
+--:-:-:-:1      XMAD  y1, r1,  param_dil_h, pr;
+--:-:-:-:1      XMAD  x1, s1,  param_dil_w, qs;
+--:-:-:-:1      XMAD  z0, t0,  param_dil_d, mt;
+--:-:-:-:1      XMAD  y0, r0,  param_dil_h, pr;
+--:-:-:-:1      XMAD  x0, s0,  param_dil_w, qs;
+--:-:-:-:1      IADD  z1, z1, -param_pad_d;
+--:-:-:-:1      IADD  y1, y1, -param_pad_h;
+--:-:-:-:1      IADD  x1, x1, -param_pad_w;
+--:-:-:-:1      IADD  z0, z0, -param_pad_d;
+--:-:-:-:1      IADD  y0, y0, -param_pad_h;
+--:-:-:-:1      IADD  x0, x0, -param_pad_w;
+</SCHEDULE_BLOCK>
+
+// Split blocks to fit inside of 36 registers
+<SCHEDULE_BLOCK>
+
+// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY
+--:-:-:-:1      XMAD.LO2C ti0, c0, param_DHWN, tidY;
+--:-:-:-:1      XMAD.LO2C ti0, z0, param_HWN,  ti0;
+--:-:-:-:1      XMAD.LO2C ti0, y0, param_WN,   ti0;
+--:-:-:-:1      XMAD      ti0, x0, param_N,    ti0;
+--:-:-:-:1      XMAD.LO2C ti1, c1, param_DHWN, tidY;
+--:-:-:-:1      XMAD.LO2C ti1, z1, param_HWN,  ti1;
+--:-:-:-:1      XMAD.LO2C ti1, y1, param_WN,   ti1;
+--:-:-:-:1      XMAD      ti1, x1, param_N,    ti1;
+--:-:-:-:1      LEA      track0I0.CC, ti0, param_I[0],     1;
+--:-:-:-:1      LEA.HI.X track0I1,    ti0, param_I[1], RZ, 1;
+--:-:-:-:1      LEA      track1I0.CC, ti1, param_I[0],     1;
+--:-:-:-:1      LEA.HI.X track1I1,    ti1, param_I[1], RZ, 1;
+
+// trackE = k*MPQN + m*PQN + p*QN + Q*N + tidY
+--:-:-:-:1      XMAD.LO2C te, k, param_MPQN, tidY;
+--:-:-:-:1      XMAD.LO2C te, m, param_PQN,  te;
+--:-:-:-:1      XMAD.LO2C te, p, param_QN,   te;
+--:-:-:-:1      XMAD      te, Q, param_N,    te;
+--:-:-:-:1      LEA      trackE0.CC, te, param_E[0],     1;
+--:-:-:-:0      LEA.HI.X trackE1,    te, param_E[1], RZ, 1;
+
+// Bounds check x,y,z,c for each I track.
+// If out of bounds, this will set the track address to -1
+--:-:-:-:1      ISET.GE.AND cC0, c0, param_C, PT;
+--:-:-:-:1      ISET.LT.AND zd0, z0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD0, z0, param_D, PT;
+--:-:-:-:1      ISET.LT.AND yh0, y0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH0, y0, param_H, PT;
+--:-:-:-:1      ISET.LT.AND xw0, x0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW0, x0, param_W, PT;
+--:-:-:-:1      LOP.OR   track0I0, track0I0, cC0;
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, zd0, zD0, 0xfe;
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, yh0, yH0, 0xfe;
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, xw0, xW0, 0xfe;
+
+--:-:-:-:1      ISET.GE.AND cC1, c1, param_C, PT;
+--:-:-:-:1      ISET.LT.AND zd1, z1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD1, z1, param_D, PT;
+--:-:-:-:1      ISET.LT.AND yh1, y1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH1, y1, param_H, PT;
+--:-:-:-:1      ISET.LT.AND xw1, x1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW1, x1, param_W, PT;
+--:-:-:-:1      LOP.OR   track1I0, track1I0, cC1;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, zd1, zD1, 0xfe;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, yh1, yH1, 0xfe;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, xw1, xW1, 0xfe;
+
+--:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+
+--:-:-:-:1      ISETP.NE.AND P2, PT, track0I0, -1, PT;
+--:-:-:-:0      ISETP.NE.AND P3, PT, track1I0, -1, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, k, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, nextP, param_P, PT;
+
+--:-:-:-:1      IADD loopN, loopN, param_N;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:5  @P0 BRA.U FIRST_LOAD;
+
+INIT_LOOP:
+
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*64  + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*64  + 32>];
+--:-:1:-:2      LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>];
+
+NEXT_16N:
+
+[+
+
+    our $convert;
+    my %insert =
+    (
+        j0c8   => "--:-:-:-:1      IADD loopN, loopN, -16;\n",
+
+        # p0 = (N & 16) == 0
+        # p1 = N >= 32 && p0
+        j0c14   => "--:-:-:-:1      LOP.AND.NZ P0, RZ, loopN, 16;\n",
+        j0c28   => "--:-:-:-:1      ISETP.GE.AND P1, PT, loopN, 32, P0;\n",
+
+        j0c34   => "--:-:-:-:1  \@P0 $convert storeX7, load0I7.H1;\n",
+        j0c38   => "--:-:-:-:1  \@P0 $convert storeX6, load0I7.H0;\n",
+        j0c42   => "--:-:-:-:1  \@P0 $convert storeX5, load0I6.H1;\n",
+        j0c46   => "--:-:-:-:1  \@P0 $convert storeX4, load0I6.H0;\n",
+        j0c50   => "--:-:-:-:1  \@P0 $convert storeX3, load0I5.H1;\n",
+        j0c54   => "--:-:-:-:1  \@P0 $convert storeX2, load0I5.H0;\n",
+        j0c58   => "--:-:-:-:1  \@P0 $convert storeX1, load0I4.H1;\n",
+        j0c62   => "--:-:-:-:1  \@P0 $convert storeX0, load0I4.H0;\n",
+
+        j1c34   => "02:-:-:-:1 \@!P0 $convert storeX7, load0I3.H1;\n",
+        j1c38   => "--:-:-:-:1 \@!P0 $convert storeX6, load0I3.H0;\n",
+        j1c42   => "--:-:-:-:1 \@!P0 $convert storeX5, load0I2.H1;\n",
+        j1c46   => "--:-:5:-:1 \@!P0 $convert storeX4, load0I2.H0;\n",
+        j1c50   => "--:-:-:-:1 \@!P0 $convert storeX3, load0I1.H1;\n",
+        j1c54   => "--:-:-:-:1 \@!P0 $convert storeX2, load0I1.H0;\n",
+        j1c58   => "--:-:-:-:1 \@!P0 $convert storeX1, load0I0.H1;\n",
+        j1c62   => "--:-:2:-:1 \@!P0 $convert storeX0, load0I0.H0;\n",
+
+        j2c8    => "10:-:-:-:1      STS [writeIs + 4x<7*128 +  0>], storeX7;\n",
+        j2c10   => "--:-:-:-:1      STS [writeIs + 4x<6*128 +  0>], storeX6;\n",
+        j2c12   => "--:-:-:-:1      STS [writeIs + 4x<5*128 +  0>], storeX5;\n",
+        j2c14   => "--:-:-:-:1      STS [writeIs + 4x<4*128 +  0>], storeX4;\n",
+        j2c16   => "02:-:-:-:1      STS [writeIs + 4x<3*128 +  0>], storeX3;\n",
+        j2c18   => "--:-:-:-:1      STS [writeIs + 4x<2*128 +  0>], storeX2;\n",
+        j2c20   => "--:-:-:-:1      STS [writeIs + 4x<1*128 +  0>], storeX1;\n",
+        j2c22   => "--:2:-:-:1      STS [writeIs + 4x<0*128 +  0>], storeX0;\n",
+
+        j2c24   => "--:-:-:-:1      ISETP.NE.AND P2, PT, track0I0, -1, P1;\n",
+        j2c26   => "--:-:-:-:1      ISETP.EQ.AND P3, PT, track0I0, -1, P1;\n",
+
+        j3c8    => "02:-:-:-:1  \@P2 LDG.E.CI.128 load0I0, [track0I + 2x< 0>];\n",
+        j3c10   => "--:5:2:-:1  \@P2 LDG.E.CI.128 load0I4, [track0I + 2x<16>];\n",
+
+        j4c8    => "--:-:-:-:1  \@P3 LDS.U.128 load0I0, [addr_zero];\n",
+        j5c8    => "--:-:-:-:1  \@P3 LDS.U.128 load0I4, [addr_zero];\n",
+
+        j5c57   => "10:-:-:-:1  \@P2 IADD   track0I0.CC, track0I0, 2x<32>;\n",
+        j5c63   => "--:-:-:-:1  \@P2 IADD.X track0I1,    track0I1, RZ;\n",
+
+
+        j5c34   => "--:-:-:-:1  \@P0 $convert storeX7, load1I7.H1;\n",
+        j5c38   => "--:-:-:-:1  \@P0 $convert storeX6, load1I7.H0;\n",
+        j5c42   => "--:-:-:-:1  \@P0 $convert storeX5, load1I6.H1;\n",
+        j5c46   => "--:-:-:-:1  \@P0 $convert storeX4, load1I6.H0;\n",
+        j5c50   => "--:-:-:-:1  \@P0 $convert storeX3, load1I5.H1;\n",
+        j5c54   => "--:-:-:-:1  \@P0 $convert storeX2, load1I5.H0;\n",
+        j5c58   => "--:-:-:-:1  \@P0 $convert storeX1, load1I4.H1;\n",
+        j5c62   => "--:-:-:-:1  \@P0 $convert storeX0, load1I4.H0;\n",
+
+        j6c34   => "04:-:-:-:1 \@!P0 $convert storeX7, load1I3.H1;\n",
+        j6c38   => "--:-:-:-:1 \@!P0 $convert storeX6, load1I3.H0;\n",
+        j6c42   => "--:-:-:-:1 \@!P0 $convert storeX5, load1I2.H1;\n",
+        j6c46   => "--:-:5:-:1 \@!P0 $convert storeX4, load1I2.H0;\n",
+        j6c50   => "--:-:-:-:1 \@!P0 $convert storeX3, load1I1.H1;\n",
+        j6c54   => "--:-:-:-:1 \@!P0 $convert storeX2, load1I1.H0;\n",
+        j6c58   => "--:-:-:-:1 \@!P0 $convert storeX1, load1I0.H1;\n",
+        j6c62   => "--:-:3:-:1 \@!P0 $convert storeX0, load1I0.H0;\n",
+
+        j7c8    => "10:-:-:-:1      STS [writeIs + 4x<7*128 + 64>], storeX7;\n",
+        j7c10   => "--:-:-:-:1      STS [writeIs + 4x<6*128 + 64>], storeX6;\n",
+        j7c12   => "--:-:-:-:1      STS [writeIs + 4x<5*128 + 64>], storeX5;\n",
+        j7c14   => "--:-:-:-:1      STS [writeIs + 4x<4*128 + 64>], storeX4;\n",
+        j7c16   => "04:-:-:-:1      STS [writeIs + 4x<3*128 + 64>], storeX3;\n",
+        j7c18   => "--:-:-:-:1      STS [writeIs + 4x<2*128 + 64>], storeX2;\n",
+        j7c20   => "--:-:-:-:1      STS [writeIs + 4x<1*128 + 64>], storeX1;\n",
+        j7c22   => "--:3:-:-:1      STS [writeIs + 4x<0*128 + 64>], storeX0;\n",
+
+        j7c24   => "--:-:-:-:1      ISETP.NE.AND P2, PT, track1I0, -1, P1;\n",
+        j7c26   => "--:-:-:-:1      ISETP.EQ.AND P3, PT, track1I0, -1, P1;\n",
+
+        j8c8    => "04:-:-:-:1  \@P2 LDG.E.CI.128 load1I0, [track1I + 2x< 0>];\n",
+        j8c10   => "--:5:3:-:1  \@P2 LDG.E.CI.128 load1I4, [track1I + 2x<16>];\n",
+
+        j9c8    => "--:-:-:-:1  \@P3 LDS.U.128 load1I0, [addr_zero];\n",
+        j10c8   => "--:-:-:-:1  \@P3 LDS.U.128 load1I4, [addr_zero];\n",
+
+        j10c57  => "10:-:-:-:1  \@P2 IADD   track1I0.CC, track1I0, 2x<32>;\n",
+        j10c63  => "--:-:-:-:1  \@P2 IADD.X track1I1,    track1I1, RZ;\n",
+
+
+        j10c34  => "--:-:-:-:1  \@P0 $convert storeX7, loadE7.H1;\n",
+        j10c38  => "--:-:-:-:1  \@P0 $convert storeX6, loadE7.H0;\n",
+        j10c42  => "--:-:-:-:1  \@P0 $convert storeX5, loadE6.H1;\n",
+        j10c46  => "--:-:-:-:1  \@P0 $convert storeX4, loadE6.H0;\n",
+        j10c50  => "--:-:-:-:1  \@P0 $convert storeX3, loadE5.H1;\n",
+        j10c54  => "--:-:-:-:1  \@P0 $convert storeX2, loadE5.H0;\n",
+        j10c58  => "--:-:-:-:1  \@P0 $convert storeX1, loadE4.H1;\n",
+        j10c62  => "--:-:-:-:1  \@P0 $convert storeX0, loadE4.H0;\n",
+
+        j11c34  => "08:-:-:-:1 \@!P0 $convert storeX7, loadE3.H1;\n",
+        j11c38  => "--:-:-:-:1 \@!P0 $convert storeX6, loadE3.H0;\n",
+        j11c42  => "--:-:-:-:1 \@!P0 $convert storeX5, loadE2.H1;\n",
+        j11c46  => "--:-:5:-:1 \@!P0 $convert storeX4, loadE2.H0;\n",
+        j11c50  => "--:-:-:-:1 \@!P0 $convert storeX3, loadE1.H1;\n",
+        j11c54  => "--:-:-:-:1 \@!P0 $convert storeX2, loadE1.H0;\n",
+        j11c58  => "--:-:-:-:1 \@!P0 $convert storeX1, loadE0.H1;\n",
+        j11c62  => "--:-:4:-:1 \@!P0 $convert storeX0, loadE0.H0;\n",
+
+        j12c8   => "10:-:-:-:1      STS [writeEs + 4x<7*64>], storeX7;\n",
+        j12c10  => "--:-:-:-:1      STS [writeEs + 4x<6*64>], storeX6;\n",
+        j12c12  => "--:-:-:-:1      STS [writeEs + 4x<5*64>], storeX5;\n",
+        j12c14  => "--:-:-:-:1      STS [writeEs + 4x<4*64>], storeX4;\n",
+        j12c16  => "08:-:-:-:1      STS [writeEs + 4x<3*64>], storeX3;\n",
+        j12c18  => "--:-:-:-:1      STS [writeEs + 4x<2*64>], storeX2;\n",
+        j12c20  => "--:-:-:-:1      STS [writeEs + 4x<1*64>], storeX1;\n",
+        j12c22  => "--:4:-:-:1      STS [writeEs + 4x<0*64>], storeX0;\n",
+
+        j12c24  => "--:-:-:-:1      ISETP.LT.AND P2, PT, k, param_K,  P1;\n",
+
+        j13c8   => "08:-:-:-:1  \@P2 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];\n",
+        j13c10  => "--:5:4:-:1  \@P2 LDG.E.CI.128 loadE4, [trackE + 2x<16>];\n",
+
+        j15c57  => "10:-:-:-:1  \@P2 IADD   trackE0.CC, trackE0, 2x<32>;\n",
+        j15c62  => "--:-:-:-:1  \@P2 IADD.X trackE1,    trackE1, RZ;\n",
+
+        # p0 = N >= 16 and not (N == 32 and (p or q))
+        j14c8   => "--:-:-:-:1      ISETP.EQ.AND  P0, PT, loopN, 32, PT;\n",
+        j14c10  => "--:-:-:-:1      ISETP.GE.AND  P1, PT, loopN, 16, PT;\n",
+        j14c22  => "--:-:-:-:1      PSETP.OR.AND  P0, PT, P5, P6, P0;\n",
+        j14c35  => "--:-:-:-:1      PSETP.AND.AND P0, PT, !P0, P1, PT;\n",
+
+        j14c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                   "20:-:-:-:1      IADD readEs,  readEs, -swapBuf;\n" .
+                   "--:-:-:-:1      IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1      IADD writeEs, writeEs, swapBuf;\n" .
+                   "--:-:-:-:1      IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j15c63  => "--:-:-:Y:5  \@P0 BRA.U NEXT_16N;\n" .
+                   "--:-:-:-:0  \@P5 IADD q, q, param_grid_Q;\n" .
+                   "01:-:-:Y:5  \@P5 BRA.U NEXT_PQ;\n" .
+                   "--:-:-:-:1  \@P6 MOV  q, qq;\n" .
+                   "--:-:-:-:0  \@P6 IADD p, p, param_grid_P;\n" .
+                   "--:-:-:Y:5  \@P6 BRA.U NEXT_PQ;\n" .
+                   "--:-:-:Y:5      BRA.U FINISH;\n",
+    );
+
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    my $out;
+    foreach my $j (0 .. 15)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = 1 - $odd;
+        my $rsOffset = ($j + 1) & 15;
+        my $rsPred   = $j == 15 ? '@P0' : '   ';
+        my $shift    = $rsOffset < 8 ? 0 : 1;
+        my $barrier  = $j == 14 ? '6' : '-';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64  + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64  + 32 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c6"} = sprintf "--:%s:1:-:1  %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /LDS|I2I|F2F|I2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
++]
+
+FIRST_LOAD:
+
+--:-:-:-:8      PSETP.AND.AND P0, PT, PT, PT, !PT;
+
+--:-:-:-:1  @P2 LDG.E.CI.128 load0I0, [track0I + 2x< 0>];
+--:-:1:-:1  @P2 LDG.E.CI.128 load0I4, [track0I + 2x<16>];
+--:-:-:-:1 @!P2 LDS.U.128    load0I0, [addr_zero];
+--:-:4:-:1 @!P2 LDS.U.128    load0I4, [addr_zero];
+
+// p1 = N == 32 and (p or q)
+--:-:-:-:0      ISETP.EQ.AND  P1, PT, loopN, 32, PT;
+
+--:-:-:-:1  @P3 LDG.E.CI.128 load1I0, [track1I + 2x< 0>];
+--:-:2:-:1  @P3 LDG.E.CI.128 load1I4, [track1I + 2x<16>];
+--:-:-:-:1 @!P3 LDS.U.128    load1I0, [addr_zero];
+--:-:5:-:1 @!P3 LDS.U.128    load1I4, [addr_zero];
+
+--:-:-:-:1  @P4 LDG.E.CI.128 loadE0, [trackE + 2x< 0>];
+--:-:3:-:1  @P4 LDG.E.CI.128 loadE4, [trackE + 2x<16>];
+--:-:-:-:1 @!P4 LDS.U.128    loadE0, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128    loadE4, [addr_zero];
+
+
+09:-:-:-:1      [+ convert() +] storeX7, load0I3.H1;
+--:-:-:-:1      [+ convert() +] storeX6, load0I3.H0;
+--:-:-:-:1      [+ convert() +] storeX5, load0I2.H1;
+--:-:1:-:1      [+ convert() +] storeX4, load0I2.H0;
+--:-:-:-:1      [+ convert() +] storeX3, load0I1.H1;
+--:-:-:-:1      [+ convert() +] storeX2, load0I1.H0;
+--:-:-:-:1      [+ convert() +] storeX1, load0I0.H1;
+--:-:4:-:1      [+ convert() +] storeX0, load0I0.H0;
+
+--:-:-:-:1      PSETP.OR.AND  P1, PT, P5, P6, P1;
+--:-:-:-:0  @P2 IADD   track0I0.CC, track0I0, 2x<32>;
+
+01:-:-:-:1      STS [writeIs + 4x<7*128 +  0>], storeX7;
+--:-:-:-:1      STS [writeIs + 4x<6*128 +  0>], storeX6;
+--:-:-:-:1      STS [writeIs + 4x<5*128 +  0>], storeX5;
+--:-:-:-:1      STS [writeIs + 4x<4*128 +  0>], storeX4;
+08:-:-:-:1      STS [writeIs + 4x<3*128 +  0>], storeX3;
+--:-:-:-:1      STS [writeIs + 4x<2*128 +  0>], storeX2;
+--:-:-:-:1      STS [writeIs + 4x<1*128 +  0>], storeX1;
+--:1:-:-:2      STS [writeIs + 4x<0*128 +  0>], storeX0;
+
+--:-:-:-:0  @P2 IADD.X track0I1,    track0I1, RZ;
+
+13:-:-:-:1      [+ convert() +] storeX7, load1I3.H1;
+--:-:-:-:1      [+ convert() +] storeX6, load1I3.H0;
+--:-:-:-:1      [+ convert() +] storeX5, load1I2.H1;
+--:-:2:-:1      [+ convert() +] storeX4, load1I2.H0;
+--:-:-:-:1      [+ convert() +] storeX3, load1I1.H1;
+--:-:-:-:1      [+ convert() +] storeX2, load1I1.H0;
+--:-:-:-:1      [+ convert() +] storeX1, load1I0.H1;
+--:-:5:-:1      [+ convert() +] storeX0, load1I0.H0;
+
+--:-:-:-:1      PSETP.AND.AND P5, PT, P5, P1, PT;
+--:-:-:-:0  @P3 IADD   track1I0.CC, track1I0, 2x<32>;
+
+02:-:-:-:1      STS [writeIs + 4x<7*128 + 64>], storeX7;
+--:-:-:-:1      STS [writeIs + 4x<6*128 + 64>], storeX6;
+--:-:-:-:1      STS [writeIs + 4x<5*128 + 64>], storeX5;
+--:-:-:-:1      STS [writeIs + 4x<4*128 + 64>], storeX4;
+10:-:-:-:1      STS [writeIs + 4x<3*128 + 64>], storeX3;
+--:-:-:-:1      STS [writeIs + 4x<2*128 + 64>], storeX2;
+--:-:-:-:1      STS [writeIs + 4x<1*128 + 64>], storeX1;
+--:1:-:-:1      STS [writeIs + 4x<0*128 + 64>], storeX0;
+
+--:-:-:-:1      PSETP.AND.AND P6, PT, P6, P1, PT;
+--:-:-:-:0  @P3 IADD.X track1I1,    track1I1, RZ;
+
+25:-:-:-:1      [+ convert() +] storeX7, loadE3.H1;
+--:-:-:-:1      [+ convert() +] storeX6, loadE3.H0;
+--:-:-:-:1      [+ convert() +] storeX5, loadE2.H1;
+--:-:3:-:1      [+ convert() +] storeX4, loadE2.H0;
+--:-:-:-:1      [+ convert() +] storeX3, loadE1.H1;
+--:-:-:-:1      [+ convert() +] storeX2, loadE1.H0;
+--:-:-:-:1      [+ convert() +] storeX1, loadE0.H1;
+--:-:6:-:1      [+ convert() +] storeX0, loadE0.H0;
+
+--:-:-:-:0  @P4 IADD   trackE0.CC, trackE0, 2x<32>;
+
+04:-:-:-:1      STS [writeEs + 4x<7*64>], storeX7;
+--:-:-:-:1      STS [writeEs + 4x<6*64>], storeX6;
+--:-:-:-:1      STS [writeEs + 4x<5*64>], storeX5;
+--:-:-:-:1      STS [writeEs + 4x<4*64>], storeX4;
+20:-:-:-:1      STS [writeEs + 4x<3*64>], storeX3;
+--:-:-:-:1      STS [writeEs + 4x<2*64>], storeX2;
+--:-:-:-:1      STS [writeEs + 4x<1*64>], storeX1;
+--:1:-:-:1      STS [writeEs + 4x<0*64>], storeX0;
+
+--:-:-:-:1  @P4 IADD.X trackE1,    trackE1, RZ;
+
+--:-:-:-:1      IADD readEs,  readEs, -swapBuf;
+--:-:-:-:0      IADD readIs,  readIs, -swapBuf;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeEs, writeEs, swapBuf;
+--:-:-:-:1      IADD writeIs, writeIs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;
+
+--:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+
+--:-:-:-:0  @P5 IADD q, q, param_grid_Q;
+--:-:-:Y:5  @P5 BRA.U NEXT_PQ;
+--:-:-:-:0  @P6 IADD p, p, param_grid_P;
+--:-:-:Y:5  @P6 BRA.U NEXT_PQ;
+
+--:-:-:-:2      ISETP.LT.AND P5, PT, nextQ, param_Q, PT;
+--:-:-:-:0      ISETP.LT.AND P6, PT, nextP, param_P, PT;
+
+--:-:-:Y:5      BRA.U INIT_LOOP;
+
+
+FINISH:
+
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT;
+--:-:-:-:5  @P0 BRA.U CTAID2;
+--:-:2:-:1      S2R blkI,    SR_CTAID.Y;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Z;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.X;
+--:-:-:-:5      BRA.U END_CTAID2;
+CTAID2:
+--:-:2:-:1      S2R blkI,    SR_CTAID.X;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Y;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.Z;
+END_CTAID2:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readEs,  readEs, -4x<szShareI>;
+--:-:-:-:1  @P0 IADD readIs,  readIs, -swapBuf;
+--:-:-:-:1  @P0 IADD readEs,  readEs, -swapBuf;
+
+// writeCs = (readIs / 4) * 64 + readEs;
+--:-:-:-:1      ISCADD  writeCs, readIs, readEs, 4;
+
+
+// readCs = ((tid & 96) << 3) | (tid & 31)
+01:-:-:-:1      LOP.AND tid31, tid, 31;
+01:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 3;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+
+// kk = blkE*64 + tid31;
+04:-:-:-:1      ISCADD kk, blkE, tid31, 6;
+--:-:-:-:1      IADD   kk, kk, param_offset_K;
+
+// crst = blkI*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 crst00, tid96,  1;
+02:-:-:-:1      ISCADD  crst00, blkI,   crst00, 7;
+--:-:-:-:1      IADD    crst04, crst00, 4;
+--:-:-:-:1      IADD    crst08, crst00, 8;
+--:-:-:-:1      IADD    crst12, crst00, 12;
+
+--:-:-:-:1      MOV K, param_K;
+--:-:-:-:1      SHL K1, K, 2;
+--:-:-:-:1      SHL K4, K, 4;
+--:-:-:-:1      ISCADD K60, K, -K4, 8;
+
+// trackF += crst*K + k;
+--:-:-:-:1      VMAD.U16.U16 tf, crst00, K, kk;
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+--:-:-:-:1      MOV CRSTK, param_CRSTK;
+08:-:-:-:1      XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ;
+        };
+    }
+    return '';
++]
+--:-:-:-:1      LEA      track00F0.CC, tf, param_F[0],     2;
+--:-:-:-:1      LEA.HI.X track00F1,    tf, param_F[1], RZ, 2;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+// kk < K
+--:-:-:-:1      ISETP.LT.AND P5, PT, kk, param_K, PT;
+--:-:-:-:1      IADD kk, kk, 32;
+--:-:-:-:1      ISETP.LT.AND P6, PT, kk, param_K, PT;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:6      IADD   track04F0.CC, track00F0, K4;
+--:-:-:-:1      IADD.X track04F1,    track00F1, RZ;
+--:-:-:-:6      IADD   track08F0.CC, track04F0, K4;
+--:-:-:-:1      IADD.X track08F1,    track04F1, RZ;
+--:-:-:-:6      IADD   track12F0.CC, track08F0, K4;
+--:-:-:-:1      IADD.X track12F1,    track08F1, RZ;
+
+<CODE>
+
+    # Emit the epilogue: for each of the 8 accumulator rows, scale the row by
+    # alpha into f0-f7 and call STORE_C.  Just before row 4 the four output
+    # pointers are advanced by K60 and their crst counters by 60.
+    my @tracks = qw(00 04 08 12);
+    my $out;
+    for my $row (0 .. 7)
+    {
+        if ($row == 4)
+        {
+            for my $t (@tracks)
+            {
+                $out .= "--:-:-:-:5      IADD   track${t}F0.CC, track${t}F0, K60;\n";
+                $out .= "--:-:-:-:1      IADD   crst${t},       crst${t},     60;\n";
+                $out .= "--:-:-:-:1      IADD.X track${t}F1,    track${t}F1,  RZ;\n";
+            }
+            $out .= "\n";
+        }
+
+        for my $col (0 .. 7)
+        {
+            # The last FMUL of each row carries stall 0, the others stall 1.
+            my $stall = $col == 7 ? 0 : 1;
+            $out .= sprintf "--:-:-:-:%d      FMUL f%d, cx%dy%d, alpha;\n", $stall, $col, $col, $row;
+        }
+
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K
+--:-:-:-:1      IADD         crst00, crst00, 1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K
+--:-:-:-:1      IADD         crst04, crst04, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K
+--:-:-:-:1      IADD         crst08, crst08, 1;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K
+--:-:-:-:0      IADD         crst12, crst12, 1;
+
+// Warp shuffle to drop the awkward readAs/readBs mapping
+--:-:-:-:1      STS.128 [writeCs+4x<00>], f0;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], f4;
+
+--:-:1:-:1      LDS f0, [readCs + 4x<0*64 + 00>];
+--:-:2:-:1      LDS f2, [readCs + 4x<1*64 + 00>];
+--:-:3:-:1      LDS f4, [readCs + 4x<2*64 + 00>];
+--:-:4:-:1      LDS f6, [readCs + 4x<3*64 + 00>];
+
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+01:-:-:-:1  @P0 STG.E.CG [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 STG.E.CG [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 STG.E.CG [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 STG.E.CG [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
+    else
+    {
+        return q{
+01:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
++]
+
+--:-:1:-:1      LDS f1, [readCs + 4x<0*64 + 32>];
+--:-:2:-:1      LDS f3, [readCs + 4x<1*64 + 32>];
+--:-:3:-:1      LDS f5, [readCs + 4x<2*64 + 32>];
+--:-:4:-:1      LDS f7, [readCs + 4x<3*64 + 32>];
+
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+01:1:-:-:1  @P0 STG.E.CG [track00F + 4x<32>], f1;
+02:2:-:-:1  @P1 STG.E.CG [track04F + 4x<32>], f3;
+04:3:-:-:1  @P2 STG.E.CG [track08F + 4x<32>], f5;
+08:4:-:-:1  @P3 STG.E.CG [track12F + 4x<32>], f7;
+        };
+    }
+    else
+    {
+        return q{
+01:1:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<32>], f1;
+02:2:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<32>], f3;
+04:3:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<32>], f5;
+08:4:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<32>], f7;
+        };
+    }
++]
+
+01:-:-:-:6      IADD   track00F0.CC, track00F0, K1;
+--:-:-:-:1      IADD.X track00F1,    track00F1, RZ;
+02:-:-:-:6      IADD   track04F0.CC, track04F0, K1;
+--:-:-:-:1      IADD.X track04F1,    track04F1, RZ;
+04:-:-:-:6      IADD   track08F0.CC, track08F0, K1;
+--:-:-:-:1      IADD.X track08F1,    track08F1, RZ;
+08:-:-:-:6      IADD   track12F0.CC, track12F0, K1;
+--:-:-:-:0      IADD.X track12F1,    track12F1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/Convolution/Pascal/hconv_xprop_X128_N128.sass b/Kernel/Convolution/Pascal/hconv_xprop_X128_N128.sass
new file mode 100644
index 0000000..71bae4b
--- /dev/null
+++ b/Kernel/Convolution/Pascal/hconv_xprop_X128_N128.sass
@@ -0,0 +1,261 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $int16;  # NOTE(review): only declared here, presumably set by the build driver before assembly, confirm
+    our $prefix = 'h';  # NOTE(review): not read in this file, presumably consumed by xconv_xprop_common.sass
+    our $shareI = 128;  # same 128 as in szShareI : (128*8) below
+    our $shareF = 128;  # same 128 as in szShareF : (128*8) below
+    our $stepI  = 32;
+    our $stepF  = 64;
+    our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';  # S16 to F32 opcode when $int16 is true, F16 to F32 otherwise
+    sub convert {return $convert;}  # returns the opcode chosen above, used by the inline convert() expansions in this file
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<128*8*2 + 128*8*2 + 0>
+    szShareF  : (128*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<128*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<128*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<128*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<128*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<128*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<128*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<128*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<128*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne
+     72-111 ~ tid1, tid128, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+
+    100-103 : loadI<0-1>, loadF<0-1>
+    104-107 : storeI<0-3>
+    104-107 : storeF<0-3>
+
+    108-111 ~ offsetF, offsetI, offsetFc, offsetIc
+
+    112-113 : sliceI, sliceF
+    112-113 : sliceIF<0-1>
+
+    114-122 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset
+    123-127 ~ readFs, readIs, tid, idx_N
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-122  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;  // thread index x, signalled on barrier 1
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;  // block index x, barrier 2
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;  // block index y, barrier 3
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;  // block index z, barrier 4
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;  // P0 = (tid >= 32), after waiting on barrier 1
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidX = (tid & 31) << 2
+// tidY = tid >> 5
+--:-:-:-:1      LOP.AND tidX, tid,  31;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  5;
+
+// offsetFk = idx_K*128 + tidX  (added into trackF later by the j2c29/j2c34 loop inserts)
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 7;
+
+// offsetIn = idx_N*128 + tidX  (added into trackI later by the j2c36/j6c54 loop inserts)
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 7;  // 08 waits on barrier 4 (idx_N)
+
+// writeS = (128*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+
+// readFs = (((tid & 112) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    112;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = ((((tid & 128) >> 3) | ((tid >> 1) & 7)) << 4) + 4*szShareF
+--:-:-:-:1      LOP.AND tid128, tid,    128;
+--:-:-:-:1      SHR.U32 tid128, tid128, 3;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid128;
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.64 loadF, [trackF];  // P1 set: fetch 64 bits of F (four 16-bit values, see the .H0/.H1 conversions below)
+--:-:5:-:1 @!P1 LDS.U.64    loadF, [addr_zero];  // P1 clear: read 64 bits from addr_zero instead
+
+--:-:3:-:1  @P1 LDG.E.64    loadI, [trackI];  // same pattern for I, barriers 3 and 6
+--:-:6:-:1 @!P1 LDS.U.64    loadI, [addr_zero];
+
+12:-:-:-:1      [+ convert() +] storeF3, loadF1.H1;  // 12 = wait on barriers 2 and 5, i.e. whichever F load ran
+--:-:-:-:1      [+ convert() +] storeF2, loadF1.H0;
+--:-:-:-:1      [+ convert() +] storeF1, loadF0.H1;
+--:-:2:-:2      [+ convert() +] storeF0, loadF0.H0;
+
+02:1:-:-:2      STS.128 [writeS], storeF;  // store the four converted F values at writeS, sets read barrier 1
+
+25:-:-:-:1      [+ convert() +] storeI3, loadI1.H1;  // 25 = wait on barriers 1, 3 and 6. storeI shares registers 104-107 with storeF, which the STS above reads
+--:-:-:-:1      [+ convert() +] storeI2, loadI1.H0;
+--:-:-:-:1      [+ convert() +] storeI1, loadI0.H1;
+--:-:2:-:2      [+ convert() +] storeI0, loadI0.H0;
+
+02:1:-:-:1      STS.128 [writeS + 4x<szShareF>], storeI;  // I values go 4*szShareF past writeS
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.64 loadF, [trackF];  // issue the next F and I loads ahead of the LOOP label
+--:-:3:-:1  @P1 LDG.E.64    loadI, [trackI];
+
+[-
+    our $convert;  # re-declares the package variable assigned in the header block so the strings below can interpolate it
+    our %insert =  # NOTE(review): keys look like j<step>c<slot> positions consumed by main_loop(), which is defined outside this file, confirm there
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",  # P1 = (posCRST >= 0)
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",  # P0 = (posCRST >= -8)
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",  # posCRSTf = float(posCRST)
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",  # channel = posCRSTf * lutSizeRcp
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",  # channel += channel * 2^-24, nudging the quotient up before truncation
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",  # channel = trunc(channel)
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",  # lutOffset = posCRST - channel * lutSize
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",  # times 8: the entry read below is 64 bits wide
+
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",  # sliceI and sliceF come from one 64-bit table entry
+
+        j1c33 => "02:-:-:-:1  \@P0 $convert storeF3, loadF1.H1;\n",  # 02 = wait on barrier 2, the F load
+        j1c37 => "--:-:-:-:1  \@P0 $convert storeF2, loadF1.H0;\n",
+        j1c41 => "--:-:-:-:1  \@P0 $convert storeF1, loadF0.H1;\n",
+        j1c45 => "--:-:2:-:1  \@P0 $convert storeF0, loadF0.H0;\n",
+
+        j1c60 => "02:2:-:-:1  \@P0 STS.128 [writeS], storeF;\n",
+
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",  # with the line above: channel times the low and high halves of param_DHWN
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",  # posCRST -= 8, not predicated
+
+        j2c29 => "20:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",  # offsetF = offsetFk + offsetFc + sliceF, 20 waits on the sliceIF load
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     1;\n",  # trackF = param_F + offsetF * 2, low word
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",  # offsetI = offsetIn + offsetIc + sliceI
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;\n",  # high word, with the carry from j2c34
+
+        j2c40 => "02:-:2:-:1  \@P1 LDG.E.CI.64 loadF, [trackF];\n",
+
+
+        j5c45 => "04:-:-:-:1  \@P0 $convert storeI3, loadI1.H1;\n",  # 04 = wait on barrier 3, the I load
+        j5c49 => "--:-:-:-:1  \@P0 $convert storeI2, loadI1.H0;\n",
+        j5c53 => "--:-:-:-:1  \@P0 $convert storeI1, loadI0.H1;\n",
+        j5c57 => "--:-:3:-:1  \@P0 $convert storeI0, loadI0.H0;\n",
+
+        j6c8  => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<szShareF>], storeI;\n",
+
+        j6c54 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     1;\n",  # trackI = param_I + offsetI * 2, low word
+        j6c59 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;\n",
+
+        j6c61 => "04:-:3:-:1  \@P1 LDG.E.64 loadI, [trackI];\n",
+
+        j6c62 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .  # barrier, then XOR all three shared pointers with 4*(szShareF + szShareI)
+                 "--:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<szShareF + szShareI>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<szShareF + szShareI>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<szShareF + szShareI>;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",  # branch back to LOOP while P0 holds
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];  // reloads registers 64-67 (mpqk0-3), which the register map also assigns to j0Ix0-3. Sets barrier 1
+
+<SCHEDULE_BLOCK>
+
+// tidOX = ((tid & 7) << 3) | ((tid & 128) >> 1)
+// tidOY = (tid & 127) >> 3
+--:-:-:-:1      LOP.AND tidOX,  tid,    7;
+--:-:-:-:1      SHL     tidOX,  tidOX,  3;
+--:-:-:-:1      LOP.AND tidOX2, tid,    128;
+--:-:-:-:1      SHR.U32 tidOX2, tidOX2, 1;
+--:-:-:-:1      LOP.OR  tidOX,  tidOX,  tidOX2;
+--:-:-:-:1      LOP.AND tidOY,  tid,    127;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY,  3;
+
+--:-:-:-:1      LOP.AND readIs, readIs, 0x1ff;  // keep bits 0-8 only, dropping the 4*szShareF base and any XOR applied by the j6c62 insert
+--:-:-:-:1      LOP.AND readFs, readFs, 0x0ff;  // keep bits 0-7 only, dropping any XOR applied by the j6c62 insert
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*128 + tidOX;
+--:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*128 + tidOY * 4
+--:-:-:-:1      SHL tidOY, tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 7;  // 01 waits for the mpqk load above: idx_K is register 67, the same register as mpqk3
+
+[+ output_setup(63, 1, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
diff --git a/Kernel/Convolution/Pascal/hconv_xprop_X128_N64.sass b/Kernel/Convolution/Pascal/hconv_xprop_X128_N64.sass
new file mode 100644
index 0000000..ce64717
--- /dev/null
+++ b/Kernel/Convolution/Pascal/hconv_xprop_X128_N64.sass
@@ -0,0 +1,284 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $int16;  # NOTE(review): only declared here, presumably set by the build driver before assembly, confirm
+    our $prefix = 'h';  # NOTE(review): not read in this file, presumably consumed by xconv_xprop_common.sass
+    our $shareI = 64;  # same 64 as in szShareI : (64*8) below
+    our $shareF = 128;  # same 128 as in szShareF : (128*8) below
+    our $stepI  = 32;
+    our $stepF  = 64;
+    our $remapF = 1;  # NOTE(review): not read in this file, presumably tells the shared include about the FX remap done in the setup block, confirm
+    our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';  # S16 to F32 opcode when $int16 is true, F16 to F32 otherwise
+    sub convert {return $convert;}  # returns the opcode chosen above, used by the inline convert() expansions in this file
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<64*8*2 + 128*8*2 + 0>
+    szShareF  : (128*8)
+    szShareI  : (64*8)
+
+    addr_zero  : 4x<64*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-109 ~ tid1, tid15, tidFX, tidIX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-109 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+
+    100-103 : loadF<0-3>
+    100-103 : storeF<0-3>
+    104-107 : storeF<4-7>
+
+    108-109 : loadI<0-1>
+    104-107 : storeI<0-3>
+
+    104-107 ~ offsetF
+
+    110-111 : sliceI, sliceF
+    110-111 : sliceIF<0-1>
+
+    112-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetIc, offsetFc
+    125-127 ~ readFs, readIs, swapBuf
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-124  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;  // thread index x, signalled on barrier 1
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;  // block index x, barrier 2
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;  // block index y, barrier 3
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;  // block index z, barrier 4
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;  // P0 = (tid >= 32), after waiting on barrier 1
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidFX = (tid & 15) << 3
+// tidIX = (tid & 15) << 2
+// tidY = tid >> 4
+--:-:-:-:1      LOP.AND tid15, tid,  15;
+--:-:-:-:1      SHL     tidFX, tid15, 3;
+--:-:-:-:1      SHL     tidIX, tid15, 2;
+--:-:-:-:1      SHR.U32 tidY,  tid,   4;
+
+// offsetFk = idx_K*128 + tidFX  (added into trackF later by the j2c29/j2c34 loop inserts)
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidFX, 7;
+
+// offsetIn = idx_N*64 + tidIX  (added into trackI later by the j2c36/j6c55 loop inserts)
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidIX, 6;  // 08 waits on barrier 4 (idx_N)
+
+// Remap the FX dim to avoid bank conflicts when storing to shared
+
+// writeFs = (128*tidY + tidIX) * 4  (tidIX rather than tidFX: the eight F values are stored as two groups of four, the second at +4*64)
+--:-:-:-:1      ISCADD  writeFs, tidY, tidIX, 7;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// writeIs = (64*tidY + tidIX) * 4 + 4*szShareF
+--:-:-:-:1      ISCADD  writeIs, tidY, tidIX, 6;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;  // swapBuf = 4*(szShareF + szShareI), its sign is flipped by the j6c63 loop insert
+
+// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = (((tid >> 1) & 7) << 4) + 4*szShareF
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF, [trackF];  // P1 set: fetch 128 bits of F (eight 16-bit values, see the .H0/.H1 conversions below)
+--:-:5:-:1 @!P1 LDS.U.128    loadF, [addr_zero];  // P1 clear: read 128 bits from addr_zero instead
+
+--:-:3:-:1  @P1 LDG.E.64 loadI, [trackI];  // same pattern for I with 64 bits, barriers 3 and 6
+--:-:6:-:1 @!P1 LDS.U.64 loadI, [addr_zero];
+
+12:-:-:-:1      [+ convert() +] storeF7, loadF3.H1;  // 12 = wait on barriers 2 and 5. Descending order matters: storeF0-3 are the same registers (100-103) as loadF0-3
+--:-:-:-:1      [+ convert() +] storeF6, loadF3.H0;
+--:-:-:-:1      [+ convert() +] storeF5, loadF2.H1;
+--:-:1:-:1      [+ convert() +] storeF4, loadF2.H0;
+--:-:-:-:1      [+ convert() +] storeF3, loadF1.H1;
+--:-:-:-:1      [+ convert() +] storeF2, loadF1.H0;
+--:-:-:-:1      [+ convert() +] storeF1, loadF0.H1;
+--:-:2:-:1      [+ convert() +] storeF0, loadF0.H0;
+
+01:-:-:-:1      STS.128 [writeFs + 4x<64>], storeF4;  // waits on barrier 1 (storeF4 conversion), upper four values go 4*64 past writeFs
+02:1:-:-:2      STS.128 [writeFs + 4x<00>], storeF0;  // waits on barrier 2 (storeF0 conversion), sets read barrier 1
+
+25:-:-:-:1      [+ convert() +] storeI3, loadI1.H1;  // 25 = wait on barriers 1, 3 and 6. storeI0-3 share registers 104-107 with storeF4-7
+--:-:-:-:1      [+ convert() +] storeI2, loadI1.H0;
+--:-:-:-:1      [+ convert() +] storeI1, loadI0.H1;
+--:-:3:-:2      [+ convert() +] storeI0, loadI0.H0;
+
+04:1:-:-:1      STS.128 [writeIs], storeI0;
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.128 loadF, [trackF];  // issue the next F and I loads ahead of the LOOP label
+--:-:3:-:1  @P1 LDG.E.64     loadI, [trackI];
+
+[-
+    our $convert;  # re-declares the package variable assigned in the header block so the strings below can interpolate it
+    our %insert =  # NOTE(review): keys look like j<step>c<slot> positions consumed by main_loop(), which is defined outside this file, confirm there
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",  # P1 = (posCRST >= 0)
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",  # P0 = (posCRST >= -8)
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",  # posCRSTf = float(posCRST)
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",  # channel = posCRSTf * lutSizeRcp
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",  # channel += channel * 2^-24, nudging the quotient up before truncation
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",  # channel = trunc(channel)
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",  # lutOffset = posCRST - channel * lutSize
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",  # times 8: the entry read below is 64 bits wide
+
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",  # sliceI and sliceF come from one 64-bit table entry
+
+        j1c29 => "02:-:-:-:1  \@P0 $convert storeF7, loadF3.H1;\n",  # 02 = wait on barrier 2, the F load. Same descending order as the prologue
+        j1c33 => "--:-:-:-:1  \@P0 $convert storeF6, loadF3.H0;\n",
+        j1c37 => "--:-:-:-:1  \@P0 $convert storeF5, loadF2.H1;\n",
+        j1c41 => "--:-:5:-:1  \@P0 $convert storeF4, loadF2.H0;\n",
+        j1c45 => "--:-:-:-:1  \@P0 $convert storeF3, loadF1.H1;\n",
+        j1c49 => "--:-:-:-:1  \@P0 $convert storeF2, loadF1.H0;\n",
+        j1c53 => "--:-:-:-:1  \@P0 $convert storeF1, loadF0.H1;\n",
+        j1c57 => "--:-:2:-:1  \@P0 $convert storeF0, loadF0.H0;\n",
+
+        j1c59 => "10:5:-:-:1  \@P0 STS.128 [writeFs + 4x<64>], storeF4;\n",  # 10 = wait on barrier 5 (storeF4 conversion), then reuses 5 as its read barrier
+        j2c8  => "02:2:-:-:1  \@P0 STS.128 [writeFs + 4x<00>], storeF0;\n",
+
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",  # with the line above: channel times the low and high halves of param_DHWN
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",  # posCRST -= 8, not predicated
+
+        j2c29 => "30:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",  # 30 = wait on barriers 5 and 6: offsetF is allocated from 104-107, which the j1c59 STS reads as storeF4-7
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     1;\n",  # trackF = param_F + offsetF * 2, low word
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",  # offsetI = offsetIn + offsetIc + sliceI
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;\n",  # high word, with the carry from j2c34
+
+        j2c40 => "02:-:2:-:1  \@P1 LDG.E.CI.128 loadF, [trackF];\n",
+
+        j5c45 => "04:-:-:-:1  \@P0 $convert storeI3, loadI1.H1;\n",  # 04 = wait on barrier 3, the I load
+        j5c49 => "--:-:-:-:1  \@P0 $convert storeI2, loadI1.H0;\n",
+        j5c53 => "--:-:-:-:1  \@P0 $convert storeI1, loadI0.H1;\n",
+        j5c57 => "--:-:3:-:1  \@P0 $convert storeI0, loadI0.H0;\n",
+
+        j6c8  => "04:3:-:-:1  \@P0 STS.128 [writeIs], storeI0;\n",
+
+        j6c55 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     1;\n",  # trackI = param_I + offsetI * 2, low word
+        j6c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;\n",
+
+        j6c62 => "04:-:3:-:1  \@P1 LDG.E.64 loadI, [trackI];\n",
+
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .  # barrier, then read pointers move by -swapBuf, write pointers by +swapBuf, and swapBuf changes sign
+                   "--:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",  # branch back to LOOP while P0 holds
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];  // reloads registers 64-67 (mpqk0-3), sets barrier 1
+--:-:2:-:1      S2R tid,   SR_TID.X;  // tid (register 69) and idx_N (68) lie in the 64-79 range also mapped to j0Ix/j0Fy, presumably why both are read again here
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = (tid & 7) << 3
+// tidOY = tid >> 3
+02:-:-:-:1      LOP.AND tidOX,  tid,   7;  // 02 waits on barrier 2, the tid reload above
+--:-:-:-:1      SHL     tidOX,  tidOX, 3;
+--:-:-:-:1      SHR.U32 tidOY,  tid,   3;
+
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT;  // P2 = (swapBuf > 0)
+--:-:-:-:1      IADD readIs,  readIs, -4x<szShareF>;  // removes the 4*szShareF base added when readIs was first computed
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf;  // with P2 set, both read pointers are pulled back by swapBuf
+--:-:-:-:1  @P2 IADD readIs,  readIs, -swapBuf;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 64 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 4;
+
+// readCs  = 4 * (tidOX + (tidOY * 64))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*64 + tidOX;
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 6;  // 04 waits on barrier 3, the idx_N reload above
+
+// Mul by 4 here expands k stride back out
+// Mul by 2 again to undo the bank conflict avoiding stride
+// k = blkF*128 + tidOY * 8
+--:-:-:-:1      SHL    tidOY,   tidOY, 3;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 7;  // 01 waits for the mpqk load above: idx_K is register 67, the same register as mpqk3
+
+[+ output_setup(63, 0, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
diff --git a/Kernel/Convolution/Pascal/hconv_xprop_X32_N128.sass b/Kernel/Convolution/Pascal/hconv_xprop_X32_N128.sass
new file mode 100644
index 0000000..e85f7d4
--- /dev/null
+++ b/Kernel/Convolution/Pascal/hconv_xprop_X32_N128.sass
@@ -0,0 +1,323 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $int16;  # NOTE(review): only declared here, presumably set by the build driver before assembly, confirm
+    our $prefix  = 'h';  # NOTE(review): not read in this file, presumably consumed by xconv_xprop_common.sass
+    our $shareI  = 128;  # same 128 as in szShareI : (128*8) below
+    our $shareF  = 32;  # same 32 as in szShareF : (32*8) below
+    our $stepI   = 32;
+    our $stepF   = 16;
+    our $remapI  = 1;  # NOTE(review): not read in this file, presumably tells the shared include about the IX remap done in the setup block, confirm
+    our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';  # S16 to F32 opcode when $int16 is true, F16 to F32 otherwise
+    sub convert {return $convert;}  # returns the opcode chosen above, used by the inline convert() expansions in this file
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<32*8*2 + 128*8*2 + 0>
+    szShareF  : (32*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<32*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<32*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<32*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<32*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<32*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<32*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<32*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<32*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-69 : m, p, q
+      64-69 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne
+     70-113 ~ tid1, tidIX, tidFX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     70-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+
+    100-103 : load0I<0-3>
+    100-103 : store0I<0-3>
+    104-107 : store0I<4-7>
+
+    108-111 : load1I<0-3>
+    108-111 : store1I<0-3>
+    104-107 : store1I<4-7>
+
+    112-113 : loadF<0-1>
+    104-107 : storeF<0-3>
+
+    114-115 : sliceI, sliceF
+    114-115 : sliceIF<0-1>
+
+    116-140 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetF, offsetIc, offsetFc
+    141-155 ~ readFs, readIs, swapBuf, tid, idx_N, tid7, tid1_7, tid32, tid32_1
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-140  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;  // thread index x, signalled on barrier 1
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;  // block index x, barrier 2
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;  // block index y, barrier 3
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;  // block index z, barrier 4
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;  // P0 = (tid >= 32), after waiting on barrier 1
+
+[+ load_zeros() +]
+
+[+ get_mpqk() +]
+
+// tidIX = (tid & 7) << 3
+// tidFX = (tid & 7) << 2
+
+// tidY = tid >> 3
+--:-:-:-:1      LOP.AND tid7,  tid,  7;  // tid7 stays in its own register and is read again by the output setup after the loop
+--:-:-:-:1      SHL     tidIX, tid7, 3;
+--:-:-:-:1      SHL     tidFX, tid7, 2;
+--:-:-:-:1      SHR.U32 tidY,  tid,  3;
+
+// offsetFk = idx_K*32 + tidFX  (added into trackF later by the j2c29/j2c34 loop inserts)
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidFX, 5;
+
+// offsetIn = idx_N*128 + tidIX  (added into trackI later by the j2c36/j4c50 loop inserts)
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidIX, 7;  // 08 waits on barrier 4 (idx_N)
+
+// writeFs = (32*tidY + tidFX) * 4
+--:-:-:-:1      ISCADD  writeFs, tidY, tidFX, 5;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// Remap the IX dim to avoid bank conflicts when storing to shared
+
+// writeIs = (128*tidY + tidFX) * 4 + 4*szShareF  (tidFX rather than tidIX: the I values are stored as four groups of four at +0, +32, +64, +96)
+--:-:-:-:1      ISCADD  writeIs, tidY, tidFX, 7;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;  // swapBuf = 4*(szShareF + szShareI), its sign is flipped by the j6c63 loop insert
+
+// readFs  = (((tid & 16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = ((((tid & 32) >> 1) | ((tid >> 1) & 7)) << 4) + 4*szShareF
+--:-:-:-:1      LOP.AND tid32,   tid,   32;
+--:-:-:-:1      SHR.U32 tid32_1, tid32, 1;
+--:-:-:-:1      BFE.U32 tid1_7,  tid,   0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, tid1_7, tid32_1;
+--:-:-:-:1      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.64 loadF, [trackF];  // P1 set: fetch 64 bits of F (four 16-bit values, see the .H0/.H1 conversions below)
+--:-:5:-:1 @!P1 LDS.U.64    loadF, [addr_zero];  // P1 clear: read 64 bits from addr_zero instead
+
+--:-:3:-:1  @P1 LDG.E.128 load0I, [trackI + 2x<00>];  // first 128 bits of I (eight 16-bit values)
+--:-:4:-:1  @P1 LDG.E.128 load1I, [trackI + 2x<64>];  // second 128 bits of I, at the 2x 64 offset
+--:-:-:-:1 @!P1 LDS.U.128 load0I, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 load1I, [addr_zero];
+
+12:-:-:-:1      [+ convert() +] storeF3, loadF1.H1;  // 12 = wait on barriers 2 and 5, i.e. whichever F load ran
+--:-:-:-:1      [+ convert() +] storeF2, loadF1.H0;
+--:-:-:-:1      [+ convert() +] storeF1, loadF0.H1;
+--:-:2:-:2      [+ convert() +] storeF0, loadF0.H0;
+
+02:1:-:-:2      STS.128 [writeFs], storeF0;  // sets read barrier 1
+
+25:-:-:-:1      [+ convert() +] store0I7, load0I3.H1;  // 25 = wait on barriers 1, 3 and 6. store0I4-7 share registers 104-107 with storeF0-3, which the STS above reads
+--:-:-:-:1      [+ convert() +] store0I6, load0I3.H0;
+--:-:-:-:1      [+ convert() +] store0I5, load0I2.H1;
+--:-:2:-:1      [+ convert() +] store0I4, load0I2.H0;
+--:-:-:-:1      [+ convert() +] store0I3, load0I1.H1;
+--:-:-:-:1      [+ convert() +] store0I2, load0I1.H0;
+--:-:-:-:1      [+ convert() +] store0I1, load0I0.H1;
+--:-:3:-:1      [+ convert() +] store0I0, load0I0.H0;
+
+02:-:-:-:1      STS.128 [writeIs + 4x<32>], store0I4;
+04:1:-:-:2      STS.128 [writeIs + 4x<00>], store0I0;
+
+09:-:-:-:1      [+ convert() +] store1I7, load1I3.H1;  // 09 = wait on barriers 1 and 4. store1I4-7 share registers 104-107 with store0I4-7
+--:-:-:-:1      [+ convert() +] store1I6, load1I3.H0;
+--:-:-:-:1      [+ convert() +] store1I5, load1I2.H1;
+--:-:2:-:1      [+ convert() +] store1I4, load1I2.H0;
+--:-:-:-:1      [+ convert() +] store1I3, load1I1.H1;
+--:-:-:-:1      [+ convert() +] store1I2, load1I1.H0;
+--:-:-:-:1      [+ convert() +] store1I1, load1I0.H1;
+--:-:3:-:1      [+ convert() +] store1I0, load1I0.H0;
+
+02:-:-:-:1      STS.128 [writeIs + 4x<96>], store1I4;
+04:1:-:-:1      STS.128 [writeIs + 4x<64>], store1I0;
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.64 loadF,  [trackF];  // issue the next F and I loads ahead of the LOOP label
+--:-:3:-:1  @P1 LDG.E.128   load0I, [trackI + 2x<00>];
+--:5:4:-:1  @P1 LDG.E.128   load1I, [trackI + 2x<64>];  // also sets read barrier 5, which the j4c50 insert waits on before it rewrites trackI
+
+[-
+    our $convert;  # re-declares the package variable assigned in the header block so the strings below can interpolate it
+    our %insert =  # NOTE(review): keys look like j<step>c<slot> positions consumed by main_loop(), which is defined outside this file, confirm there
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",  # P1 = (posCRST >= 0)
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",  # P0 = (posCRST >= -8)
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",  # posCRSTf = float(posCRST)
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",  # channel = posCRSTf * lutSizeRcp
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",  # channel += channel * 2^-24, nudging the quotient up before truncation
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",  # channel = trunc(channel)
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",  # lutOffset = posCRST - channel * lutSize
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",  # times 8: the entry read by j1c62 is 64 bits wide
+
+        j1c33 => "02:-:-:-:1  \@P0 $convert storeF3, loadF1.H1;\n",  # 02 = wait on barrier 2, the F load
+        j1c37 => "--:-:-:-:1  \@P0 $convert storeF2, loadF1.H0;\n",
+        j1c41 => "--:-:-:-:1  \@P0 $convert storeF1, loadF0.H1;\n",
+        j1c45 => "--:-:2:-:1  \@P0 $convert storeF0, loadF0.H0;\n",
+
+        j1c60 => "02:-:-:-:1  \@P0 STS.128 [writeFs], storeF0;\n",
+
+        j1c62 => "--:-:2:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",  # sliceI and sliceF come from one 64-bit table entry, signalled on barrier 2
+
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",  # with the line above: channel times the low and high halves of param_DHWN
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",  # posCRST -= 8, not predicated
+
+        j2c29 => "02:-:-:-:1  \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n",  # offsetF = offsetFk + offsetFc + sliceF, 02 waits on the sliceIF load
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     1;\n",  # trackF = param_F + offsetF * 2, low word
+        j2c36 => "--:-:-:-:1  \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n",  # offsetI = offsetIn + offsetIc + sliceI
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;\n",  # high word, with the carry from j2c34
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.64 loadF0, [trackF];\n",
+
+
+        j3c29 => "04:-:-:-:1  \@P0 $convert store0I7, load0I3.H1;\n",  # 04 = wait on barrier 3, the load0I fetch
+        j3c33 => "--:-:-:-:1  \@P0 $convert store0I6, load0I3.H0;\n",
+        j3c37 => "--:-:-:-:1  \@P0 $convert store0I5, load0I2.H1;\n",
+        j3c41 => "--:-:6:-:1  \@P0 $convert store0I4, load0I2.H0;\n",
+        j3c45 => "--:-:-:-:1  \@P0 $convert store0I3, load0I1.H1;\n",
+        j3c49 => "--:-:-:-:1  \@P0 $convert store0I2, load0I1.H0;\n",
+        j3c53 => "--:-:-:-:1  \@P0 $convert store0I1, load0I0.H1;\n",
+        j3c57 => "--:-:3:-:1  \@P0 $convert store0I0, load0I0.H0;\n",
+
+        j3c59 => "20:-:-:-:1  \@P0 STS.128 [writeIs + 4x<32>], store0I4;\n",  # 20 = wait on barrier 6, the store0I4 conversion
+        j4c8  => "04:3:-:-:1  \@P0 STS.128 [writeIs + 4x<00>], store0I0;\n",
+
+        j4c50 => "10:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     1;\n",  # 10 = wait on barrier 5, the read barrier of the load1I fetch that still uses the old trackI
+        j4c55 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;\n",
+
+        j4c61 => "04:-:3:-:1  \@P1 LDG.E.128 load0I0, [trackI + 2x<00>];\n",
+
+
+        j5c29 => "08:-:-:-:1  \@P0 $convert store1I7, load1I3.H1;\n",  # 08 = wait on barrier 4, the load1I fetch
+        j5c33 => "--:-:-:-:1  \@P0 $convert store1I6, load1I3.H0;\n",
+        j5c37 => "--:-:-:-:1  \@P0 $convert store1I5, load1I2.H1;\n",
+        j5c41 => "--:-:6:-:1  \@P0 $convert store1I4, load1I2.H0;\n",
+        j5c45 => "--:-:-:-:1  \@P0 $convert store1I3, load1I1.H1;\n",
+        j5c49 => "--:-:-:-:1  \@P0 $convert store1I2, load1I1.H0;\n",
+        j5c53 => "--:-:-:-:1  \@P0 $convert store1I1, load1I0.H1;\n",
+        j5c57 => "--:-:4:-:1  \@P0 $convert store1I0, load1I0.H0;\n",
+
+        j5c59 => "20:-:-:-:1  \@P0 STS.128 [writeIs + 4x<96>], store1I4;\n",  # 20 = wait on barrier 6, the store1I4 conversion
+        j6c8  => "08:4:-:-:1  \@P0 STS.128 [writeIs + 4x<64>], store1I0;\n",
+
+        j6c61 => "08:5:4:-:1  \@P1 LDG.E.128 load1I0, [trackI + 2x<64>];\n",
+
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .  # barrier, then read pointers move by -swapBuf, write pointers by +swapBuf, and swapBuf changes sign
+                   "--:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",  # branch back to LOOP while P0 holds
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];  // reloads registers 64-67 (mpqk0-3), which the register map also assigns to j0Ix0-3. Sets barrier 1
+
+<SCHEDULE_BLOCK>
+
+// tidOX = ((tid & 7) << 3) + ((tid & 32) << 1)
+// tidOY = (tid & 31) >> 3
+--:-:-:-:1      SHL     tid32,  tid32, 1;  // tid32 and tid7 still hold the values computed in the setup block before the loop
+--:-:-:-:1      ISCADD  tidOX,  tid7,  tid32, 3;
+--:-:-:-:1      LOP.AND tidOY,  tid,   31;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY, 3;
+
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT;  // P2 = (swapBuf > 0)
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf;  // with P2 set, readFs is pulled back by swapBuf
+
+// readIs = (((tid & 32) >> 1) + (((tid >> 1) & 7) << 1)) << 4   (rebuilt from tid32_1 and tid1_7, without the szShareF base)
+--:-:-:-:1      ISCADD readIs, tid1_7, tid32_1, 1;
+--:-:-:-:1      SHL    readIs, readIs, 4;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*128 + tidOX;
+--:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*32 + tidOY * 4
+--:-:-:-:1      SHL    tidOY,   tidOY, 2;
+--:-:-:-:1      ISCADD k, idx_K, tidOY, 5;  // NOTE(review): idx_K is register 67 (mpqk3) but this line has no 01 wait on the mpqk load, unlike both X128 variants. Confirm output_setup or the scheduler covers it
+
+
+[+ output_setup(63, 1, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
diff --git a/Kernel/Convolution/Pascal/hconv_xprop_X64_N128.sass b/Kernel/Convolution/Pascal/hconv_xprop_X64_N128.sass
new file mode 100644
index 0000000..38f8183
--- /dev/null
+++ b/Kernel/Convolution/Pascal/hconv_xprop_X64_N128.sass
@@ -0,0 +1,293 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $int16;
+    our $prefix = 'h';
+    our $shareI = 128;
+    our $shareF = 64;
+    our $stepI  = 64;
+    our $stepF  = 32;
+    our $remapI = 1;
+    our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';
+    sub convert {return $convert;}
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<64*8*2 + 128*8*2 + 0>
+    szShareF  : (64*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<64*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-111 ~ tid1, tid15, tid64, tidIX, tidFX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+
+    100-103 : loadI<0-3>
+    100-103 : storeI<0-3>
+    104-107 : storeI<4-7>
+
+    108-109 : loadF<0-1>
+    104-107 : storeF<0-3>
+
+    110-111 : sliceI, sliceF
+    110-111 : sliceIF<0-1>
+
+    108-109 ~ offsetF
+
+    112-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetIc, offsetFc
+    125-127 ~ readFs, readIs, swapBuf
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-124  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidIX = (tid & 15) << 3
+// tidFX = (tid & 15) << 2
+// tidY = tid >> 4
+--:-:-:-:1      LOP.AND tid15, tid,  15;
+--:-:-:-:1      SHL     tidIX, tid15, 3;
+--:-:-:-:1      SHL     tidFX, tid15, 2;
+--:-:-:-:1      SHR.U32 tidY,  tid,   4;
+
+// trackF += blkF*64 + tidFX
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidFX, 6;
+
+// trackI += blkI*128 + tidIX
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidIX, 7;
+
+// writeFs = (64*tidY + tidFX) * 4
+--:-:-:-:1      ISCADD  writeFs, tidY, tidFX, 6;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// Remap the IX dim to avoid bank conflicts when storing to shared
+
+// writeIs = (128*tidY + tidFX) * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD  writeIs, tidY, tidFX, 7;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+// readFs = (((tid & 48) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    48;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = ((((tid & 64) >> 3) | ((tid >> 1) & 7)) << 4) + 4x<szShareF>
+--:-:-:-:1      LOP.AND tid64,  tid,    64;
+--:-:-:-:1      SHR.U32 tid64,  tid64,  3;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid64;
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.64 loadF, [trackF];
+--:-:5:-:1 @!P1 LDS.U.64    loadF, [addr_zero];
+
+--:-:3:-:1  @P1 LDG.E.128 loadI, [trackI];
+--:-:6:-:1 @!P1 LDS.U.128 loadI, [addr_zero];
+
+12:-:-:-:1      [+ convert() +] storeF3, loadF1.H1;
+--:-:-:-:1      [+ convert() +] storeF2, loadF1.H0;
+--:-:-:-:1      [+ convert() +] storeF1, loadF0.H1;
+--:-:2:-:2      [+ convert() +] storeF0, loadF0.H0;
+
+02:1:-:-:2      STS.128 [writeFs], storeF0;
+
+25:-:-:-:1      [+ convert() +] storeI7, loadI3.H1;
+--:-:-:-:1      [+ convert() +] storeI6, loadI3.H0;
+--:-:-:-:1      [+ convert() +] storeI5, loadI2.H1;
+--:-:2:-:1      [+ convert() +] storeI4, loadI2.H0;
+--:-:-:-:1      [+ convert() +] storeI3, loadI1.H1;
+--:-:-:-:1      [+ convert() +] storeI2, loadI1.H0;
+--:-:-:-:1      [+ convert() +] storeI1, loadI0.H1;
+--:-:3:-:1      [+ convert() +] storeI0, loadI0.H0;
+
+02:-:-:-:1      STS.128 [writeIs + 4x<64>], storeI4;
+04:1:-:-:1      STS.128 [writeIs + 4x<00>], storeI0;
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.64 loadF, [trackF];
+--:-:3:-:1  @P1 LDG.E.128   loadI, [trackI];
+
+[-
+    our $convert;
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+
+        j1c33 => "02:-:-:-:1  \@P0 $convert storeF3, loadF1.H1;\n",
+        j1c37 => "--:-:-:-:1  \@P0 $convert storeF2, loadF1.H0;\n",
+        j1c41 => "--:-:-:-:1  \@P0 $convert storeF1, loadF0.H1;\n",
+        j1c45 => "--:-:2:-:1  \@P0 $convert storeF0, loadF0.H0;\n",
+
+        j1c60 => "02:2:-:-:1  \@P0 STS.128 [writeFs], storeF0;\n",
+
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j2c29 => "22:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     1;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;\n",
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.64 loadF, [trackF];\n",
+
+
+        j5c29 => "04:-:-:-:1  \@P0 $convert storeI7, loadI3.H1;\n",
+        j5c33 => "--:-:-:-:1  \@P0 $convert storeI6, loadI3.H0;\n",
+        j5c37 => "--:-:-:-:1  \@P0 $convert storeI5, loadI2.H1;\n",
+        j5c41 => "--:-:6:-:1  \@P0 $convert storeI4, loadI2.H0;\n",
+        j5c45 => "--:-:-:-:1  \@P0 $convert storeI3, loadI1.H1;\n",
+        j5c49 => "--:-:-:-:1  \@P0 $convert storeI2, loadI1.H0;\n",
+        j5c53 => "--:-:-:-:1  \@P0 $convert storeI1, loadI0.H1;\n",
+        j5c57 => "--:-:3:-:1  \@P0 $convert storeI0, loadI0.H0;\n",
+
+        j5c59 => "20:-:-:-:1  \@P0 STS.128 [writeIs + 4x<64>], storeI4;\n",
+        j6c8  => "04:3:-:-:1  \@P0 STS.128 [writeIs + 4x<00>], storeI0;\n",
+
+        j6c55 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     1;\n",
+        j6c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;\n",
+
+        j6c62 => "04:-:3:-:1  \@P1 LDG.E.128 loadI, [trackI];\n",
+
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                   "--:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+--:-:2:-:1      S2R tid,   SR_TID.X;
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = (tid & 7) << 3 + (tid & 64)
+// tidOY = (tid & 63) >> 3
+02:-:-:-:1      LOP.AND tidOX,  tid,    7;
+--:-:-:-:1      SHL     tidOX,  tidOX,  3;
+--:-:-:-:1      LOP.AND tidOX2, tid,    64;
+--:-:-:-:1      LOP.OR  tidOX,  tidOX,  tidOX2;
+--:-:-:-:1      LOP.AND tidOY,  tid,    63;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY,  3;
+
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readIs,  readIs, -4x<szShareF>;
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf;
+--:-:-:-:1  @P2 IADD readIs,  readIs, -swapBuf;
+
+// Expand back out to undo our bank conflict avoiding stride
+--:-:-:-:1      SHL readIs, readIs, 1;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*128 + tidOX;
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = blkF*64 + tidOY * 4
+--:-:-:-:1      SHL    tidOY,   tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 6;
+
+[+ output_setup(63, 1, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
diff --git a/Kernel/Convolution/Pascal/hconv_xprop_X64_N64.sass b/Kernel/Convolution/Pascal/hconv_xprop_X64_N64.sass
new file mode 100644
index 0000000..16b92c5
--- /dev/null
+++ b/Kernel/Convolution/Pascal/hconv_xprop_X64_N64.sass
@@ -0,0 +1,290 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $int16;
+    our $prefix = 'h';
+    our $shareI = 64;
+    our $shareF = 64;
+    our $stepI  = 32;
+    our $stepF  = 32;
+    our $remapF = 1;
+    our $remapI = 1;
+    our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';
+    sub convert {return $convert;}
+
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<64*8*2 + 64*8*2 + 0>
+    szShareF  : (64*8)
+    szShareI  : (64*8)
+
+    addr_zero  : 4x<64*8*2 + 64*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 64*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 64*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 64*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 64*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 64*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 64*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 64*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-111 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-103 : loadI<0-3>
+    100-103 : storeI<0-3>
+    104-107 : storeI<4-7>
+
+    108-111 : loadF<0-3>
+    108-111 : storeF<0-3>
+    104-107 : storeF<4-7>
+
+    104-107 ~ offsetF
+
+    112-113 : sliceI, sliceF
+    112-113 : sliceIF<0-1>
+
+    114-125 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetIc, offsetFc
+    126-127 ~ readFs, readIs
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-125  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+
+[+ get_mpqk() +]
+
+// tidX = (tid & 7) << 3
+// tidY = tid >> 3
+--:-:-:-:1      LOP.AND tidX, tid,  7;
+--:-:-:-:1      SHL     tidX, tidX, 3;
+--:-:-:-:1      SHR.U32 tidY, tid,  3;
+
+// trackF += blkF*64 + tidX
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 6;
+
+// trackI += blkI*64 + tidX
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 6;
+
+// Remap the X dim to avoid bank conflicts when storing to shared
+// We can unmap this in the output
+--:-:-:-:1      SHR.U32 tidX, tidX, 1;
+
+// writeS = (64*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 6;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+
+// readFs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:0      SHL     readFs, readFs, 4;
+
+// readIs = ((tid >> 1) & 7) << 4 + 4x<8*64>;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:1:-:1  @P1 LDG.E.CI.128 loadF0, [trackF];
+--:-:5:-:1 @!P1 LDS.U.128    loadF0, [addr_zero];
+
+--:-:2:-:1  @P1 LDG.E.128 loadI0, [trackI];
+--:-:6:-:1 @!P1 LDS.U.128 loadI0, [addr_zero];
+
+11:-:-:-:1      [+ convert() +] storeF7, loadF3.H1;
+--:-:-:-:1      [+ convert() +] storeF6, loadF3.H0;
+--:-:-:-:1      [+ convert() +] storeF5, loadF2.H1;
+--:-:1:-:1      [+ convert() +] storeF4, loadF2.H0;
+--:-:-:-:1      [+ convert() +] storeF3, loadF1.H1;
+--:-:-:-:1      [+ convert() +] storeF2, loadF1.H0;
+--:-:-:-:1      [+ convert() +] storeF1, loadF0.H1;
+--:-:5:-:1      [+ convert() +] storeF0, loadF0.H0;
+
+01:1:-:-:1      STS.128 [writeS + 4x<0*64 + 32>], storeF4;
+10:-:-:-:1      STS.128 [writeS + 4x<0*64 +  0>], storeF0;
+
+23:-:-:-:1      [+ convert() +] storeI7, loadI3.H1;
+--:-:-:-:1      [+ convert() +] storeI6, loadI3.H0;
+--:-:-:-:1      [+ convert() +] storeI5, loadI2.H1;
+--:-:1:-:1      [+ convert() +] storeI4, loadI2.H0;
+--:-:-:-:1      [+ convert() +] storeI3, loadI1.H1;
+--:-:-:-:1      [+ convert() +] storeI2, loadI1.H0;
+--:-:-:-:1      [+ convert() +] storeI1, loadI0.H1;
+--:-:5:-:1      [+ convert() +] storeI0, loadI0.H0;
+
+01:-:-:-:1      STS.128 [writeS + 4x<8*64 + 32>], storeI4;
+10:1:-:-:1      STS.128 [writeS + 4x<8*64 +  0>], storeI0;
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>];
+--:-:3:-:1  @P1 LDG.E.128    loadI0, [trackI + 4x< 0>];
+
+[-
+    our $convert;
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+
+        j1c20 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j1c25 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j1c31 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j1c32 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j1c18 => "02:-:-:-:1  \@P0 $convert storeF7, loadF3.H1;\n",
+        j1c22 => "--:-:-:-:1  \@P0 $convert storeF6, loadF3.H0;\n",
+        j1c26 => "--:-:-:-:1  \@P0 $convert storeF5, loadF2.H1;\n",
+        j1c30 => "--:-:5:-:1  \@P0 $convert storeF4, loadF2.H0;\n",
+        j1c33 => "--:-:-:-:1  \@P0 $convert storeF3, loadF1.H1;\n",
+        j1c37 => "--:-:-:-:1  \@P0 $convert storeF2, loadF1.H0;\n",
+        j1c41 => "--:-:-:-:1  \@P0 $convert storeF1, loadF0.H1;\n",
+        j1c45 => "--:-:2:-:1  \@P0 $convert storeF0, loadF0.H0;\n",
+
+        j1c47 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<0*64 + 32>], storeF4;\n",
+        j1c62 => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*64 +  0>], storeF0;\n",
+
+        j2c19 => "30:-:-:-:1  \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c24 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     1;\n",
+        j2c26 => "--:-:-:-:1  \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c28 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 1;\n",
+
+        j2c30 => "02:-:2:-:1  \@P1 LDG.E.CI.128 loadF0, [trackF];\n",
+
+        j5c29 => "04:-:-:-:1  \@P0 $convert storeI7, loadI3.H1;\n",
+        j5c33 => "--:-:-:-:1  \@P0 $convert storeI6, loadI3.H0;\n",
+        j5c37 => "--:-:-:-:1  \@P0 $convert storeI5, loadI2.H1;\n",
+        j5c41 => "--:-:5:-:1  \@P0 $convert storeI4, loadI2.H0;\n",
+        j5c45 => "--:-:-:-:1  \@P0 $convert storeI3, loadI1.H1;\n",
+        j5c49 => "--:-:-:-:1  \@P0 $convert storeI2, loadI1.H0;\n",
+        j5c53 => "--:-:-:-:1  \@P0 $convert storeI1, loadI0.H1;\n",
+        j5c57 => "--:-:3:-:1  \@P0 $convert storeI0, loadI0.H0;\n",
+
+        j5c59 => "10:-:-:-:1  \@P0 STS.128 [writeS + 4x<8*64 + 32>], storeI4;\n",
+        j6c8  => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<8*64 +  0>], storeI0;\n",
+
+        j6c50 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     1;\n",
+        j6c55 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 1;\n",
+
+        j6c61 => "04:-:3:-:1  \@P1 LDG.E.128 loadI0, [trackI];\n",
+
+        j6c62 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+--:-:2:-:1      S2R tid,   SR_TID.X;
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = (tid & 7) << 3
+// tidOY = tid >> 3
+02:-:-:-:1      LOP.AND tidOX, tid,   7;
+--:-:-:-:1      SHL     tidOX, tidOX, 3;
+--:-:-:-:1      SHR.U32 tidOY, tid,   3;
+
+--:-:-:-:1      LOP.AND readIs, readIs, 0x7ff;
+--:-:-:-:1      LOP.AND readFs, readFs, 0x7ff;
+
+// Expand back out to undo our bank conflict avoiding stride
+--:-:-:-:1      SHL readIs, readIs, 1;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 64 + readIs;
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 4;
+
+// readCs  = 4 * (tidOX + (tidOY * 64))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = blkI*64 + tidOX;
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 6;
+
+// Mul by 4 here expands k stride back out
+// Mul by 2 again to undo the bank conflict avoiding stride
+// k = blkF*64 + tidOY * 8
+--:-:-:-:1      SHL    tidOY,   tidOY, 3;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 6;
+
+[+ output_setup(63, 0, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
diff --git a/Kernel/Convolution/Pascal/persistent_rnn_bprop.sass b/Kernel/Convolution/Pascal/persistent_rnn_bprop.sass
new file mode 100644
index 0000000..ddddb22
--- /dev/null
+++ b/Kernel/Convolution/Pascal/persistent_rnn_bprop.sass
@@ -0,0 +1,638 @@
+# Kernel: presistent_birnn
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(64*48)>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_d[0]        : c[0x0][0x140]
+    param_d[1]        : c[0x0][0x144]
+    param_dnext[0]    : c[0x0][0x148]
+    param_dnext[1]    : c[0x0][0x14c]
+    param_h[0]        : c[0x0][0x150]
+    param_h[1]        : c[0x0][0x154]
+    param_w[0]        : c[0x0][0x158]
+    param_w[1]        : c[0x0][0x15c]
+    param_lockAddr[0] : c[0x0][0x160]
+    param_lockAddr[1] : c[0x0][0x164]
+    param_ldd         : c[0x0][0x168]
+    param_ldh         : c[0x0][0x16c]
+    param_ldw         : c[0x0][0x170]
+    param_bsz         : c[0x0][0x174]
+    param_seqLength   : c[0x0][0x178]
+    param_numBlks     : c[0x0][0x17c]
+    param_rowSize     : c[0x0][0x180]
+    param_reverse     : c[0x0][0x184]
+    param_reluclip    : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+      0-215 : weight<000-215>
+    216-227 : accum<00-11>
+    228-231 : timeStep, warpTid, rowOffset, tid
+
+    232-235 : wAddr<0-1>, biasAddr<0-1>
+    236-254 ~ bid, ldw, wRow, loadRow, tidLsbs, tidMsbs, warpIndex, storeWeights, loadWeights, outRow, rowSize
+
+    232-249 : loadBuffer<0-3>, delta0r<0-3>, delta1r<0-3>, delta2r<0-3>, dnextAddr<0-1>
+    250-254 ~ loadDeltas, storeDeltas, loadIndex, dOffset, ldd
+
+    236-247 : peerR0V<0-3>, peerR1V<0-3>, peerR2V<0-3>
+    244     : hOffset
+    248-253 : h<0-3>, hAddr<0-1>
+
+    232-241 : output<0-3>, dAddr<0-1>, lockAddr<0-1>, expectVal, setVal
+    241-245 ~ storeIndex, hRow, predSave, lockVal, reluclip
+
+</REGISTER_MAPPING>
+
+//Get tid/block id
+--:-:1:-:1      S2R tid, SR_TID.X;
+--:-:2:-:1      S2R bid, SR_CTAID.X;
+
+//Store zeros at addr_zero
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV     ldw,       param_ldw;
+--:-:-:-:1      MOV     rowSize,   param_rowSize;
+
+//timeStep = (param_reverse == 0) ? 0 : param_seqLength - 1
+--:-:-:-:1      ISETP.EQ.AND P2, PT, RZ, param_reverse, PT;
+--:-:-:-:1      SEL timeStep, RZ, param_seqLength, P2;
+--:-:-:-:1 @!P2 IADD timeStep, timeStep, -1;
+
+//warpIndex = threadIdx.x >> 5
+01:-:-:-:1      SHR.U32 warpIndex, tid, 5;
+
+//warpTid = threadIdx.x & 0x1f
+01:-:-:-:1      LOP.AND warpTid,   tid, 0x1f;
+
+//rowOffset = ((blockIdx.x << 3) + warp_index) * 6
+02:-:-:-:1      SHL     rowOffset, bid,       3;
+--:-:-:-:1      IADD    rowOffset, rowOffset, warpIndex;
+--:-:-:-:1      XMAD    rowOffset, rowOffset, 6, RZ;
+
+//if(warp_tid > 15) rowOffset += 3
+--:-:-:-:1      ISETP.GT.AND P1, PT, warpTid, 15, PT;
+--:-:-:-:1  @P1 IADD     rowOffset, rowOffset, 3;
+
+//warpTid = warpTid & 0x0f
+--:-:-:-:1      LOP.AND  warpTid, warpTid, 0x0f;
+--:-:-:-:1      ISETP.LT.AND P0, PT, warpTid, 3, PT;
+--:-:-:-:1      IADD     outRow, rowOffset, warpTid;
+--:-:-:-:1      ISETP.LT.AND P0, PT, outRow, param_rowSize, P0;
+
+//storeWeights = (((tid >> 2) * 48) + ((tid & 3) << 2)) << 2
+//wRow = ((tid >> 2) * ldw) + ((tid & 3) << 2) + (bid * 48)
+--:-:-:-:1      LOP.AND tidLsbs, warpTid, 0x03;
+--:-:-:-:1      SHR     tidMsbs, tid, 2;
+--:-:-:-:1      SHL     tidLsbs, tidLsbs, 2;
+
+--:-:-:-:1      XMAD    loadRow, bid, 48, tidLsbs;
+--:-:-:-:1      XMAD    wRow, tidMsbs, ldw, loadRow;
+
+--:-:-:-:1      XMAD    storeWeights, tidMsbs, 48, tidLsbs;
+--:-:-:-:1      SHL     storeWeights, storeWeights, 2;
+
+//loadWeights = ((((warpTid * 8) + warpIndex) * 6) + (P1 ? 3 : 0)) << 2
+--:-:-:-:1      XMAD    loadWeights, warpTid, 8, warpIndex;
+--:-:-:-:1      XMAD    loadWeights, loadWeights, 6, RZ;
+--:-:-:-:1  @P1 IADD    loadWeights, loadWeights, 3;
+--:-:-:-:1      SHL     loadWeights, loadWeights, 2;
+
+//wAddr = &w[wRow]
+--:-:-:-:1      LEA      wAddr0.CC, wRow, param_w[0],     2;
+--:-:-:-:1      LEA.HI.X wAddr1,    wRow, param_w[1], RZ, 2;
+
+//ldw = ldw << 8  (byte stride of 64 rows: 64 * ldw * 4)
+--:-:-:-:1      SHL      ldw,  ldw,       8;
+
+//Compute row loading predicates
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidMsbs, rowSize, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, loadRow, rowSize, P1;
+--:-:-:-:1      IADD     rowSize, rowSize, -16;
+--:-:-:-:1      ISETP.LT.AND P4, PT, loadRow, rowSize, P1;
+--:-:-:-:1      IADD     rowSize, rowSize, -16;
+--:-:-:-:1      ISETP.LT.AND P5, PT, loadRow, rowSize, P1;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:c NOP;
+
+//Load weights to registers
+<CODE>
+    my $out;
+    my $regId = 0;
+    my $rowsize = 1152;
+
+    for (my $col=0; $col < $rowsize; $col += 64)
+    {
+        $out .= "--:-:-:-:1      IADD tidMsbs, tidMsbs, 64;\n";
+
+        #Vector-load three 128-bit chunks (at wAddr, +4x<16>, +4x<32>) into weight registers col/16, col/16+72, col/16+144; when P3/P4/P5 is false the chunk is read from addr_zero instead
+        $regId = $col / 16;
+        $out .= sprintf "--:-:1:-:1  \@P3 LDG.E.128 weight%03d, [wAddr];\n", $regId;
+        $out .= sprintf "--:-:1:-:1 \@!P3 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+        $regId = $col / 16 + 72;
+        $out .= sprintf "--:-:2:-:1  \@P4 LDG.E.128 weight%03d, [wAddr + 4x<16>];\n", $regId;
+        $out .= sprintf "--:-:2:-:1 \@!P4 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+        $regId = $col / 16 + 144;
+        $out .= sprintf "--:-:3:-:1  \@P5 LDG.E.128 weight%03d, [wAddr + 4x<32>];\n", $regId;
+        $out .= sprintf "--:-:3:-:1 \@!P5 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+
+        $out .= "--:-:-:-:1      ISETP.LT.AND P3, PT, tidMsbs, param_rowSize, P3;\n";
+        $out .= "--:-:-:-:1      ISETP.LT.AND P4, PT, tidMsbs, param_rowSize, P4;\n";
+        $out .= "--:-:-:-:1      ISETP.LT.AND P5, PT, tidMsbs, param_rowSize, P5;\n";
+
+        #Store weights into shared memory (every pass after the first emits a BAR.SYNC before the stores)
+        if ($col > 0)
+        {
+            $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+        }
+
+        $regId = $col / 16;
+        $out .= sprintf "01:-:-:-:1      STS.U.128 [storeWeights], weight%03d;\n", $regId;
+        $regId = $col / 16 + 72;
+        $out .= sprintf "02:-:-:-:1      STS.U.128 [storeWeights + 4x<16>], weight%03d;\n", $regId;
+        $regId = $col / 16 + 144;
+        $out .= sprintf "04:-:-:-:1      STS.U.128 [storeWeights + 4x<32>], weight%03d;\n", $regId;
+
+        $out .= "--:-:-:-:6      IADD   wAddr0.CC, wAddr0, ldw;\n";
+        $out .= "--:-:-:-:1      IADD.X wAddr1,    wAddr1, RZ;\n\n";
+
+        #Load each weight from shared mem: 3 rows x 4 shared columns of scalar LDS per pass; only the last LDS of the final pass uses control code --:1:6:-:2
+        $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+
+        foreach my $row (0 .. 2)
+        {
+            foreach my $shared_col (0 .. 3)
+            {
+                my $control;
+
+                if (($col + 64) >= $rowsize && $row == 2 && $shared_col == 3)
+                {
+                    $control = "--:1:6:-:2";
+                }
+                else
+                {
+                    $control = "--:-:-:-:1";
+                }
+
+                $regId = ($row * 72) + ($col / 16) + $shared_col;
+                my $shared_offset = $row + ($shared_col * 16 * 48);
+                $out .= sprintf "%s      LDS.U weight%03d, [loadWeights + 4x<%d>];\n", $control, $regId, $shared_offset;
+            }
+        }
+    }
+
+    $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+
+    return $out;
+
+</CODE>
+
+//Predicates for store code
+--:-:-:-:1      ISETP.EQ.AND P2, PT, warpTid, 0, PT;
+--:-:-:-:1      ISETP.EQ.AND P3, PT, warpTid, 1, PT;
+--:-:-:-:1      ISETP.EQ.AND P4, PT, warpTid, 2, PT;
+
+UNROLLING_LOOP:
+<SCHEDULE_BLOCK>
+//Prime inner product loop by loading first rows of dnext
+--:-:-:-:1      MOV loadIndex,    tid;
+
+//storeDeltas = tid << 4
+--:-:-:-:1      SHL storeDeltas, tid, 4;
+--:-:-:-:1      SHL loadDeltas, warpTid, 4;
+
+//dnextAddr = param_dnext + ((loadIndex * param_ldd + timeStep) << 4)
+--:-:-:-:1      XMAD     dOffset,        loadIndex, param_ldd,      timeStep;
+--:-:-:-:1      LEA      dnextAddr0.CC,  dOffset,   param_dnext[0],     4;
+01:-:-:-:2      LEA.HI.X dnextAddr1,     dOffset,   param_dnext[1], RZ, 4;
+
+//loadBuffer = *dnextAddr
+--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;
+--:5:1:-:2  @P1 LDG.E.CI.128 loadBuffer, [dnextAddr];
+--:5:1:-:2 @!P1 LDS.U.CI.128 loadBuffer, [addr_zero];
+
+//ldd = param_ldd << 12
+--:-:-:-:1      MOV ldd, param_ldd;
+--:-:-:-:1      SHL ldd, ldd, 12;
+</SCHEDULE_BLOCK>
+
+//Initialize all accumulation registers to 0
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 accum%02d, [addr_zero];\n", $_ * 4), 0..2;
+</CODE>
+
+//Update load index and load address
+--:-:-:-:6      IADD loadIndex, loadIndex, 256;
+--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;
+10:-:-:-:6      IADD   dnextAddr0.CC, dnextAddr0, ldd;
+--:-:-:-:6      IADD.X dnextAddr1,    dnextAddr1, RZ;
+
+01:-:-:-:1      STS.U.128 [storeDeltas], loadBuffer;
+
+//Unrolled GEMM loop
+<CODE>
+    # Generates the text of the fully unrolled inner-product loop. Each of the
+    # 72 executed steps emits 12 FFMAs (3 weight rows x 4 delta columns),
+    # interleaved with the loads that feed later steps. Returns the text.
+
+    # Package variable; whatever it holds is emitted ahead of the loop. It is
+    # not assigned in this block.
+    our @top;
+
+    my $out = join '', @top;
+
+    # Hard-coded reduction length: 1152 / 16 = 72 steps, matching the
+    # $weight_index < 72 guard below.
+    my $rowsize = 1152;
+    my $weight_index = 0;
+
+    # Barriers rotate through 2..4 and delta buffers through 0..2. The buffer
+    # being filled (and the barrier it sets) runs two steps ahead of the buffer
+    # the FFMAs read (and the barrier they wait on).
+    my $wait_flag = 2;
+    my $set_flag = 4;
+    my $read_buffer = 0;
+    my $write_buffer = 2;
+
+    # One outer iteration per tile of 256 (16 steps of 16).
+    for (my $k=0; $k < $rowsize; $k+=256)
+    {
+        if ($k == 0)
+        {
+            # Prologue: start the next loadBuffer load (zeros when P1 is
+            # false), sync the block, then prime delta buffers 0 and 1, which
+            # set barriers 2 and 3.
+            $out .= "--:6:1:-:1  \@P1 LDG.E.CI.128 loadBuffer, [dnextAddr];\n";
+            $out .= "--:-:1:-:1 \@!P1 LDS.U.128    loadBuffer, [addr_zero];\n\n";
+            $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+            $out .= "--:-:2:-:1      LDS.U.128 delta0r, [loadDeltas];\n";
+            $out .= "--:-:3:-:1      LDS.U.128 delta1r, [loadDeltas + 4x<4*16>];\n\n";
+        }
+        # Toggle bit 12 of the store address (presumably selecting the other
+        # half of a two-part staging area).
+        $out .= "--:-:-:-:1      LOP.XOR storeDeltas, storeDeltas, 4096;\n";
+
+        foreach my $shared_row (0 .. 15)
+        {
+            # Only the first 72 steps emit code; the flag rotation at the
+            # bottom still runs for the skipped ones.
+            if($weight_index < 72)
+            {
+                # Prefetch the delta row two steps ahead into the write buffer,
+                # as long as that row exists.
+                if ($shared_row < 14 && ($k + (16 * ($shared_row + 2))) < $rowsize)
+                {
+                    my $read_bar = "-";
+                    # Last prefetch of a tile that has a successor: also set
+                    # read barrier 5. Row 14 waits on it (mask 10) before it
+                    # changes loadDeltas.
+                    if ($shared_row == 13 && ($k + 256) < $rowsize)
+                    {
+                        $read_bar = "5";
+                    }
+                    $out .= sprintf "--:%s:%d:-:1      LDS.U.128 delta%dr, [loadDeltas + 4x<4*%d>];\n", $read_bar, $set_flag, $write_buffer, (16 * ($shared_row + 2));
+                }
+
+                # Rows 11 and 12 advance loadIndex, dnextAddr and P1 for the
+                # load issued on row 13. Skipped when no further tile will be
+                # loaded. Mask 20 = barrier 6, set in the second control field
+                # of the previous loadBuffer LDG.
+                if ($shared_row == 11 && ($k + 512) < $rowsize)
+                {
+                    $out .= "--:-:-:-:1      IADD loadIndex, loadIndex, 256;\n";
+                    $out .= "20:-:-:-:1      IADD dnextAddr0.CC, dnextAddr0, ldd;\n";
+                }
+
+                if ($shared_row == 12 && ($k + 512) < $rowsize)
+                {
+                    $out .= "--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;\n";
+                    $out .= "--:-:-:-:1      IADD.X dnextAddr1,    dnextAddr1, RZ;\n";
+                }
+
+                if ($shared_row == 13)
+                {
+                    # Store the previously loaded chunk at storeDeltas
+                    # (mask 01 = barrier 1, set by that load).
+                    $out .= "01:-:-:-:1      STS.U.128 [storeDeltas], loadBuffer;\n";
+
+                    if(($k + 512) < $rowsize)
+                    {
+                        # Another tile will be needed: start its load.
+                        $out .= "--:6:1:-:1  \@P1 LDG.E.CI.128 loadBuffer, [dnextAddr];\n";
+                        $out .= "--:-:1:-:1 \@!P1 LDS.U.128    loadBuffer, [addr_zero];\n\n";
+                    }
+                    else
+                    {
+                        # No more tiles to fetch: reuse the address registers
+                        # and loadBuffer for a P0-guarded load from param_d at
+                        # offset (rowOffset + warpTid) * param_ldd + timeStep.
+                        # This load sets barrier 6 in the third control field.
+                        $out .= "--:-:-:-:6      IADD     dOffset,        rowOffset, warpTid;\n";
+                        $out .= "--:-:-:-:6      XMAD     dOffset,        dOffset,   param_ldd,  timeStep;\n";
+                        $out .= "--:-:-:-:6      LEA      dnextAddr0.CC,  dOffset,   param_d[0],      4;\n";
+                        $out .= "--:-:-:-:2      LEA.HI.X dnextAddr1,     dOffset,   param_d[1], RZ, 4;\n";
+                        $out .= "--:-:6:-:1 \@P0 LDG.E.CI.128 loadBuffer, [dnextAddr];\n\n";
+                    }
+                }
+
+                # Rows 14 and 15 of a tile that has a successor: toggle bit 12
+                # of loadDeltas, sync the block, and start reading the first
+                # two rows at the new address.
+                if ($shared_row == 14 && ($k + 256) < $rowsize)
+                {
+                    $out .= "10:-:-:-:1      LOP.XOR loadDeltas, loadDeltas, 4096;\n";
+                    $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+                    $out .= sprintf "--:-:%d:-:1      LDS.U.128 delta%dr, [loadDeltas];\n", $set_flag, $write_buffer;
+                }
+
+                if ($shared_row == 15 && ($k + 256) < $rowsize)
+                {
+                    $out .= sprintf "--:-:%d:-:1      LDS.U.128 delta%dr, [loadDeltas + 4x<4*16>];\n\n", $set_flag, $write_buffer;
+                }
+
+                # The 12 FFMAs of this step: weight row $row uses register
+                # weight[$row * 72 + $weight_index] against the four columns of
+                # the current read buffer.
+                foreach my $row (0 .. 2)
+                {
+                    my $weight = ($row * 72) + $weight_index;
+
+                    foreach my $col (0 .. 3)
+                    {
+                        my $accum = ($row * 4) + $col;
+                        my $wait = "--";
+                        my $stall = 1;
+                        # Only the first FFMA of a step waits: on the barrier
+                        # of the buffer it reads, plus barrier 6 (0x20) on the
+                        # very first step.
+                        if ($accum == 0)
+                        {
+                            if ($weight_index == 0)
+                            {
+                                $wait = sprintf "%02x", (0x20 | (1 << ($wait_flag - 1)));
+                            }
+                            else
+                            {
+                                $wait = sprintf "%02x", (1 << ($wait_flag - 1));
+                            }
+                        }
+
+                        # The last FFMA of a step gets stall 0 exactly when the
+                        # next step begins with an LDS (the prefetch condition
+                        # shifted by one row, or the row-15 load). Presumably
+                        # this lets the two issue together.
+                        if ($row == 2 && $col == 3)
+                        {
+                            if ($shared_row < 13 && ($k + (16 * ($shared_row + 3))) < $rowsize)
+                            {
+                                $stall = 0;
+                            }
+                            elsif ($shared_row == 14 && ($k + 256) < $rowsize)
+                            {
+                                $stall = 0;
+                            }
+                        }
+
+                        $out .= sprintf "%s:-:-:-:%d      FFMA accum%02d, weight%03d, delta%dr%d, accum%02d;\n", $wait, $stall, $accum, $weight, $read_buffer, $col, $accum;
+                    }
+                }
+
+                $weight_index++;
+            }
+
+            # Rotate barriers (wrap 5 -> 2) and buffers (wrap 3 -> 0).
+            $wait_flag += 1;
+            $set_flag += 1;
+            $read_buffer += 1;
+            $write_buffer += 1;
+            if($wait_flag == 5)
+            {
+                $wait_flag = 2;
+            }
+            if($set_flag == 5)
+            {
+                $set_flag = 2;
+            }
+            if($read_buffer == 3)
+            {
+                $read_buffer = 0;
+            }
+            if($write_buffer == 3)
+            {
+                $write_buffer = 0;
+            }
+        }
+    }
+
+    return $out;
+</CODE>
+
+//Load hidden states
+//hOffset = (rowOffset + warpTid) * param_ldh + timeStep; hAddr = param_h + hOffset * 16
+--:-:-:-:6      IADD     hOffset,    rowOffset, warpTid;
+--:-:-:-:6      XMAD     hOffset,    hOffset,   param_ldh,  timeStep;
+--:-:-:-:6      LEA      hAddr0.CC,  hOffset,   param_h[0],      4;
+--:-:-:-:2      LEA.HI.X hAddr1,     hOffset,   param_h[1], RZ, 4;
+//128-bit load into h for threads with P0 set; sets barrier 5, which the later h0..h3 compares wait on
+--:-:5:-:1 @P0  LDG.E.CI.128 h, [hAddr];
+
+//Reduction between threads
+//Butterfly sum in four rounds (lane distances 1, 2, 4, 8). In each round every accumulator is
+//exchanged with the peer lane by SHFL.BFLY and the received value is added with FADD.
+//The shuffles for column N set barrier N+1 on the last of their three rows; the matching FADDs
+//wait on it (masks 01, 02, 04, 08). Each round's shuffles for columns 0-1 are issued before
+//the previous round's adds for columns 2-3.
+//Round 1 (distance 1): shuffles for all four columns
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 1, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 1, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 1, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 1, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 1, 0x1f;
+
+//Round 1 adds, columns 0-1
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+//Round 2 (distance 2): shuffles for columns 0-1
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 2, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 2, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 2, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 2, 0x1f;
+
+//Round 1 adds, columns 2-3
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+//Round 2 (distance 2): shuffles for columns 2-3
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 2, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 2, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 2, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 2, 0x1f;
+
+//Round 2 adds, columns 0-1
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+//Round 3 (distance 4): shuffles for columns 0-1
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 4, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 4, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 4, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 4, 0x1f;
+
+//Round 2 adds, columns 2-3
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+//Round 3 (distance 4): shuffles for columns 2-3
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 4, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 4, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 4, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 4, 0x1f;
+
+//Round 3 adds, columns 0-1
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+//Round 4 (distance 8): shuffles for columns 0-1
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 8, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 8, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 8, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 8, 0x1f;
+
+//Round 3 adds, columns 2-3
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+//Round 4 (distance 8): shuffles for columns 2-3
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 8, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 8, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 8, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 8, 0x1f;
+
+//Round 4 adds, all four columns
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+//Fetch the clip threshold used by the activation test further down
+--:-:-:-:5      MOV reluclip, param_reluclip;
+
+//Compute store pointer
+<SCHEDULE_BLOCK>
+//hRow = rowOffset + warpTid; storeIndex = hRow * param_ldd + timeStep
+//dAddr = param_d + storeIndex * 16; lockAddr = param_lockAddr + timeStep * 4
+--:-:-:-:1      IADD     hRow,       rowOffset,  warpTid;
+--:-:-:-:1      XMAD     storeIndex, hRow,       param_ldd,  timeStep;
+--:-:-:-:1      LEA      dAddr0.CC,  storeIndex, param_d[0],      4;
+--:-:-:-:1      LEA.HI.X dAddr1,     storeIndex, param_d[1], RZ, 4;
+--:-:-:-:1      LEA      lockAddr0,  timeStep,   param_lockAddr[0], 2;
+--:-:-:-:1      LEA.HI.X lockAddr1,  timeStep,   param_lockAddr[1], RZ, 2;
+
+//Conditional select for output
+//Each thread adds at most one accumulator row into output0..3: row 0 under P2, row 1 under P3,
+//row 2 under P4. Mask 20 = barrier 6, which the final P0-guarded load in the GEMM loop sets.
+//NOTE(review): output presumably aliases the registers that load filled - confirm in the register map.
+//TODO: make sure scheduler orders these such that first one waits on barrier
+20:-:-:-:1  @P2 FADD output0, output0, accum00;
+20:-:-:-:1  @P3 FADD output0, output0, accum04;
+20:-:-:-:1  @P4 FADD output0, output0, accum08;
+
+20:-:-:-:1  @P2 FADD output1, output1, accum01;
+20:-:-:-:1  @P3 FADD output1, output1, accum05;
+20:-:-:-:1  @P4 FADD output1, output1, accum09;
+
+20:-:-:-:1  @P2 FADD output2, output2, accum02;
+20:-:-:-:1  @P3 FADD output2, output2, accum06;
+20:-:-:-:1  @P4 FADD output2, output2, accum10;
+
+20:-:-:-:1  @P2 FADD output3, output3, accum03;
+20:-:-:-:1  @P3 FADD output3, output3, accum07;
+20:-:-:-:3  @P4 FADD output3, output3, accum11;
+</SCHEDULE_BLOCK>
+
+//Save select predicates
+//The block below overwrites P2..P4 and the spin loop later overwrites P2; R2P with the same
+//mask (0x1e) restores them before the next time step.
+//TODO: how many stall cycles needed here?
+--:-:-:-:6      P2R predSave, PR, RZ, 0x1e;
+
+//Multiply by the derivative of the clipped-ReLU (reluclip) activation function:
+//outputN is kept only where 0 < hN < reluclip and is multiplied by zero otherwise
+//TODO: others
+<SCHEDULE_BLOCK>
+//Mask 10 = barrier 5, set by the load of h
+10:-:-:-:1      FSETP.LT.AND P2, PT, RZ, h0, PT;
+10:-:-:-:1      FSETP.LT.AND P3, PT, RZ, h1, PT;
+10:-:-:-:1      FSETP.LT.AND P4, PT, RZ, h2, PT;
+10:-:-:-:1      FSETP.LT.AND P5, PT, RZ, h3, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, h0, reluclip, P2;
+--:-:-:-:1      FSETP.LT.AND P3, PT, h1, reluclip, P3;
+--:-:-:-:1      FSETP.LT.AND P4, PT, h2, reluclip, P4;
+--:-:-:-:1      FSETP.LT.AND P5, PT, h3, reluclip, P5;
+--:-:-:-:1 @!P2 FMUL output0, output0, RZ;
+--:-:-:-:1 @!P3 FMUL output1, output1, RZ;
+--:-:-:-:1 @!P4 FMUL output2, output2, RZ;
+--:-:-:-:1 @!P5 FMUL output3, output3, RZ;
+
+//Update timestep
+//P1 = (param_reverse == 0). Forward: step +1 and stop at param_seqLength.
+//Reverse: step -1 and stop at -1.
+--:-:-:-:1      ISETP.EQ.AND P1, PT, RZ, param_reverse, PT;
+--:-:-:-:1  @P1 MOV setVal, 1;
+--:-:-:-:1 @!P1 MOV setVal, -1;
+--:-:-:-:1  @P1 MOV expectVal, param_seqLength;
+--:-:-:-:1 @!P1 MOV expectVal, -1;
+--:-:-:-:1      IADD timeStep, timeStep, setVal;
+</SCHEDULE_BLOCK>
+
+//Conditional store
+//Threads with P0 set write output0..3 (128 bits) to dAddr
+--:-:-:-:5  @P0 STG.E.CI.128 [dAddr], output;
+
+//Compute predicate for time unrolling loop
+//P5 = (timeStep != expectVal): true while more time steps remain
+--:-:-:Y:d      ISETP.NE.AND P5, PT, timeStep, expectVal, PT;
+
+//P2 = (tid != 0)
+//setVal = 1
+//expectVal is reused here: it now holds the number of blocks the lock counter must reach
+--:-:-:-:1      ISETP.NE.AND P2, PT, tid, RZ, PT;
+--:-:-:-:1      MOV expectVal, param_numBlks;
+--:-:-:Y:b      MOV setVal, 1;
+
+//Barrier for all blocks
+//Memory barrier, then a block-wide sync, before this block announces its arrival
+--:-:-:-:f      MEMBAR.GL;
+--:-:-:-:5      BAR.SYNC 0;
+
+//Threads with P2 set (tid != 0) leave this region at once; the remaining thread adds setVal (1)
+//to the counter at lockAddr, which was derived from the time step before it was advanced
+--:-:-:-:2      SSY SSY_TARGET1;
+--:-:-:-:d  @P2 SYNC;
+
+--:-:-:Y:2      ATOM.E.ADD RZ, [lockAddr], setVal;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET1:
+//Same split again: only the thread with tid == 0 enters the spin loop
+--:-:-:-:1      SSY SSY_TARGET2;
+--:-:-:-:d  @P2 SYNC;
+
+SPINLOCK:
+//Re-read the counter until it equals expectVal (param_numBlks). P2 is overwritten here.
+--:-:1:Y:2      LDG.E lockVal, [lockAddr];
+01:-:-:Y:d      ISETP.NE.AND P2, PT, lockVal, expectVal, PT;
+--:-:-:-:5  @P2 BRA.U SPINLOCK;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET2:
+//The other threads of the block wait here for the spinning thread
+--:-:-:-:5      BAR.SYNC 0;
+
+//Restore select predicates
+--:-:-:-:1      R2P PR, predSave, 0x1e;
+
+//Conditional branch back to beginning of loop
+--:-:-:Y:5  @P5 BRA.U UNROLLING_LOOP;
+
+--:-:-:-:5      EXIT;
diff --git a/Kernel/Convolution/Pascal/persistent_rnn_fprop.sass b/Kernel/Convolution/Pascal/persistent_rnn_fprop.sass
new file mode 100644
index 0000000..6a11539
--- /dev/null
+++ b/Kernel/Convolution/Pascal/persistent_rnn_fprop.sass
@@ -0,0 +1,653 @@
+# Kernel: presistent_birnn
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+//Symbolic names for constant-bank slots and one shared-memory address.
+//addr_zero is the shared-memory location the kernel fills with zeros and later loads from
+//wherever a guarded load is out of range. The param_* names start at c[0x0][0x140]; each
+//pointer parameter takes two 4-byte slots ([0] and [1]) and each scalar takes one.
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(64*48)>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_h[0]        : c[0x0][0x140]
+    param_h[1]        : c[0x0][0x144]
+    param_hprev[0]    : c[0x0][0x148]
+    param_hprev[1]    : c[0x0][0x14c]
+    param_bias[0]     : c[0x0][0x150]
+    param_bias[1]     : c[0x0][0x154]
+    param_w[0]        : c[0x0][0x158]
+    param_w[1]        : c[0x0][0x15c]
+    param_lockAddr[0] : c[0x0][0x160]
+    param_lockAddr[1] : c[0x0][0x164]
+    param_ldh         : c[0x0][0x168]
+    param_ldw         : c[0x0][0x16c]
+    param_bsz         : c[0x0][0x170]
+    param_seqLength   : c[0x0][0x174]
+    param_numBlks     : c[0x0][0x178]
+    param_rowSize     : c[0x0][0x17c]
+    param_reverse     : c[0x0][0x180]
+    param_reluclip    : c[0x0][0x184]
+</CONSTANT_MAPPING>
+
+//Symbolic register names. Several ranges overlap: 233 is bid, hOffset and ldh, and 240-251
+//is used by the hidden, peer and output groups. Names that share a number refer to the same
+//register, so they are presumably never needed at the same time.
+//NOTE(review): lines use either ':' or '~' as the separator; the difference is defined by the
+//assembler and cannot be determined from this file.
+<REGISTER_MAPPING>
+
+      0-215 : weight<000-215>
+    216-227 : accum<00-11>
+    228-229 : timeStep, biasValue
+    230-232 : warpTid, rowOffset, tid
+
+    233     : bid
+
+    236-243 : wAddr0r<0-1>, wAddr1r<0-1>, wAddr2r<0-1>, biasAddr<0-1>
+    244-254 ~ ldw, wRow, warpTid4, loadRow, warpIndex, storeWeights, loadWeights, rowSize
+
+    233     : hOffset
+    233     : ldh
+    234-239 : hprevAddr<0-1>, loadBuffer<0-3>
+    240-251 : hidden0r<0-3>, hidden1r<0-3>, hidden2r<0-3>
+    252-254 ~ loadHiddens, storeHiddens, loadIndex
+
+    240-251 : peerR0V<0-3>, peerR1V<0-3>, peerR2V<0-3>
+
+    240-249 : output<0-3>, hAddr<0-1>, lockAddr<0-1>, expectVal, setVal
+    250-254 ~ storeIndex, hRow, predSave, lockVal, reluclip
+
+</REGISTER_MAPPING>
+
+//Get tid/block id
+//tid sets barrier 1 and bid sets barrier 2; their first users below wait with masks 01 and 02
+--:-:1:-:1      S2R tid, SR_TID.X;
+--:-:2:-:1      S2R bid, SR_CTAID.X;
+
+//Store zeros at addr_zero
+//Later loads from [addr_zero] use this slot as a source of zeros
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+
+<SCHEDULE_BLOCK>
+//Copy the weight leading dimension and the row count into registers
+--:-:-:-:1      MOV     ldw,       param_ldw;
+--:-:-:-:1      MOV     rowSize,   param_rowSize;
+
+//timeStep = (param_reverse == 0) ? 0 : param_seqLength - 1
+--:-:-:-:1      ISETP.EQ.AND P2, PT, RZ, param_reverse, PT;
+--:-:-:-:1      SEL timeStep, RZ, param_seqLength, P2;
+--:-:-:-:1 @!P2 IADD timeStep, timeStep, -1;
+
+//warpIndex = threadIdx.x >> 5 (mask 01 waits for the S2R that reads tid)
+01:-:-:-:1      SHR.U32 warpIndex, tid, 5;
+
+//warpTid = threadIdx.x & 0x1f
+01:-:-:-:1      LOP.AND warpTid,   tid, 0x1f;
+
+//rowOffset = ((blockIdx.x << 3) + warp_index) * 6 (mask 02 waits for the S2R that reads bid)
+02:-:-:-:1      SHL     rowOffset, bid,       3;
+--:-:-:-:1      IADD    rowOffset, rowOffset, warpIndex;
+--:-:-:-:1      XMAD    rowOffset, rowOffset, 6, RZ;
+
+//if(warp_tid > 15) rowOffset += 3
+--:-:-:-:1      ISETP.GT.AND P1, PT, warpTid, 15, PT;
+--:-:-:-:1  @P1 IADD     rowOffset, rowOffset, 3;
+
+//warpTid = warpTid & 0x0f
+//P0 = (warpTid < 3)
+--:-:-:-:1      LOP.AND  warpTid, warpTid, 0x0f;
+--:-:-:-:1      ISETP.LT.AND P0, PT, warpTid, 3, PT;
+
+//warpTid4 = warpTid << 2
+--:-:-:-:1      SHL      warpTid4, warpTid, 2;
+
+//loadWeights  = (warpIndex * 6 + (P1 ? 3 : 0)) << 6
+//storeWeights = (loadWeights + warpTid4) << 2
+//loadWeights  = (loadWeights + warpTid) << 2
+--:-:-:-:1  @P1 MOV      loadWeights, 3;
+--:-:-:-:1 @!P1 MOV      loadWeights, RZ;
+
+--:-:-:-:1      XMAD     loadWeights, warpIndex, 6, loadWeights;
+--:-:-:-:1      SHL      loadWeights, loadWeights, 6;
+
+--:-:-:-:1      IADD     storeWeights, loadWeights, warpTid4;
+--:-:-:-:1      IADD     loadWeights, loadWeights, warpTid;
+--:-:-:-:1      SHL      storeWeights, storeWeights, 2;
+--:-:-:-:1      SHL      loadWeights, loadWeights, 2;
+
+//wRow = rowOffset * ldw + warpTid4
+--:-:-:-:1      XMAD     wRow, rowOffset, ldw, warpTid4;
+
+//wAddr0r = &w[wRow]
+--:-:-:-:1      LEA      wAddr0r0.CC, wRow, param_w[0],     2;
+--:-:-:-:1      LEA.HI.X wAddr0r1,    wRow, param_w[1], RZ, 2;
+
+//ldw = ldw << 2
+--:-:-:-:1      SHL      ldw,  ldw,       2;
+
+//wAddr1r = wAddr0r + ldw
+--:-:-:-:1      IADD     wAddr1r0.CC, wAddr0r0, ldw;
+--:-:-:-:1      IADD.X   wAddr1r1,    wAddr0r1, RZ;
+
+//wAddr2r = wAddr1r + ldw
+--:-:-:-:1      IADD     wAddr2r0.CC, wAddr1r0, ldw;
+--:-:-:-:1      IADD.X   wAddr2r1,    wAddr1r1, RZ;
+
+//Compute row loading predicates
+//P1 = (warpTid4 < rowSize)
+//P3, P4, P5 = P1 && (rowOffset + 0, 1, 2 < rowSize). The +1 and +2 are applied by decrementing
+//rowSize before each later compare, so rowSize leaves this block at param_rowSize - 2.
+--:-:-:-:1      ISETP.LT.AND P1, PT, warpTid4, rowSize, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, rowOffset, rowSize, P1;
+--:-:-:-:1      IADD     rowSize, rowSize, -1;
+--:-:-:-:1      ISETP.LT.AND P4, PT, rowOffset, rowSize, P1;
+--:-:-:-:1      IADD     rowSize, rowSize, -1;
+--:-:-:-:1      ISETP.LT.AND P5, PT, rowOffset, rowSize, P1;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:c      NOP;
+
+//Load weights to registers
+<CODE>
+    # Generates the code that fills the 216 weight registers: three matrix
+    # rows of $rowsize columns, 64 columns per pass. Each pass loads one
+    # 128-bit vector per row from the weight matrix, stages the three vectors
+    # in shared memory, and reads them back as twelve scalars.
+    # Returns the generated text.
+    my $out;
+    my $regId = 0;
+    # Hard-coded column count: 1152 / 64 = 18 passes.
+    my $rowsize = 1152;
+
+    for (my $col=0; $col < $rowsize; $col += 64)
+    {
+        # Advance this thread's column index so the predicate update below
+        # describes the next pass.
+        $out .= "--:-:-:-:1      IADD warpTid4, warpTid4, 64;\n";
+
+        #Use vector loads from weight matrix
+        # Rows 0, 1 and 2 use register bases 0, 72 and 144, predicates P3, P4
+        # and P5, and barriers 1, 2 and 3. A false predicate loads zeros from
+        # addr_zero instead.
+        $regId = $col / 16;
+        $out .= sprintf "--:-:1:-:1  \@P3 LDG.E.128 weight%03d, [wAddr0r + 4x<%d>];\n", $regId, $col;
+        $out .= sprintf "--:-:1:-:1 \@!P3 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+        $regId = $col / 16 + 72;
+        $out .= sprintf "--:-:2:-:1  \@P4 LDG.E.128 weight%03d, [wAddr1r + 4x<%d>];\n", $regId, $col;
+        $out .= sprintf "--:-:2:-:1 \@!P4 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+        $regId = $col / 16 + 144;
+        $out .= sprintf "--:-:3:-:1  \@P5 LDG.E.128 weight%03d, [wAddr2r + 4x<%d>];\n", $regId, $col;
+        $out .= sprintf "--:-:3:-:1 \@!P5 LDS.U.128 weight%03d, [addr_zero];\n", $regId;
+
+        # Predicates for the next pass: keep a row enabled only while the
+        # advanced column index is still below rowSize.
+        # NOTE(review): the rowSize register was decremented twice by the
+        # setup block ("Compute row loading predicates"), so this compares
+        # against param_rowSize - 2 - confirm that is intended.
+        $out .= "--:-:-:-:1      ISETP.LT.AND P3, PT, warpTid4, rowSize, P3;\n";
+        $out .= "--:-:-:-:1      ISETP.LT.AND P4, PT, warpTid4, rowSize, P4;\n";
+        $out .= "--:-:-:-:1      ISETP.LT.AND P5, PT, warpTid4, rowSize, P5;\n";
+
+        #Store weights into shared memory
+        # From the second pass on, sync first so the previous pass's
+        # read-back has finished before the staging area is overwritten.
+        if ($col > 0)
+        {
+            $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+        }
+
+        # Each store waits on the barrier of the matching load (masks 01, 02,
+        # 04). The three rows are stored 64 words apart.
+        $regId = $col / 16;
+        $out .= sprintf "01:-:-:-:1      STS.U.128 [storeWeights], weight%03d;\n", $regId;
+        $regId = $col / 16 + 72;
+        $out .= sprintf "02:-:-:-:1      STS.U.128 [storeWeights + 4x<64>], weight%03d;\n", $regId;
+        $regId = $col / 16 + 144;
+        $out .= sprintf "04:-:-:-:1      STS.U.128 [storeWeights + 4x<128>], weight%03d;\n", $regId;
+
+        #Load each weight from shared mem
+        $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+
+        # Read back one scalar per (column group, row): word offset
+        # row * 64 + shared_col * 16 goes into
+        # weight[row * 72 + col / 16 + shared_col].
+        foreach my $shared_col (0 .. 3)
+        {
+            foreach my $row (0 .. 2)
+            {
+                my $control;
+
+                # The very last read-back of the last pass sets read barrier 1
+                # and write barrier 6, which later code waits on.
+                if (($col + 64) >= $rowsize && $row == 2 && $shared_col == 3)
+                {
+                    $control = "--:1:6:-:2";
+                }
+                else
+                {
+                    $control = "--:-:-:-:1";
+                }
+
+                $regId = ($row * 72) + ($col / 16) + $shared_col;
+                my $shared_offset = ($row * 64) + ($shared_col * 16);
+                $out .= sprintf "%s      LDS.U weight%03d, [loadWeights + 4x<%d>];\n", $control, $regId, $shared_offset;
+            }
+        }
+    }
+
+    # Final block-wide sync after the last pass.
+    $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+
+    return $out;
+
+</CODE>
+
+//Conditional load of bias
+<SCHEDULE_BLOCK>
+//loadRow = rowOffset + warpTid (mask 01 = barrier 1, set by the last weight read-back)
+//P0 = P0 && (loadRow < param_rowSize); biasValue = P0 ? bias[loadRow] : 0
+01:-:-:-:1      IADD     loadRow,      rowOffset, warpTid;
+--:-:-:-:1      ISETP.LT.AND P0, PT, loadRow, param_rowSize, P0;
+--:-:-:-:1      LEA      biasAddr0.CC, loadRow,   param_bias[0],     2;
+--:-:-:-:1      LEA.HI.X biasAddr1,    loadRow,   param_bias[1], RZ, 2;
+--:-:-:-:1  @P0 LDG.E    biasValue,    [biasAddr];
+--:-:-:-:1 @!P0 MOV      biasValue,    RZ;
+</SCHEDULE_BLOCK>
+
+//Predicates for store code
+//P2, P3, P4 = (warpTid == 0, 1, 2)
+--:-:-:-:1      ISETP.EQ.AND P2, PT, warpTid, 0, PT;
+--:-:-:-:1      ISETP.EQ.AND P3, PT, warpTid, 1, PT;
+--:-:-:-:1      ISETP.EQ.AND P4, PT, warpTid, 2, PT;
+
+UNROLLING_LOOP:
+<SCHEDULE_BLOCK>
+//Prime inner product loop by loading first rows of hprev
+--:-:-:-:1      MOV loadIndex,    tid;
+
+//storeHiddens = tid << 4
+//loadHiddens = warpTid << 4
+--:-:-:-:1      SHL storeHiddens, tid, 4;
+--:-:-:-:1      SHL loadHiddens, warpTid, 4;
+
+//hOffset = loadIndex * param_ldh + timeStep
+//hprevAddr = param_hprev + hOffset * 16
+--:-:-:-:1      XMAD     hOffset,        loadIndex, param_ldh,      timeStep;
+--:-:-:-:1      LEA      hprevAddr0.CC,  hOffset,   param_hprev[0],     4;
+--:-:-:-:2      LEA.HI.X hprevAddr1,     hOffset,   param_hprev[1], RZ, 4;
+
+//loadBuffer = *hprevAddr
+//P1 = (loadIndex < param_rowSize); when P1 is false, zeros are loaded from addr_zero instead.
+//Either load sets read barrier 5 and write barrier 1.
+--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;
+--:5:1:-:2  @P1 LDG.E.CI.128 loadBuffer, [hprevAddr];
+--:5:1:-:2 @!P1 LDS.U.128    loadBuffer, [addr_zero];
+
+//ldh = param_ldh << 12
+--:-:-:-:1      MOV ldh, param_ldh;
+--:-:-:-:1      SHL ldh, ldh, 12;
+</SCHEDULE_BLOCK>
+
+//Zero the accumulators with three 128-bit loads from addr_zero (into accum00, accum04, accum08)
+<CODE>
+    # Build the loads one at a time; each names the first register of a group of four.
+    my $zero_loads = '';
+    foreach my $group (0 .. 2)
+    {
+        my $first_accum = $group * 4;
+        $zero_loads .= sprintf "--:-:-:-:1      LDS.U.128 accum%02d, [addr_zero];\n", $first_accum;
+    }
+    return $zero_loads;
+</CODE>
+
+//Update load index and load address
+//loadIndex += 256, P1 = (loadIndex < param_rowSize), hprevAddr += ldh. The address is a
+//register pair: the low word sets the carry (.CC) and the high word consumes it (.X).
+--:-:-:-:6      IADD loadIndex, loadIndex, 256;
+--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;
+//Wait mask 10 = barrier 5, which the loadBuffer load above sets in its second control field
+10:-:-:-:6      IADD   hprevAddr0.CC, hprevAddr0, ldh;
+--:-:-:-:6      IADD.X hprevAddr1,    hprevAddr1, RZ;
+
+//Wait mask 01 = barrier 1 (third control field of that load), then store loadBuffer at storeHiddens
+01:-:-:-:1      STS.U.128 [storeHiddens], loadBuffer;
+
+//Unrolled GEMM loop
+<CODE>
+    # Generates the text of the fully unrolled inner-product loop. Each of the
+    # 72 executed steps emits 12 FFMAs (3 weight rows x 4 hidden columns),
+    # interleaved with the loads that feed later steps. Returns the text.
+
+    # Package variable; whatever it holds is emitted ahead of the loop. It is
+    # not assigned in this block.
+    our @top;
+
+    my $out = join '', @top;
+
+    # Hard-coded reduction length: 1152 / 16 = 72 steps, matching the
+    # $weight_index < 72 guard below.
+    my $rowsize = 1152;
+    my $weight_index = 0;
+
+    # Barriers rotate through 2..4 and hidden buffers through 0..2. The buffer
+    # being filled (and the barrier it sets) runs two steps ahead of the buffer
+    # the FFMAs read (and the barrier they wait on).
+    my $wait_flag = 2;
+    my $set_flag = 4;
+    my $read_buffer = 0;
+    my $write_buffer = 2;
+
+    # One outer iteration per tile of 256 (16 steps of 16).
+    for (my $k=0; $k < $rowsize; $k+=256)
+    {
+        if ($k == 0)
+        {
+            # Prologue: start the next loadBuffer load (zeros when P1 is
+            # false), sync the block, then prime hidden buffers 0 and 1, which
+            # set barriers 2 and 3.
+            $out .= "--:6:1:-:1  \@P1 LDG.E.CI.128 loadBuffer, [hprevAddr];\n";
+            $out .= "--:-:1:-:1 \@!P1 LDS.U.128    loadBuffer, [addr_zero];\n\n";
+            $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+            $out .= "--:-:2:-:1      LDS.U.128 hidden0r, [loadHiddens];\n";
+            $out .= "--:-:3:-:1      LDS.U.128 hidden1r, [loadHiddens + 4x<4*16>];\n\n";
+        }
+        # Toggle bit 12 of the store address (presumably selecting the other
+        # half of a two-part staging area).
+        $out .= "--:-:-:-:1      LOP.XOR storeHiddens, storeHiddens, 4096;\n";
+
+        foreach my $shared_row (0 .. 15)
+        {
+            # Only the first 72 steps emit code; the flag rotation at the
+            # bottom still runs for the skipped ones.
+            if($weight_index < 72)
+            {
+                # Prefetch the hidden row two steps ahead into the write
+                # buffer, as long as that row exists.
+                if ($shared_row < 14 && ($k + (16 * ($shared_row + 2))) < $rowsize)
+                {
+                    my $read_bar = "-";
+                    # Last prefetch of a tile that has a successor: also set
+                    # read barrier 5. Row 14 waits on it (mask 10) before it
+                    # changes loadHiddens.
+                    if ($shared_row == 13 && ($k + 256) < $rowsize)
+                    {
+                        $read_bar = "5";
+                    }
+                    $out .= sprintf "--:%s:%d:-:1      LDS.U.128 hidden%dr, [loadHiddens + 4x<4*%d>];\n", $read_bar, $set_flag, $write_buffer, (16 * ($shared_row + 2));
+                }
+
+                # Rows 11 and 12 advance loadIndex, hprevAddr and P1 for the
+                # load issued on row 13. Mask 20 = barrier 6, set in the second
+                # control field of the previous loadBuffer LDG.
+                # NOTE(review): unlike the row-13 load, these two rows are not
+                # guarded by ($k + 512) < $rowsize, so on the last full tile
+                # the address is advanced and then recomputed on row 13 -
+                # confirm this is intentional.
+                if ($shared_row == 11)
+                {
+                    $out .= "--:-:-:-:1      IADD loadIndex, loadIndex, 256;\n";
+                    $out .= "20:-:-:-:1      IADD hprevAddr0.CC, hprevAddr0, ldh;\n";
+                }
+
+                if ($shared_row == 12)
+                {
+                    $out .= "--:-:-:-:1      ISETP.LT.AND P1, PT, loadIndex, param_rowSize, PT;\n";
+                    $out .= "--:-:-:-:1      IADD.X hprevAddr1,    hprevAddr1, RZ;\n";
+                }
+
+                if ($shared_row == 13)
+                {
+                    # Store the previously loaded chunk at storeHiddens
+                    # (mask 01 = barrier 1, set by that load).
+                    $out .= "01:-:-:-:1      STS.U.128 [storeHiddens], loadBuffer;\n";
+
+                    if (($k + 512) < $rowsize)
+                    {
+                        # Another tile will be needed: start its load.
+                        $out .= "--:6:1:-:1  \@P1 LDG.E.CI.128 loadBuffer, [hprevAddr];\n";
+                        $out .= "--:-:1:-:1 \@!P1 LDS.U.128    loadBuffer, [addr_zero];\n\n";
+                    }
+                    else
+                    {
+                        # No more tiles to fetch: reuse the address registers
+                        # and loadBuffer for a P0-guarded load from param_h at
+                        # offset (rowOffset + warpTid) * param_ldh + timeStep.
+                        # This load sets barrier 6 in the third control field.
+                        $out .= "--:-:-:-:6      IADD     hOffset,        rowOffset, warpTid;\n";
+                        $out .= "--:-:-:-:6      XMAD     hOffset,        hOffset,   param_ldh,  timeStep;\n";
+                        $out .= "--:-:-:-:6      LEA      hprevAddr0.CC,  hOffset,   param_h[0],      4;\n";
+                        $out .= "--:-:-:-:2      LEA.HI.X hprevAddr1,     hOffset,   param_h[1], RZ, 4;\n";
+                        $out .= "--:-:6:-:1 \@P0 LDG.E.CI.128 loadBuffer, [hprevAddr];\n\n";
+                    }
+                }
+
+                # Rows 14 and 15: toggle bit 12 of loadHiddens, sync the block,
+                # and start reading the first two rows at the new address.
+                # With $rowsize = 1152 these rows only execute for tiles that
+                # have a successor.
+                if ($shared_row == 14)
+                {
+                    $out .= "10:-:-:-:1      LOP.XOR loadHiddens, loadHiddens, 4096;\n";
+                    $out .= "--:-:-:-:5      BAR.SYNC 0;\n\n";
+                    $out .= sprintf "--:-:%d:-:1      LDS.U.128 hidden%dr, [loadHiddens];\n", $set_flag, $write_buffer;
+                }
+
+                if ($shared_row == 15)
+                {
+                    $out .= sprintf "--:-:%d:-:1      LDS.U.128 hidden%dr, [loadHiddens + 4x<4*16>];\n\n", $set_flag, $write_buffer;
+                }
+
+                # The 12 FFMAs of this step: weight row $row uses register
+                # weight[$row * 72 + $weight_index] against the four columns of
+                # the current read buffer.
+                foreach my $row (0 .. 2)
+                {
+                    my $weight = ($row * 72) + $weight_index;
+
+                    foreach my $col (0 .. 3)
+                    {
+                        my $accum = ($row * 4) + $col;
+                        my $wait = "--";
+                        my $stall = 1;
+                        # Only the first FFMA of a step waits: on the barrier
+                        # of the buffer it reads, plus barrier 6 (0x20) on the
+                        # very first step.
+                        if ($accum == 0)
+                        {
+                            if ($weight_index == 0)
+                            {
+                                $wait = sprintf "%02x", (0x20 | (1 << ($wait_flag - 1)));
+                            }
+                            else
+                            {
+                                $wait = sprintf "%02x", (1 << ($wait_flag - 1));
+                            }
+                        }
+
+                        # The last FFMA of a step gets stall 0 exactly when the
+                        # next step begins with an LDS (the prefetch condition
+                        # shifted by one row, or the row-15 load). Presumably
+                        # this lets the two issue together.
+                        if ($row == 2 && $col == 3)
+                        {
+                            if ($shared_row < 13 && ($k + (16 * ($shared_row + 3))) < $rowsize)
+                            {
+                                $stall = 0;
+                            }
+                            elsif ($shared_row == 14 && ($k + 256) < $rowsize)
+                            {
+                                $stall = 0;
+                            }
+                        }
+
+                        $out .= sprintf "%s:-:-:-:%d      FFMA accum%02d, weight%03d, hidden%dr%d, accum%02d;\n", $wait, $stall, $accum, $weight, $read_buffer, $col, $accum;
+                    }
+                }
+
+                $weight_index++;
+            }
+
+            # Rotate barriers (wrap 5 -> 2) and buffers (wrap 3 -> 0).
+            $wait_flag += 1;
+            $set_flag += 1;
+            $read_buffer += 1;
+            $write_buffer += 1;
+            if($wait_flag == 5)
+            {
+                $wait_flag = 2;
+            }
+            if($set_flag == 5)
+            {
+                $set_flag = 2;
+            }
+            if($read_buffer == 3)
+            {
+                $read_buffer = 0;
+            }
+            if($write_buffer == 3)
+            {
+                $write_buffer = 0;
+            }
+        }
+    }
+
+    return $out;
+</CODE>
+
+//Reduction between threads
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 1, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 1, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 1, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 1, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 1, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 1, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 2, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 2, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 2, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 2, 0x1f;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 2, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 2, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 2, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 2, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 4, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 4, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 4, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 4, 0x1f;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 4, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 4, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 4, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 4, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V0, accum00, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V0, accum04, 8, 0x1f;
+--:-:1:-:1      SHFL.BFLY PT, peerR2V0, accum08, 8, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V1, accum01, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V1, accum05, 8, 0x1f;
+--:-:2:-:1      SHFL.BFLY PT, peerR2V1, accum09, 8, 0x1f;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V2, accum02, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V2, accum06, 8, 0x1f;
+--:-:3:-:1      SHFL.BFLY PT, peerR2V2, accum10, 8, 0x1f;
+
+--:-:-:-:1      SHFL.BFLY PT, peerR0V3, accum03, 8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, peerR1V3, accum07, 8, 0x1f;
+--:-:4:-:1      SHFL.BFLY PT, peerR2V3, accum11, 8, 0x1f;
+
+01:-:-:-:1      FADD accum00, accum00, peerR0V0;
+--:-:-:-:1      FADD accum04, accum04, peerR1V0;
+--:-:-:-:1      FADD accum08, accum08, peerR2V0;
+
+02:-:-:-:1      FADD accum01, accum01, peerR0V1;
+--:-:-:-:1      FADD accum05, accum05, peerR1V1;
+--:-:-:-:1      FADD accum09, accum09, peerR2V1;
+
+04:-:-:-:1      FADD accum02, accum02, peerR0V2;
+--:-:-:-:1      FADD accum06, accum06, peerR1V2;
+--:-:-:-:1      FADD accum10, accum10, peerR2V2;
+
+08:-:-:-:1      FADD accum03, accum03, peerR0V3;
+--:-:-:-:1      FADD accum07, accum07, peerR1V3;
+--:-:-:-:1      FADD accum11, accum11, peerR2V3;
+
+//Compute store pointer: hAddr = param_h + 16*(hRow*param_ldh + timeStep), lockAddr = param_lockAddr + 4*timeStep
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD     hRow,       rowOffset,  warpTid;
+--:-:-:-:1      XMAD     storeIndex, hRow,       param_ldh, timeStep;
+--:-:-:-:1      LEA      hAddr0.CC,  storeIndex, param_h[0],      4;
+--:-:-:-:1      LEA.HI.X hAddr1,     storeIndex, param_h[1], RZ, 4;
+--:-:-:-:1      LEA      lockAddr0.CC, timeStep, param_lockAddr[0], 2;
+--:-:-:-:1      LEA.HI.X lockAddr1,  timeStep,   param_lockAddr[1], RZ, 2;
+//Each low-word LEA writes .CC so the LEA.HI.X that follows adds the carry of its own pointer
+//Conditional select for output: P2/P3/P4 choose accumulator group 00-03, 04-07 or 08-11
+--:-:-:-:1  @P2 MOV output0, accum00;
+--:-:-:-:1  @P3 MOV output0, accum04;
+--:-:-:-:1  @P4 MOV output0, accum08;
+
+--:-:-:-:1  @P2 MOV output1, accum01;
+--:-:-:-:1  @P3 MOV output1, accum05;
+--:-:-:-:1  @P4 MOV output1, accum09;
+
+--:-:-:-:1  @P2 MOV output2, accum02;
+--:-:-:-:1  @P3 MOV output2, accum06;
+--:-:-:-:1  @P4 MOV output2, accum10;
+
+--:-:-:-:1  @P2 MOV output3, accum03;
+--:-:-:-:1  @P3 MOV output3, accum07;
+--:-:-:-:3  @P4 MOV output3, accum11;
+
+//Update timestep: P5 = (param_reverse == 0); forward steps +1 towards param_seqLength, reverse steps -1 towards -1
+--:-:-:-:1      ISETP.EQ.AND P5, PT, RZ, param_reverse, PT;
+--:-:-:-:1  @P5 MOV setVal, 1;
+--:-:-:-:1 @!P5 MOV setVal, -1;
+--:-:-:-:1  @P5 MOV expectVal, param_seqLength;
+--:-:-:-:1 @!P5 MOV expectVal, -1;
+--:-:-:-:1      IADD timeStep, timeStep, setVal;
+</SCHEDULE_BLOCK>
+
+//Save select predicates: mask 0x0c covers predicate bits 2 and 3 (P2, P3); P2 is overwritten by the barrier code below
+--:-:-:-:1      P2R predSave, PR, RZ, 0x0c;
+//Fetch the clipping bound used by the activation below
+--:-:-:-:1      MOV reluclip, param_reluclip;
+
+//Add bias for output
+--:-:-:-:1      FADD output0, output0, biasValue;
+--:-:-:-:1      FADD output1, output1, biasValue;
+--:-:-:-:1      FADD output2, output2, biasValue;
+--:-:-:-:3      FADD output3, output3, biasValue;
+
+//Accumulate on top of current data (loadBuffer0-3 are filled outside this excerpt; wait mask 20 guards their arrival)
+20:-:-:-:1      FADD output0, output0, loadBuffer0;
+--:-:-:-:1      FADD output1, output1, loadBuffer1;
+--:-:-:-:1      FADD output2, output2, loadBuffer2;
+--:-:-:-:3      FADD output3, output3, loadBuffer3;
+
+//Activation function: first pass compares against RZ (presumably max(output, 0) since the predicate is !PT -- confirm)
+//TODO: add others
+--:-:-:-:2  FMNMX output0, output0, RZ, !PT;
+--:-:-:-:2  FMNMX output1, output1, RZ, !PT;
+--:-:-:-:2  FMNMX output2, output2, RZ, !PT;
+--:-:-:-:2  FMNMX output3, output3, RZ, !PT;
+//Second pass compares against reluclip (presumably min(output, reluclip) since the predicate is PT -- confirm)
+--:-:-:-:2  FMNMX output0, output0, reluclip, PT;
+--:-:-:-:2  FMNMX output1, output1, reluclip, PT;
+--:-:-:-:2  FMNMX output2, output2, reluclip, PT;
+--:-:-:-:2  FMNMX output3, output3, reluclip, PT;
+
+//Conditional store: threads with P0 set write the 128-bit output vector to hAddr (P0 is computed outside this excerpt)
+--:-:-:-:1  @P0 STG.E.CI.128 [hAddr], output;
+
+//Compute predicate for time unrolling loop (must happen before expectVal is reused for the barrier below)
+--:-:-:Y:d      ISETP.NE.AND P5, PT, timeStep, expectVal, PT;
+
+//P2 = (tid != 0): every thread except thread 0 skips the atomic and the spin loop
+//setVal = 1, expectVal = param_numBlks: the lock value thread 0 waits for
+--:-:-:-:1      ISETP.NE.AND P2, PT, tid, RZ, PT;
+--:-:-:-:1      MOV expectVal, param_numBlks;
+--:-:-:Y:b      MOV setVal, 1;
+
+//Barrier for all blocks: thread 0 adds setVal to [lockAddr], then polls it until it reads expectVal
+--:-:-:-:f      MEMBAR.GL;
+--:-:-:-:5      BAR.SYNC 0;
+//Threads with P2 set SYNC straight to SSY_TARGET1; thread 0 falls through to the atomic
+--:-:-:-:2      SSY SSY_TARGET1;
+--:-:-:-:d  @P2 SYNC;
+
+--:-:-:Y:2      ATOM.E.ADD RZ, [lockAddr], setVal;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET1:
+--:-:-:-:1      SSY SSY_TARGET2;
+--:-:-:-:d  @P2 SYNC;
+//Only thread 0 reaches the spin loop; P2 is reused as its condition (lockVal != expectVal)
+SPINLOCK:
+--:-:1:Y:2      LDG.E lockVal, [lockAddr];
+01:-:-:Y:d      ISETP.NE.AND P2, PT, lockVal, expectVal, PT;
+--:-:-:-:5  @P2 BRA.U SPINLOCK;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET2:
+--:-:-:-:5      BAR.SYNC 0;
+
+//Restore select predicates saved in predSave (same 0x0c mask as the P2R above)
+--:-:-:-:1      R2P PR, predSave, 0x0c;
+
+//Conditional branch back to beginning of loop while timeStep != expectVal (P5 from above)
+--:-:-:Y:5  @P5 BRA.U UNROLLING_LOOP;
+
+--:-:-:-:5      EXIT;
diff --git a/Kernel/Convolution/Pascal/sconv_bprop_C1_N64.sass b/Kernel/Convolution/Pascal/sconv_bprop_C1_N64.sass
new file mode 100644
index 0000000..070db8c
--- /dev/null
+++ b/Kernel/Convolution/Pascal/sconv_bprop_C1_N64.sass
@@ -0,0 +1,600 @@
+# Kernel: sconv_bprop_C32_N64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_lut : 4x<64*4>
+
+    param_I[0]         : c[0x0][0x140]
+    param_I[1]         : c[0x0][0x144]
+    param_E[0]         : c[0x0][0x148]
+    param_E[1]         : c[0x0][0x14c]
+    param_F[0]         : c[0x0][0x150]
+    param_F[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_N            : c[0x0][0x15c]
+    param_K            : c[0x0][0x160]
+    param_D            : c[0x0][0x164]
+    param_H            : c[0x0][0x168]
+    param_W            : c[0x0][0x16c]
+    param_WN           : c[0x0][0x170]
+    param_HWN          : c[0x0][0x174]
+    param_DHWN         : c[0x0][0x178]
+    param_C            : c[0x0][0x17c]
+    param_CRST         : c[0x0][0x180]
+    param_RST          : c[0x0][0x184]
+    param_magic_RST    : c[0x0][0x188]
+    param_shift_RST    : c[0x0][0x18c]
+    param_RS           : c[0x0][0x190]
+    param_magic_RS     : c[0x0][0x194]
+    param_shift_RS     : c[0x0][0x198]
+    param_S            : c[0x0][0x19c]
+    param_magic_S      : c[0x0][0x1a0]
+    param_shift_S      : c[0x0][0x1a4]
+    param_pad_d        : c[0x0][0x1a8]
+    param_pad_h        : c[0x0][0x1ac]
+    param_pad_w        : c[0x0][0x1b0]
+    param_str_d        : c[0x0][0x1b4]
+    param_str_h        : c[0x0][0x1b8]
+    param_str_w        : c[0x0][0x1bc]
+    param_Q            : c[0x0][0x1c0]
+    param_PQ           : c[0x0][0x1c4]
+    param_QN           : c[0x0][0x1c8]
+    param_PQN          : c[0x0][0x1cc]
+    param_MPQN         : c[0x0][0x1d0]
+    param_magic_Q      : c[0x0][0x1d4]
+    param_shift_Q      : c[0x0][0x1d8]
+    param_magic_PQ     : c[0x0][0x1dc]
+    param_shift_PQ     : c[0x0][0x1e0]
+    param_CRST8        : c[0x0][0x1e4]
+    param_MPQN8        : c[0x0][0x1e8]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+      64-67 ~ tid, blkE, blkF, blkMPQ
+
+     68-119 ~ k<0|4>, tidX, tid1, m, p, q, crst, n, n32, tf<0|4>, te, te<0|4>, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+      64-79 : j0Ex<0-7>, j0Fy<0-7>
+      80-95 : j1Ex<0-7>, j1Fy<0-7>
+
+     96-103 : load0F<0-3>, load4F<0-3>
+    104-119 : load0E<0-7>, load4E<0-7>
+
+    120-123 : track0F<0-1>, track4F<0-1>
+    124-127 : track0E<0-1>, track4E<0-1>
+
+    128-131 ~ writeEs, writeFs, swapBuf, K
+    132-136 ~ readEs, readFs, mt, pr, qs
+
+     68-71  ~ lutStore, sliceI
+     72-132 ~ warp_cnt, rst, rs, t, r, s, x, y, z, x0, xW, y0, yH, z0, zD
+
+     72-89  : c<0-7>, trackI<0-1>, track00I<0-1>, track04I<0-1>, track08I<0-1>, track12I<0-1>
+     90-132 ~ crst<00|04|08|12>, c<00|04|08|12>, lut<00|04|08|12>, chan<00|04|08|12>, img<00|04|08|12>, writeCs, readCs, RST, DHWN1, alpha, nn, tid31
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,    SR_TID.X;
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X;
+--:-:3:-:1      S2R blkF,   SR_CTAID.Y;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
+// Prologue: derive per-thread indices, global tracking pointers, load guards and shared-memory offsets.
+<SCHEDULE_BLOCK>
+// tidX  = (tid & 7) << 2
+// k0    = tid >> 3, k4 = k0 + 4
+01:-:-:-:1      LOP.AND tidX, tid,  7;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 k0,   tid,  3;
+--:-:-:-:1      IADD    k4,   k0,   4;
+
+--:-:-:-:1      MOV K, param_K;
+// Zero the accumulators: store zeros at shared address 0, then read them back as 16 x 128-bit vectors (czero00..czero60)
+--:-:-:-:1      STS.128 [RZ], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [RZ];\n", $_ * 4), 0..15;
+</CODE>
+// Magic-number division constants; a magic value of 1 selects the shift-only path (@!P1 / @!P2) below
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_PQ;
+
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT;
+// NOTE(review): blkMPQ is loaded under barrier 2 above, yet the first XMAD below waits on mask 08 like blkE -- confirm intent
+// m = blkMPQ / PQ
+08:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ % PQ  (computed as blkMPQ - m*PQ)
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = pq / Q
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = pq % Q  (computed as pq - p*Q)
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+
+// mt = m * str_d - pad_d
+// pr = p * str_h - pad_h
+// qs = q * str_w - pad_w
+--:-:-:-:1      XMAD mt, m,   param_str_d, RZ;
+--:-:-:-:1      XMAD pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD qs, q,   param_str_w, RZ;
+--:-:-:-:1      IADD mt, mt, -param_pad_d;
+--:-:-:-:1      IADD pr, pr, -param_pad_h;
+--:-:-:-:1      IADD qs, qs, -param_pad_w;
+
+// crst = blkF*32 + tidX
+// n    = blkE*64 + tidX, n32 = n + 32
+04:-:-:-:1      ISCADD crst, blkF, tidX, 5;
+08:-:-:-:1      ISCADD n,    blkE, tidX, 6;
+--:-:-:-:1      IADD   n32,  n,    32;
+
+// trackF = param_F + (k*CRST + crst) * 4, once for k0 and once for k4
+--:-:-:-:1      XMAD     tf0, k0, param_CRST, crst;
+--:-:-:-:1      XMAD     tf4, k4, param_CRST, crst;
+--:-:-:-:1      LEA      track0F0.CC, tf0, param_F[0],     2;
+--:-:-:-:1      LEA.HI.X track0F1,    tf0, param_F[1], RZ, 2;
+--:-:-:-:1      LEA      track4F0.CC, tf4, param_F[0],     2;
+--:-:-:-:1      LEA.HI.X track4F1,    tf4, param_F[1], RZ, 2;
+
+// trackE = param_E + (k*MPQN + m*PQN + p*QN + q*N + n) * 4, once for k0 and once for k4
+--:-:-:-:1      XMAD      te,  q,  param_N,    n;
+--:-:-:-:1      XMAD.LO2C te,  p,  param_QN,   te;
+--:-:-:-:1      XMAD.LO2C te,  m,  param_PQN,  te;
+--:-:-:-:1      XMAD.LO2C te0, k0, param_MPQN, te;
+--:-:-:-:1      XMAD.LO2C te4, k4, param_MPQN, te;
+--:-:-:-:1      LEA       track0E0.CC, te0, param_E[0],     2;
+--:-:-:-:1      LEA.HI.X  track0E1,    te0, param_E[1], RZ, 2;
+--:-:-:-:1      LEA       track4E0.CC, te4, param_E[0],     2;
+--:-:-:-:1      LEA.HI.X  track4E1,    te4, param_E[1], RZ, 2;
+
+// P1 = crst < CRST
+// P2 = n    < N
+// P3 = n+32 < N
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst, param_CRST, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, n,    param_N,    PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, n32,  param_N,    PT;
+
+// writeFs = (32*k + tidX) * 4
+--:-:-:-:1      ISCADD  writeFs, k0, tidX, 5;
+--:-:-:-:1      SHL     writeFs, writeFs,  2;
+// writeEs = (64*k + tidX) * 4 + 4x<32*8>
+--:-:-:-:1      ISCADD  writeEs, k0, tidX, 6;
+--:-:-:-:1      ISCADD  writeEs, writeEs, 4x<32*8>, 2;
+
+// readFs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readEs = (((tid >> 1) & 7) << 4) + 4x<32*8>
+--:-:-:-:1      BFE.U32 readEs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<32*8>, 4;
+// swapBuf = byte distance to the alternate shared buffer (one F region plus one E region)
+--:-:-:-:1      MOV32I swapBuf, 4x<32*8 + 64*8>;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:0      IADD K, K, -8;
+// First global fetch: F rows k0 and k4 as four scalar loads each, guarded by P1 (crst < CRST)
+--:-:-:-:1  @P1 LDG.E.CI load0F0, [track0F + 4x<0>];
+--:-:-:-:1  @P1 LDG.E.CI load0F1, [track0F + 4x<1>];
+--:-:-:-:1  @P1 LDG.E.CI load0F2, [track0F + 4x<2>];
+--:-:1:-:1  @P1 LDG.E.CI load0F3, [track0F + 4x<3>];
+
+--:-:-:-:1  @P1 LDG.E.CI load4F0, [track4F + 4x<0>];
+--:-:-:-:1  @P1 LDG.E.CI load4F1, [track4F + 4x<1>];
+--:-:-:-:1  @P1 LDG.E.CI load4F2, [track4F + 4x<2>];
+--:-:2:-:1  @P1 LDG.E.CI load4F3, [track4F + 4x<3>];
+// Fold the remaining-depth test into each guard: Pn = (K > 0) && Pn
+--:-:-:-:0      ISETP.GT.AND P1, PT, K, RZ, P1;
+// E rows k0 and k4: 128-bit loads for columns n (P2) and n+32 (P3)
+--:-:3:-:1  @P2 LDG.E.128 load0E0, [track0E + 4x< 0>];
+--:-:4:-:1  @P3 LDG.E.128 load0E4, [track0E + 4x<32>];
+--:-:5:-:1  @P2 LDG.E.128 load4E0, [track4E + 4x< 0>];
+--:-:6:-:1  @P3 LDG.E.128 load4E4, [track4E + 4x<32>];
+
+--:-:-:-:2      ISETP.GT.AND P2, PT, K, RZ, P2;
+--:-:-:-:0      ISETP.GT.AND P3, PT, K, RZ, P3;
+// Commit the first slice to shared memory; each wait mask matches the barrier set by the load that filled the register
+01:-:-:-:1      STS.128 [writeFs + 4x<0*32>], load0F;
+--:-:-:-:6      IADD   track0F0.CC, track0F0, param_CRST8;
+--:-:-:-:0      IADD.X track0F1,    track0F1, RZ;
+
+02:-:-:-:1      STS.128 [writeFs + 4x<4*32>], load4F;
+--:-:-:-:6      IADD   track4F0.CC, track4F0, param_CRST8;
+--:-:-:-:0      IADD.X track4F1,    track4F1, RZ;
+
+04:-:-:-:1      STS.128 [writeEs + 4x<0*64 +  0>], load0E0;
+08:-:-:-:1      STS.128 [writeEs + 4x<0*64 + 32>], load0E4;
+--:-:-:-:6      IADD   track0E0.CC, track0E0, param_MPQN8;
+--:-:-:-:0      IADD.X track0E1,    track0E1, RZ;
+
+10:-:-:-:1      STS.128 [writeEs + 4x<4*64 +  0>], load4E0;
+20:1:-:-:1      STS.128 [writeEs + 4x<4*64 + 32>], load4E4;
+--:-:-:-:6      IADD   track4E0.CC, track4E0, param_MPQN8;
+--:-:-:-:1      IADD.X track4E1,    track4E1, RZ;
+// Move the write offsets to the alternate buffer and negate swapBuf; wait mask 01 lets the last STS finish reading writeEs
+01:-:-:-:1      IADD writeEs, writeEs, swapBuf;
+--:-:-:-:1      IADD writeFs, writeFs, swapBuf;
+--:-:-:-:2      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD K, K, -8;
+// Preload the j0 operands from shared memory and start fetching the next slice before entering NEXT_8K
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*32 + 16>];
+
+--:-:-:-:1  @P1 LDG.E.CI load0F0, [track0F + 4x<0>];
+--:-:-:-:1  @P1 LDG.E.CI load0F1, [track0F + 4x<1>];
+--:-:-:-:1  @P1 LDG.E.CI load0F2, [track0F + 4x<2>];
+--:-:2:-:1  @P1 LDG.E.CI load0F3, [track0F + 4x<3>];
+
+--:-:-:-:1  @P1 LDG.E.CI load4F0, [track4F + 4x<0>];
+--:-:-:-:1  @P1 LDG.E.CI load4F1, [track4F + 4x<1>];
+--:-:-:-:1  @P1 LDG.E.CI load4F2, [track4F + 4x<2>];
+--:-:3:-:1  @P1 LDG.E.CI load4F3, [track4F + 4x<3>];
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, K, RZ, P1;
+
+--:-:4:-:1  @P2 LDG.E.128 load0E0, [track0E + 4x< 0>];
+--:-:4:-:1  @P3 LDG.E.128 load0E4, [track0E + 4x<32>];
+--:-:5:-:1  @P2 LDG.E.128 load4E0, [track4E + 4x< 0>];
+--:-:5:-:1  @P3 LDG.E.128 load4E4, [track4E + 4x<32>];
+
+--:-:-:-:2      ISETP.GT.AND P2, PT, K, RZ, P2;
+--:-:-:-:1      ISETP.GT.AND P3, PT, K, RZ, P3;
+
+NEXT_8K:
+--:-:-:-:1      ISETP.GT.AND P0, PT, K, -8, PT;
+<CODE>
+    my %insert =
+    (
+        j0c8  => "--:-:-:-:1      IADD K, K, -8;\n",
+        # Keys are "j<step>c<slot>": the text is emitted right after FFMA number <slot> of unrolled step <step>.
+        j0c12 => "02:2:-:-:1  \@P0 STS.128 [writeFs + 4x<0*32>], load0F;\n",
+        j0c14 => "--:-:-:-:1  \@P0 IADD   track0F0.CC, track0F0, param_CRST8;\n",
+        j0c19 => "--:-:-:-:1  \@P0 IADD.X track0F1,    track0F1, RZ;\n",
+
+        j0c56 => "02:-:-:-:1  \@P1 LDG.E.CI load0F0, [track0F + 4x<0>];\n",
+        j0c58 => "--:-:-:-:1  \@P1 LDG.E.CI load0F1, [track0F + 4x<1>];\n",
+        j0c60 => "--:-:-:-:1  \@P1 LDG.E.CI load0F2, [track0F + 4x<2>];\n",
+        j0c62 => "--:-:2:-:1  \@P1 LDG.E.CI load0F3, [track0F + 4x<3>];\n",
+
+        j2c12 => "04:3:-:-:1  \@P0 STS.128 [writeFs + 4x<4*32>], load4F;\n",
+        j2c14 => "--:-:-:-:1  \@P0 IADD   track4F0.CC, track4F0, param_CRST8;\n",
+        j2c19 => "--:-:-:-:1  \@P0 IADD.X track4F1,    track4F1, RZ;\n",
+
+        j2c56 => "04:-:-:-:1  \@P1 LDG.E.CI load4F0, [track4F + 4x<0>];\n",
+        j2c58 => "--:-:-:-:1  \@P1 LDG.E.CI load4F1, [track4F + 4x<1>];\n",
+        j2c60 => "--:-:-:-:1  \@P1 LDG.E.CI load4F2, [track4F + 4x<2>];\n",
+        j2c62 => "--:-:3:-:1  \@P1 LDG.E.CI load4F3, [track4F + 4x<3>];\n",
+
+        j4c12 => "08:-:-:-:1  \@P0 STS.128 [writeEs + 4x<0*64 +  0>], load0E0;\n",
+        j4c14 => "--:4:-:-:1  \@P0 STS.128 [writeEs + 4x<0*64 + 32>], load0E4;\n",
+        j4c16 => "--:-:-:-:1  \@P0 IADD   track0E0.CC, track0E0, param_MPQN8;\n",
+        j4c21 => "--:-:-:-:1  \@P0 IADD.X track0E1,    track0E1, RZ;\n",
+
+        j4c60 => "08:-:-:-:1  \@P2 LDG.E.128 load0E0, [track0E + 4x< 0>];\n",
+        j4c62 => "--:-:4:-:1  \@P3 LDG.E.128 load0E4, [track0E + 4x<32>];\n",
+
+        j6c12 => "10:-:-:-:1  \@P0 STS.128 [writeEs + 4x<4*64 +  0>], load4E0;\n",
+        j6c14 => "--:5:-:-:1  \@P0 STS.128 [writeEs + 4x<4*64 + 32>], load4E4;\n",
+        j6c16 => "--:-:-:-:1  \@P0 IADD   track4E0.CC, track4E0, param_MPQN8;\n",
+        j6c21 => "--:-:-:-:1  \@P0 IADD.X track4E1,    track4E1, RZ;\n",
+
+        j6c60 => "10:-:-:-:1  \@P2 LDG.E.128 load4E0, [track4E + 4x< 0>];\n",
+        j6c62 => "--:-:5:-:1  \@P3 LDG.E.128 load4E4, [track4E + 4x<32>];\n",
+
+        j6c63 => "--:-:-:-:1  \@P0 IADD readEs,  readEs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeEs, writeEs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Each load guard stays chained to itself so the crst < CRST, n < N and n+32 < N bounds survive every iteration.
+        j7c8  => "--:-:-:-:1      ISETP.GT.AND P1, PT, K, RZ, P1;\n",
+        j7c10 => "--:-:-:-:1      ISETP.GT.AND P2, PT, K, RZ, P2;\n",
+        j7c12 => "--:-:-:-:1      ISETP.GT.AND P3, PT, K, RZ, P3;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U NEXT_8K;\n",
+    );
+    # Build the 64-entry visiting order over the 8x8 accumulator tile: four-cell swirls, y order reversed after each x pair.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    # Emit 8 unrolled steps of 64 FFMAs each, appending the instruction scheduled for a slot (if any) after its FFMA.
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+        # Load the next step's operands into the other j register set; on step 7 only when the loop continues (P0).
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # An FFMA followed by an inserted memory, barrier or branch instruction is emitted with stall count 0.
+            my $stall  = $ins =~ /LDS|I2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+            # The first FFMA of each step waits on barrier 1, which the last LDS of the previous step sets.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:0      MOV warp_cnt, 32;
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkF, SR_CTAID.Y;
+--:-:3:-:1      S2R blkE, SR_CTAID.Z;
+01:-:-:-:6      MOV rst,  tid;
+// Build the shared lookup table: entry rst (starting at tid, stepping by 32) holds the byte offset of filter tap (t,r,s)
+LUT_LOOP:
+
+<SCHEDULE_BLOCK>
+// warp synchronous loop while warp_cnt < RST (c=0)
+--:-:-:-:1      ISETP.LT.AND P0, PT, warp_cnt, param_RST, PT;
+--:-:-:-:1      IADD warp_cnt, warp_cnt, 32;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// x = qs + s
+// y = pr + r
+// z = mt + t
+--:-:-:-:1      IADD z, mt, t;
+--:-:-:-:1      IADD y, pr, r;
+--:-:-:-:1      IADD x, qs, s;
+// i = (z*HWN + y*WN + x*N) * 4  (wait mask 20: the previous iteration's STS must have read sliceI first)
+20:-:-:-:1      XMAD.LO2C sliceI, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y, param_WN,  sliceI;
+--:-:-:-:1      XMAD      sliceI, x, param_N,   sliceI;
+--:-:-:-:1      SHL       sliceI, sliceI, 2;
+// Bounds check x, y and z, and make i negative if outside
+--:-:-:-:1      ISET.LT.AND x0, x, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW, x,  param_W, PT;
+--:-:-:-:1      ISET.LT.AND y0, y, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH, y,  param_H, PT;
+--:-:-:-:1      ISET.LT.AND z0, z, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD, z,  param_D, PT;
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, x0, xW, 0xfe;
+<ORDERED>
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, y0, yH, 0xfe;
+--:-:-:-:1      SHL lutStore, rst, 2;
+--:-:-:-:1      IADD rst, rst, 32;
+</ORDERED>
+--:-:-:-:1      LOP3.LUT sliceI, sliceI, z0, zD, 0xfe;
+// Store the image offset i into the shared lookup table (read barrier 6 protects sliceI until the store has read it)
+--:6:-:-:1      STS [lutStore + addr_lut], sliceI;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P0 BRA.U LUT_LOOP;
+
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV RST,       param_RST;
+--:-:-:-:1      MOV DHWN1,     param_DHWN;
+--:-:-:-:1      SHL DHWN1,     DHWN1, 2;
+// Reduce readEs / readFs to their low 7 / 6 bits before deriving the output staging offset
+--:-:-:-:1      LOP.AND readEs, readEs, 0x7f;
+--:-:-:-:1      LOP.AND readFs, readFs, 0x3f;
+
+// writeCs = ((readFs / 4) * 64 + readEs);
+--:-:-:-:1      ISCADD  writeCs, readFs, readEs, 4;
+
+// readCs = (tid & 31) << 2;
+--:-:-:-:1      LOP.AND tid31,  tid,   31;
+--:-:-:-:1      SHL     readCs, tid31, 2;
+
+// nn = blkE*64 + tid31;
+04:-:-:-:1      ISCADD nn, blkE, tid31, 6;
+
+// crst00 = blkF*32, crst04/08/12 = crst00 + 4/8/12
+02:-:-:-:1      SHL  crst00, blkF,   5;
+--:-:-:-:1      IADD crst04, crst00, 4;
+--:-:-:-:1      IADD crst08, crst00, 8;
+--:-:-:-:1      IADD crst12, crst00, 12;
+// trackI = param_I + nn * 4
+--:-:-:-:1      LEA      trackI0.CC, nn, param_I[0],     2;
+--:-:-:-:1      LEA.HI.X trackI1,    nn, param_I[1], RZ, 2;
+
+// P5 = nn < N, P6 = nn + 32 < N
+--:-:-:-:1      ISETP.LT.AND P5, PT, nn, param_N, PT;
+--:-:-:-:1      IADD nn, nn, 32;
+--:-:-:-:1      ISETP.LT.AND P6, PT, nn, param_N, PT;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+    # For each of the 8 accumulator rows y: scale cx<0-7>y by alpha into c0-c7, then call STORE_C.
+    my $out;
+    foreach my $y (0..7)
+    {
+        $out .=
+            "--:-:-:-:1      IADD crst00, crst00, 12;\n" .
+            "--:-:-:-:1      IADD crst04, crst04, 12;\n" .
+            "--:-:-:-:1      IADD crst08, crst08, 12;\n" .
+            "--:-:-:-:1      IADD crst12, crst12, 12;\n" if $y == 4;
+        # The block above runs once, before row 4: it moves crst00..crst12 ahead by 12 (STORE_C adds 1 per call).
+        $out .= sprintf(
+            "01:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
+            "02:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c3, cx3y%d, alpha;\n" .
+            "04:-:-:-:1      FMUL c4, cx4y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c5, cx5y%d, alpha;\n" .
+            "08:-:-:-:1      FMUL c6, cx6y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL c7, cx7y%d, alpha;\n",
+            ($y) x 8);
+        # Wait masks 01/02/04/08 above match the read barriers 1-4 set by the last four RED instructions of STORE_C.
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Subroutine: add the eight values c0-c7 into I with RED.ADD, using the shared lookup table to form the addresses.
+// Exchange through shared memory to drop the awkward readEs/readFs mapping
+--:-:-:-:1      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], c4;
+// Read back one value per slot: four rows of 64 floats, columns +0 and +32
+--:-:-:-:1      LDS c0, [readCs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS c1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<1*64 + 00>];
+--:-:-:-:1      LDS c3, [readCs + 4x<1*64 + 32>];
+--:-:-:-:1      LDS c4, [readCs + 4x<2*64 + 00>];
+--:-:-:-:1      LDS c5, [readCs + 4x<2*64 + 32>];
+--:-:-:-:1      LDS c6, [readCs + 4x<3*64 + 00>];
+--:-:-:-:1      LDS c7, [readCs + 4x<3*64 + 32>];
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5;
+// c = crst / RST via magic multiply and shift
+--:-:-:-:1      XMAD.LO2C c00, crst00, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c04, crst04, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c08, crst08, param_magic_RST, RZ;
+--:-:-:-:1      XMAD.LO2C c12, crst12, param_magic_RST, RZ;
+
+--:-:-:-:1      SHR.U32 c00, c00, param_shift_RST;
+--:-:-:-:1      SHR.U32 c04, c04, param_shift_RST;
+--:-:-:-:1      SHR.U32 c08, c08, param_shift_RST;
+--:-:-:-:1      SHR.U32 c12, c12, param_shift_RST;
+// lut = (crst - c*RST) * 4: byte index of the tap inside the lookup table
+--:-:-:-:1      VMAD.U16.U16 lut00, -c00, RST, crst00;
+--:-:-:-:1      VMAD.U16.U16 lut04, -c04, RST, crst04;
+--:-:-:-:1      VMAD.U16.U16 lut08, -c08, RST, crst08;
+--:-:-:-:1      VMAD.U16.U16 lut12, -c12, RST, crst12;
+
+--:-:-:-:1      SHL lut00, lut00, 2;
+--:-:-:-:1      SHL lut04, lut04, 2;
+--:-:-:-:1      SHL lut08, lut08, 2;
+--:-:-:-:1      SHL lut12, lut12, 2;
+// chan = DHWN1 * c, where DHWN1 was set to param_DHWN << 2 before the first call
+--:-:-:-:1      XMAD.LO2 chan00, DHWN1, c00, RZ;
+--:-:-:-:1      XMAD.LO2 chan04, DHWN1, c04, RZ;
+--:-:-:-:1      XMAD.LO2 chan08, DHWN1, c08, RZ;
+--:-:-:-:1      XMAD.LO2 chan12, DHWN1, c12, RZ;
+// Advance crst for the next call
+--:-:-:-:1      IADD crst00, crst00, 1;
+--:-:-:-:1      IADD crst04, crst04, 1;
+--:-:-:-:1      IADD crst08, crst08, 1;
+--:-:-:-:1      IADD crst12, crst12, 1;
+// Fetch the table entries; each load sets its own barrier (1-4) for the address math below
+--:-:1:-:1  @P0 LDS img00, [lut00 + addr_lut];
+--:-:2:-:1  @P1 LDS img04, [lut04 + addr_lut];
+--:-:3:-:1  @P2 LDS img08, [lut08 + addr_lut];
+--:-:4:-:1  @P3 LDS img12, [lut12 + addr_lut];
+
+</SCHEDULE_BLOCK>
+// Form each 64-bit address (trackI + img + chan) and clear the guard when the table entry is negative
+01:-:-:-:1      IADD3  track00I0.CC, trackI0, img00, chan00;
+--:-:-:-:5      ISETP.GE.AND P0, PT, img00, RZ, P0;
+--:-:-:-:1      IADD.X track00I1,    trackI1, RZ;
+
+02:-:-:-:1      IADD3  track04I0.CC, trackI0, img04, chan04;
+--:-:-:-:5      ISETP.GE.AND P1, PT, img04, RZ, P1;
+--:-:-:-:1      IADD.X track04I1,    trackI1, RZ;
+
+04:-:-:-:1      IADD3  track08I0.CC, trackI0, img08, chan08;
+--:-:-:-:5      ISETP.GE.AND P2, PT, img08, RZ, P2;
+--:-:-:-:1      IADD.X track08I1,    trackI1, RZ;
+
+08:-:-:-:1      IADD3  track12I0.CC, trackI0, img12, chan12;
+--:-:-:-:5      ISETP.GE.AND P3, PT, img12, RZ, P3;
+--:-:-:-:0      IADD.X track12I1,    trackI1, RZ;
+// Column +0 first; each guard is then ANDed with P6 (nn + 32 < N) before the +32 column is written
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00I], c0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04I], c2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08I], c4;
+--:-:-:-:3      PSETP.AND.AND P2, PT, P2, P6, PT;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12I], c6;
+--:-:-:-:5      PSETP.AND.AND P3, PT, P3, P6, PT;
+
+--:1:-:-:2  @P0 RED.E.ADD.F32.FTZ.RN [track00I + 4x<32>], c1;
+--:2:-:-:2  @P1 RED.E.ADD.F32.FTZ.RN [track04I + 4x<32>], c3;
+--:3:-:-:4  @P2 RED.E.ADD.F32.FTZ.RN [track08I + 4x<32>], c5;
+--:4:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12I + 4x<32>], c7;
+
+--:-:-:-:5      RET;
+
diff --git a/Kernel/Convolution/Pascal/sconv_updat_C128_K128.sass b/Kernel/Convolution/Pascal/sconv_updat_C128_K128.sass
new file mode 100644
index 0000000..dfb6bea
--- /dev/null
+++ b/Kernel/Convolution/Pascal/sconv_updat_C128_K128.sass
@@ -0,0 +1,718 @@
+# Kernel: sconv_updat_C128_K128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Symbolic constants: addr_zero/addr_m/addr_q are shared-memory scratch slots placed after four szBuf-sized buffers; param_* name the slots c[0x0][0x140] .. c[0x0][0x204].
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*4 + 0>
+    addr_m    : 4x<(128*16 + 32)*4 + 4>
+    addr_q    : 4x<(128*16 + 32)*4 + 5>
+    szBuf     : (128*16 + 32)
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_offset_K     : c[0x0][0x15c]
+    param_N            : c[0x0][0x160]
+    param_K            : c[0x0][0x164]
+    param_D            : c[0x0][0x168]
+    param_H            : c[0x0][0x16c]
+    param_W            : c[0x0][0x170]
+    param_WN           : c[0x0][0x174]
+    param_HWN          : c[0x0][0x178]
+    param_DHWN         : c[0x0][0x17c]
+    param_C            : c[0x0][0x180]
+    param_CRST         : c[0x0][0x184]
+    param_RST          : c[0x0][0x188]
+    param_magic_RST    : c[0x0][0x18c]
+    param_shift_RST    : c[0x0][0x190]
+    param_RS           : c[0x0][0x194]
+    param_magic_RS     : c[0x0][0x198]
+    param_shift_RS     : c[0x0][0x19c]
+    param_S            : c[0x0][0x1a0]
+    param_magic_S      : c[0x0][0x1a4]
+    param_shift_S      : c[0x0][0x1a8]
+    param_pad_d        : c[0x0][0x1ac]
+    param_pad_h        : c[0x0][0x1b0]
+    param_pad_w        : c[0x0][0x1b4]
+    param_str_d        : c[0x0][0x1b8]
+    param_str_h        : c[0x0][0x1bc]
+    param_str_w        : c[0x0][0x1c0]
+    param_dil_d        : c[0x0][0x1c4]
+    param_dil_h        : c[0x0][0x1c8]
+    param_dil_w        : c[0x0][0x1cc]
+    param_P            : c[0x0][0x1d0]
+    param_Q            : c[0x0][0x1d4]
+    param_PQ           : c[0x0][0x1d8]
+    param_QN           : c[0x0][0x1dc]
+    param_PQN          : c[0x0][0x1e0]
+    param_MPQN         : c[0x0][0x1e4]
+    param_magic_Q      : c[0x0][0x1e8]
+    param_shift_Q      : c[0x0][0x1ec]
+    param_magic_PQ     : c[0x0][0x1f0]
+    param_shift_PQ     : c[0x0][0x1f4]
+    param_grid_P       : c[0x0][0x1f8]
+    param_grid_Q       : c[0x0][0x1fc]
+    param_grid_PQ      : c[0x0][0x200]
+    param_CRSTK        : c[0x0][0x204]
+</CONSTANT_MAPPING>
+// Register names. Ranges such as 64-95 and 68-83 are mapped more than once: the setup, main-loop and output phases reuse the same registers under different names.
+<REGISTER_MAPPING>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-67   ~ tid, blkI, blkE, one
+    68-111  ~ tidX, tidY, tid1, tid7, tid128, shiftX, blkMPQ, m, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+    64-95   ~ tidYY, mm, mt, pr, y, z, y0, yH, z0, zD, bounds_yz, c, r, t, rs, rst
+    64-95   ~ qs, x, x0, xW, bounds_x, ti, te, Q
+
+    64-79   : j0Ex<0-7>, j0Iy<0-7>
+    80-95   : j1Ex<0-7>, j1Iy<0-7>
+
+    96-111  : loadI<0-7>,  loadE<0-7>
+    112-115 : trackI<0-1>, trackE<0-1>
+
+    116-124 ~ writeS, loopN, e, i, p, q, k, crst, s
+    125-127 ~ swapBuf, readIs, readEs
+
+     68-83  : c<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
+    84-124  ~ writeCs, readCs, K1, K60, crst<00|04|08|12>, alpha, K, K4, tid31, tid96, kk, tf, t128, blk_MPQ, CRSTK, xmad_determ
+
+</REGISTER_MAPPING>
+// Fetch thread and block ids. P0 = (RST == 1) picks the CTAID axis order: X,Y,Z -> blkMPQ,blkI,blkE normally, or X,Y,Z -> blkI,blkE,blkMPQ when RST == 1.
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT; // P0 = (param_RST == 1)
+--:-:-:-:5  @P0 BRA.U CTAID1;
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X;
+--:-:3:-:1      S2R blkI,   SR_CTAID.Y;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
+--:-:-:-:5      BRA.U END_CTAID1;
+CTAID1:
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.Z;
+--:-:3:-:1      S2R blkI,   SR_CTAID.X;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Y;
+END_CTAID1:
+// One-time setup: clear the czero accumulators, split blkMPQ into m / p / q, and derive the shared-memory pointers and the crst / k indices.
+<SCHEDULE_BLOCK>
+// tidX   = tid >> 1
+// tidY   = (tid & 1) << 2
+// shiftX = (tid & 1) << 4
+01:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      SHR.U32 tidX,   tid,  1;
+--:-:-:-:1      SHL     tidY,   tid1, 2;
+--:-:-:-:1      SHL     shiftX, tid1, 4;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ; // store RZ at addr_zero; the generated LDS.U.128 lines below read it back into czero00, czero04, ..., czero60
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;
+</CODE>
+
+--:-:-:-:1      PSETP.AND.AND P0, PT, PT, PT, PT; // P0 = true (first-pass flag; cleared after the initial loads in NEXT_Q)
+
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_grid_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_grid_PQ;
+
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT; // P1 = magicPQ != 1
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT; // P2 = magicQ  != 1
+
+// m = blkMPQ / PQ  (multiply by magicPQ then shift when P1, plain shift otherwise)
+02:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ % PQ, computed as blkMPQ - m * grid_PQ
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = pq / Q  (multiply by magicQ then shift when P2, plain shift otherwise)
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = pq % Q, computed as pq - p * grid_Q
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+
+// We need to be able to restore m and q at each P iteration
+// Register spill to shared
+--:1:-:-:1      STS [addr_m], m;
+--:-:-:-:1      STS [addr_q], q;
+
+// writeS = ((tidY*128 + tidX + shiftX) << 2) + 4*(szBuf*2)
+--:-:-:-:1      ISCADD writeS, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeS, writeS, shiftX;
+--:-:-:-:1      ISCADD writeS, writeS, 4x<szBuf * 2>, 2;
+
+// readIs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readIs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      LOP.OR  readIs, readIs, tid1;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// readEs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4*szBuf
+--:-:-:-:1      LOP.AND tid128, tid,    128;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readEs, tid128, 4;
+--:-:-:-:1      LOP.OR  readEs, readEs, tid7;
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<szBuf>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szBuf * 2>;
+
+// crst = blkI*128 + tidX
+04:-:-:-:1      ISCADD crst, blkI, tidX, 7;
+
+// k = blkE*128 + tidX + offset_K
+08:-:-:-:1      ISCADD k, blkE, tidX, 7;
+--:-:-:-:1      IADD   k, k, param_offset_K;
+
+--:-:-:-:1      MOV loopN, param_N;
+
+</SCHEDULE_BLOCK>
+// Per-P setup: reload m and q from shared memory, split crst into c / t / r / s, build the e and i offsets and the y/z bounds predicate P4, then advance p by grid_P.
+NEXT_P:
+
+01:-:4:-:1      S2R tidYY, SR_TID.X;
+--:-:5:-:1      LDS mm, [addr_m];
+
+<SCHEDULE_BLOCK>
+--:-:6:-:1      LDS q, [addr_q];
+
+// c   = crst / RST
+// rst = crst % RST
+--:-:-:-:1      XMAD.LO2C c, crst, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32   c, c, param_shift_RST;
+--:-:-:-:1      XMAD rst, c, param_RST, RZ;
+--:-:-:-:1      IADD rst, -rst, crst;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// y = p * str_h - pad_h + (r * dil_h)
+// z = m * str_d - pad_d + (t * dil_d)
+--:-:-:-:1      XMAD  pr, p,   param_str_h, RZ;
+10:-:-:-:1      XMAD  mt, mm,  param_str_d, RZ;
+--:-:-:-:1      XMAD  y,  r,   param_dil_h, pr;
+--:-:-:-:1      XMAD  z,  t,   param_dil_d, mt;
+--:-:-:-:1      IADD  y,  y,  -param_pad_h;
+--:-:-:-:1      IADD  z,  z,  -param_pad_d;
+// e = k*MPQN + m*PQN + p*QN + tidYY, with tidYY = (tid & 1) << 2
+08:-:-:-:1      LOP.AND tidYY, tidYY, 1;
+--:-:-:-:1      SHL     tidYY, tidYY, 2;
+--:-:-:-:1      XMAD.LO2C e, p,  param_QN,   tidYY;
+--:-:-:-:1      XMAD.LO2C e, mm, param_PQN,  e;
+--:-:-:-:1      XMAD.LO2C e, k,  param_MPQN, e;
+// i = c*DHWN + z*HWN + y*WN + tidYY
+--:-:-:-:1      XMAD.LO2C i, y, param_WN,   tidYY;
+--:-:-:-:1      XMAD.LO2C i, z, param_HWN,  i;
+--:-:-:-:1      XMAD.LO2C i, c, param_DHWN, i;
+// bounds_yz = y < 0 || y >= H || z < 0 || z >= D ? -1 : 0
+--:-:-:-:1      ISET.LT.AND y0, y,  RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH, y,  param_H, PT;
+--:-:-:-:1      ISET.LT.AND z0, z,  RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD, z,  param_D, PT;
+--:-:-:-:1      LOP.OR   bounds_yz, y0, yH;
+--:-:-:-:1      LOP3.LUT bounds_yz, bounds_yz, z0, zD, 0xfe;
+// doLoadCRST (P4) = crst < CRST && bounds_yz == 0
+--:-:-:-:1      ISETP.LT.AND P4, PT, crst, param_CRST, PT;
+--:-:-:-:1      ISETP.EQ.AND P4, PT, bounds_yz, RZ, P4;
+// p += grid_P
+--:-:-:-:1      IADD p, p, param_grid_P;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, p, param_P, PT; // P6 = p < P (another P iteration remains)
+</SCHEDULE_BLOCK>
+// Per-Q setup: pick the (possibly zigzagged) column Q, build trackI / trackE and the load predicates; while P0 is set (first pass) also run the initial loads, shared stores and buffer swap.
+NEXT_Q:
+
+<SCHEDULE_BLOCK>
+// Zigzag q but only if grid_P < P
+--:-:-:-:1      LOP.AND.NZ P1, RZ, p, 1; // P1 = (p & 1) != 0
+--:-:-:-:1      MOV Q, param_grid_P;
+--:-:-:-:1      ISETP.LT.AND P1, PT, Q, param_P, P1; // P1 = P1 && grid_P < P
+--:-:-:-:1      MOV Q, -1;
+20:-:-:-:1  @P1 IADD3 Q, -q, param_Q, Q; // Q = param_Q - 1 - q
+--:-:-:-:1 @!P1 MOV Q, q;
+// k < K
+--:-:-:-:1      ISETP.LT.AND P3, PT, k, param_K, PT;
+// qs = Q * str_w
+// x = qs + (s * dil_w) - pad_w
+--:-:-:-:1      XMAD  qs, Q,  param_str_w, RZ;
+--:-:-:-:1      XMAD  x,  s,  param_dil_w, qs;
+--:-:-:-:1      IADD  x,  x, -param_pad_w;
+// bounds_x = x < 0 || x >= W ? -1 : 0
+--:-:-:-:1      ISET.LT.AND x0, x, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW, x,  param_W, PT;
+--:-:-:-:1      LOP.OR bounds_x, x0, xW;
+// doLoad (P2) = crst < CRST && bounds_yz == 0 && bounds_x == 0
+--:-:-:-:1      ISETP.EQ.AND P2, PT, bounds_x, RZ, P4;
+// trackI = I + i + x*N
+--:-:-:-:1      XMAD ti, x, param_N, i;
+--:-:-:-:1      LEA      trackI0.CC, ti, param_I[0],     2;
+--:-:-:-:1      LEA.HI.X trackI1,    ti, param_I[1], RZ, 2;
+// trackE = E + e + Q*N
+--:-:-:-:1      XMAD te, Q, param_N, e;
+--:-:-:-:1      LEA      trackE0.CC, te, param_E[0],     2;
+--:-:-:-:0      LEA.HI.X trackE1,    te, param_E[1], RZ, 2;
+// q += grid_Q
+--:-:-:-:1      IADD q, q, param_grid_Q;
+--:-:-:-:1      ISETP.LT.AND P5, PT, q, param_Q, PT; // P5 = q < Q (another Q iteration remains)
+
+--:-:-:-:1 @!P0 IADD loopN, loopN, param_N; // not the first pass: add N to whatever is left in loopN
+
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:6 @!P0 BRA.U NEXT_PQ; // P0 clear: skip the initial load/store sequence below
+
+--:-:-:-:0      PSETP.AND.AND P0, PT, PT, PT, !PT; // P0 = false
+
+--:-:1:-:1  @P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>];
+--:-:2:-:1  @P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>];
+--:-:-:-:1 @!P2 LDS.U.128 loadI0, [addr_zero]; // out of bounds: take zeros from addr_zero instead
+--:-:5:-:1 @!P2 LDS.U.128 loadI4, [addr_zero];
+
+--:-:-:-:0      ISETP.LE.AND P1, PT, loopN, 32, PT; // P1 = loopN <= 32
+
+--:-:3:-:1  @P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>];
+--:-:4:-:1  @P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>];
+--:-:-:-:1 @!P3 LDS.U.128 loadE0, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 loadE4, [addr_zero];
+
+11:-:-:-:1      STS [writeS + 4x< 0*128>], loadI0;
+--:-:-:-:1      STS [writeS + 4x< 1*128>], loadI1;
+--:-:-:-:1      STS [writeS + 4x< 2*128>], loadI2;
+--:-:-:-:1      STS [writeS + 4x< 3*128>], loadI3;
+
+02:-:-:-:1      STS [writeS + 4x< 8*128 + 16>], loadI4;
+--:-:-:-:1      STS [writeS + 4x< 9*128 + 16>], loadI5;
+--:-:-:-:1      STS [writeS + 4x<10*128 + 16>], loadI6;
+--:-:-:-:1      STS [writeS + 4x<11*128 + 16>], loadI7;
+
+--:-:-:-:1      IADD   trackI0.CC, trackI0, 4x<16>;
+--:-:-:-:0      PSETP.AND.AND P5, PT, P1, P5, PT; // P5 = P1 && P5
+
+24:-:-:-:1      STS [writeS + 4x< 0*128 + szBuf>], loadE0;
+--:-:-:-:1      STS [writeS + 4x< 1*128 + szBuf>], loadE1;
+--:-:-:-:1      STS [writeS + 4x< 2*128 + szBuf>], loadE2;
+--:-:-:-:1      STS [writeS + 4x< 3*128 + szBuf>], loadE3;
+
+--:-:-:-:0      PSETP.AND.AND P6, PT, P1, P6, PT; // P6 = P1 && P6
+
+08:-:-:-:1      STS [writeS + 4x< 8*128 + szBuf + 16>], loadE4;
+--:-:-:-:1      STS [writeS + 4x< 9*128 + szBuf + 16>], loadE5;
+--:-:-:-:1      STS [writeS + 4x<10*128 + szBuf + 16>], loadE6;
+--:1:-:-:1      STS [writeS + 4x<11*128 + szBuf + 16>], loadE7;
+
+--:-:-:-:1      IADD.X trackI1, trackI1, RZ;
+
+--:-:-:-:1      IADD   trackE0.CC, trackE0, 4x<16>;
+
+--:-:-:-:1      IADD readEs,  readEs, -swapBuf;
+--:-:-:-:0      IADD readIs,  readIs, -swapBuf;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeS, writeS, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf; // negate swapBuf so the next swap goes the other way
+
+--:-:-:-:0      IADD.X trackE1, trackE1, RZ;
+
+--:-:2:-:1  @P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>];
+--:5:2:-:1  @P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>];
+--:-:3:-:1  @P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>];
+--:6:3:-:1  @P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>];
+
+10:-:-:-:6  @P2 IADD   trackI0.CC, trackI0, 4x<16>;
+--:-:-:-:1  @P2 IADD.X trackI1, trackI1, RZ;
+20:-:-:-:6  @P3 IADD   trackE0.CC, trackE0, 4x<16>;
+--:-:-:-:0  @P3 IADD.X trackE1, trackE1, RZ;
+
+--:-:-:Y:5  @P5 BRA.U NEXT_Q;
+--:-:-:Y:5  @P6 BRA.U NEXT_P;
+
+--:-:-:-:2      ISETP.LT.AND P5, PT, q, param_Q, PT; // recompute P5 = q < Q without the P1 term
+--:-:-:-:0      ISETP.LT.AND P6, PT, p, param_P, PT; // recompute P6 = p < P without the P1 term
+// Prime register set j0 (j0Ex0-7, j0Iy0-7) from shared memory at offset 0*128 of readEs / readIs before entering the unrolled loop.
+NEXT_PQ:
+
+--:-:1:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*128 + 64>];
+--:-:1:-:2      LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>];
+
+
+// P0 loop N
+// P2 bounds I
+// P3 bounds E
+// P4 bounds yz
+// P5 loop Q
+// P6 loop P
+
+//loop = N >= 16 && (N >= 32 || (!p5 && !p6))
+// Main loop body, generated by Perl: 16 unrolled steps (j) of 64 FFMAs (c) each, with loads, stores and loop control interleaved from the insert table.
+NEXT_16N:
+
+<CODE>
+
+    my %insert =    # key "j<J>c<C>" => text emitted right after FFMA number C of step J
+    (
+        j0c8   => "--:-:-:-:1      IADD loopN, loopN, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, loopN, 16, PT;\n",    # P0 = loopN >= 16 (after the decrement)
+
+        j4c8   => "02:-:-:-:1  \@P0 STS [writeS + 4x< 0*128>], loadI0;\n",    # if P0: store loadI0-7 to shared memory
+        j4c10  => "--:-:-:-:1  \@P0 STS [writeS + 4x< 1*128>], loadI1;\n",
+        j4c12  => "--:-:-:-:1  \@P0 STS [writeS + 4x< 2*128>], loadI2;\n",
+        j4c14  => "--:-:-:-:1  \@P0 STS [writeS + 4x< 3*128>], loadI3;\n",
+
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeS + 4x< 8*128 + 16>], loadI4;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeS + 4x< 9*128 + 16>], loadI5;\n",
+        j5c12  => "--:-:-:-:1  \@P0 STS [writeS + 4x<10*128 + 16>], loadI6;\n",
+        j5c14  => "--:2:-:-:1  \@P0 STS [writeS + 4x<11*128 + 16>], loadI7;\n",
+
+        j5c16  => "--:-:-:-:1      ISETP.GE.AND P2, PT, loopN, 32, P2;\n",    # P2 = P2 && loopN >= 32
+
+        j5c60  => "02:-:2:-:1  \@P2 LDG.E.CI.128 loadI0, [trackI + 4x<0>];\n",    # if P2: next global loads of I
+        j5c62  => "--:4:2:-:1  \@P2 LDG.E.CI.128 loadI4, [trackI + 4x<8>];\n",
+
+        j6c16  => "--:-:-:-:1 \@!P2 LDS.U.128 loadI0, [addr_zero];\n",    # if !P2: fill loadI0-7 from addr_zero instead
+        j7c16  => "--:-:-:-:1 \@!P2 LDS.U.128 loadI4, [addr_zero];\n",
+
+        j10c57 => "08:-:-:-:1  \@P2 IADD   trackI0.CC, trackI0, 4x<16>;\n",    # if P2: advance the trackI0/trackI1 pair (add with carry)
+        j10c62 => "--:-:-:-:1  \@P2 IADD.X trackI1,    trackI1, RZ;\n",
+
+        j12c8  => "04:-:-:-:1  \@P0 STS [writeS + 4x< 0*128 + szBuf>], loadE0;\n",    # if P0: store loadE0-7 to shared memory
+        j12c10 => "--:-:-:-:1  \@P0 STS [writeS + 4x< 1*128 + szBuf>], loadE1;\n",
+        j12c12 => "--:-:-:-:1  \@P0 STS [writeS + 4x< 2*128 + szBuf>], loadE2;\n",
+        j12c14 => "--:-:-:-:1  \@P0 STS [writeS + 4x< 3*128 + szBuf>], loadE3;\n",
+
+        j13c8  => "--:-:-:-:1  \@P0 STS [writeS + 4x< 8*128 + szBuf + 16>], loadE4;\n",
+        j13c10 => "--:-:-:-:1  \@P0 STS [writeS + 4x< 9*128 + szBuf + 16>], loadE5;\n",
+        j13c12 => "--:-:-:-:1  \@P0 STS [writeS + 4x<10*128 + szBuf + 16>], loadE6;\n",
+        j13c14 => "--:3:-:-:1  \@P0 STS [writeS + 4x<11*128 + szBuf + 16>], loadE7;\n",
+
+        j13c16 => "--:-:-:-:1      ISETP.GE.AND P3, PT, loopN, 32, P3;\n",    # P3 = P3 && loopN >= 32
+
+        j13c60 => "04:-:3:-:1  \@P3 LDG.E.CI.128 loadE0, [trackE + 4x<0>];\n",    # if P3: next global loads of E
+        j13c62 => "--:4:3:-:1  \@P3 LDG.E.CI.128 loadE4, [trackE + 4x<8>];\n",
+
+        j14c16 => "--:-:-:-:1 \@!P3 LDS.U.128 loadE0, [addr_zero];\n",    # if !P3: fill loadE0-7 from addr_zero instead
+        j15c16 => "--:-:-:-:1 \@!P3 LDS.U.128 loadE4, [addr_zero];\n",
+
+        j15c57 => "08:-:-:-:1  \@P3 IADD   trackE0.CC, trackE0, 4x<16>;\n",    # if P3: advance the trackE0/trackE1 pair (add with carry)
+        j15c62 => "--:-:-:-:1  \@P3 IADD.X trackE1,    trackE1, RZ;\n",
+
+        j14c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .    # if P0: barrier, then move read/write pointers by swapBuf and negate it
+                  "20:-:-:-:1  \@P0 IADD readEs, readEs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readIs, readIs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeS, writeS,  swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,    -swapBuf;\n",
+
+        j15c24 => "--:-:-:-:1      ISETP.GT.AND P1, PT, loopN, 32, PT;\n",    # P1 = loopN > 32
+        j15c37 => "--:-:-:-:1      PSETP.AND.OR P1, PT, !P5, !P6, P1;\n",    # P1 = (!P5 && !P6) || P1
+        j15c50 => "--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, PT;\n",    # P0 = P0 && P1
+
+        j15c63 => "--:-:-:Y:5  \@P0 BRA.U NEXT_16N;\n" .    # loop again, else next Q, else next P, else fall through
+                  "01:-:-:Y:5  \@P5 BRA.U NEXT_Q;\n" .
+                  "--:-:-:Y:5  \@P6 BRA.U NEXT_P;\n",
+    );
+
+    my @cOrder;    # the 64 [x, y] accumulator coordinates in the order the FFMAs of one step are emitted
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);    # four [dx, dy] offsets applied around each base (x, y)
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;    # walk the y bases in the opposite direction for the next x pair
+    }
+
+    my $out;
+    foreach my $j (0 .. 15)
+    {
+        my $odd      = $j & 1;                  # register set (j0* or j1*) read by this step's FFMAs
+        my $nOdd     = 1 - $odd;                # the other register set, filled by this step's LDS lines
+        my $rsOffset = ($j + 1) & 15;           # shared-memory row fetched for the next step; wraps to 0 at j == 15
+        my $rsPred   = $j == 15 ? '@P0' : '   ';    # the wrap-around fetch at j == 15 is predicated on P0
+        my $shift    = $rsOffset < 4 ? 0 : $rsOffset < 12 ? 1 : 2;    # extra 16-word offset: 0 for rows 0-3, 1 for rows 4-11, 2 for rows 12-15
+        my $barrier  = $j == 14 ? '6' : '-';    # second control field of the last LDS: '6' only at step 14
+
+        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*128 + 64 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c6"} = sprintf "--:%s:1:-:1  %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';    # text to emit after this FFMA, if any
+
+            my $stall  = $ins =~ /LDS|I2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;    # FFMA stall field: 0 when one of these opcodes follows, else 1
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';    # yield flag only on FFMA 32, and only when its stall is 1
+
+            my $wait   = $c == 0 ? '01' : '--';    # the first FFMA of each step carries wait mask 01
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+// Output phase: read the thread and block ids again (registers 64-66 were reused as j0Ex in the main loop), with the same RST == 1 axis permutation as at kernel entry.
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT; // P0 = (param_RST == 1)
+--:-:-:-:5  @P0 BRA.U CTAID2;
+--:-:2:-:1      S2R blkI,    SR_CTAID.Y;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Z;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.X;
+--:-:-:-:5      BRA.U END_CTAID2;
+CTAID2:
+--:-:2:-:1      S2R blkI,    SR_CTAID.X;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Y;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.Z;
+END_CTAID2:
+// Output-phase setup: derive the shared-memory write/read addresses for C, the crst00/04/08/12 row indices, the K-derived strides, the track00F pointer and the kk bounds predicates P5 / P6.
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT; // P0 = swapBuf > 0
+--:-:-:-:1      IADD readEs,  readEs, -4x<szBuf>; // remove the szBuf offset that was added to readEs during setup
+--:-:-:-:1  @P0 IADD readIs,  readIs, -swapBuf;
+--:-:-:-:1  @P0 IADD readEs,  readEs, -swapBuf;
+
+// writeCs = (readIs / 4) * 128 + readEs;
+--:-:-:-:1      ISCADD  writeCs, readIs, readEs, 5;
+
+--:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      LOP.AND tid96,  tid,  96;
+01:-:-:-:1      LOP.AND t128,   tid, 128;
+
+// kk = tid31 | (t128 >> 2);
+--:-:-:-:1      SHR.U32  kk, t128, 2;
+--:-:-:-:1      LOP.OR   kk, tid31,  kk;
+
+// readCs = ((tid96 << 4) | kk) << 2;
+--:-:-:-:1      SHL      readCs, tid96,  4;
+--:-:-:-:1      LOP.OR   readCs, readCs, kk;
+--:-:-:-:1      SHL      readCs, readCs, 2;
+
+// kk += blkE*128 + offset_K;
+04:-:-:-:1      ISCADD kk, blkE, kk, 7;
+--:-:-:-:1      IADD   kk, kk, param_offset_K;
+
+// crst00 = blkI*128 + (tid96 >> 1); crst04/08/12 = crst00 + 4/8/12
+--:-:-:-:1      SHR.U32 crst00, tid96, 1;
+02:-:-:-:1      ISCADD  crst00, blkI, crst00, 7;
+--:-:-:-:1      IADD    crst04, crst00,  4;
+--:-:-:-:1      IADD    crst08, crst00,  8;
+--:-:-:-:1      IADD    crst12, crst00,  12;
+
+
+--:-:-:-:1      MOV K, param_K;
+--:-:-:-:1      SHL K1, K, 2; // K1  = K * 4
+--:-:-:-:1      SHL K4, K, 4; // K4  = K * 16
+--:-:-:-:1      ISCADD K60, K, -K4, 8; // K60 = (K << 8) - K4 = K * 240
+
+// tf = crst00*K + kk  (plus blk_MPQ * CRSTK when $determ is set)
+--:-:-:-:1      VMAD.U16.U16 tf, crst00, K, kk;
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+--:-:-:-:1      MOV CRSTK, param_CRSTK;
+08:-:-:-:1      XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ;
+        };
+    }
+    return '';
++]
+--:-:-:-:1      LEA      track00F0.CC, tf, param_F[0],     0x2;
+--:-:-:-:1      LEA.HI.X track00F1,    tf, param_F[1], RZ, 0x2;
+
+// P5 = kk < K, P6 = kk + 64 < K
+--:-:-:-:1      ISETP.LT.AND P5, PT, kk, param_K, PT;
+--:-:-:-:1      IADD kk, kk, 64;
+--:-:-:-:1      ISETP.LT.AND P6, PT, kk, param_K, PT;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+</SCHEDULE_BLOCK>
+// Derive the other three output pointers, each K4 past the previous one (track04F, track08F, track12F), then synchronise before the shared-memory exchange in STORE_C.
+--:-:-:-:6      IADD   track04F0.CC, track00F0, K4;
+--:-:-:-:1      IADD.X track04F1,    track00F1, RZ;
+--:-:-:-:6      IADD   track08F0.CC, track04F0, K4;
+--:-:-:-:1      IADD.X track08F1,    track04F1, RZ;
+--:-:-:-:6      IADD   track12F0.CC, track08F0, K4;
+--:-:-:-:0      IADD.X track12F1,    track08F1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+// Generated output sequence: for each accumulator row y = 0..7, scale cx0..7 of that row by alpha into c0..c7 and call STORE_C; then exit.
+<CODE>
+
+    my $out;
+    foreach my $y (0..7)
+    {
+        $out .=    # emitted only before row y == 4 (statement modifier at the end): add K60 to each track pointer and 60 to each crst index
+            "--:-:-:-:5      IADD   track00F0.CC, track00F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst00,       crst00,     60;\n" .
+            "--:-:-:-:1      IADD.X track00F1,    track00F1,  RZ;\n" .
+            "--:-:-:-:5      IADD   track04F0.CC, track04F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst04,       crst04,     60;\n" .
+            "--:-:-:-:1      IADD.X track04F1,    track04F1,  RZ;\n" .
+            "--:-:-:-:5      IADD   track08F0.CC, track08F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst08,       crst08,     60;\n" .
+            "--:-:-:-:1      IADD.X track08F1,    track08F1,  RZ;\n" .
+            "--:-:-:-:5      IADD   track12F0.CC, track12F0, K60;\n" .
+            "--:-:-:-:1      IADD   crst12,       crst12,     60;\n" .
+            "--:-:-:-:1      IADD.X track12F1,    track12F1,  RZ;\n\n"  if $y == 4;
+
+        $out .= sprintf(    # c0..c7 = cx0..cx7 of row y, each multiplied by alpha
+            "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c3, cx3y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c4, cx4y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c5, cx5y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c6, cx6y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL c7, cx7y%d, alpha;\n",
+            ($y) x 8);    # the same y fills all eight format slots
+
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+// STORE_C subroutine: exchange c0..c7 through shared memory, write four rows (track00F/04F/08F/12F) at offsets +0 and +64, then advance each pointer by K1 and each crst index by 1.
+STORE_C:
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K
+--:-:-:-:1      IADD         crst00, crst00, 1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K
+--:-:-:-:1      IADD         crst04, crst04, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K
+--:-:-:-:1      IADD         crst08, crst08, 1;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K
+--:-:-:-:0      IADD         crst12, crst12, 1;
+
+// Warp shuffle to drop the awkward readAs/readBs mapping
+--:-:-:-:1      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], c4;
+
+--:-:1:-:1      LDS c0, [readCs + 4x<0*128 + 00>]; // c0/c2/c4/c6: values for the +0 column of the four rows
+--:-:2:-:1      LDS c2, [readCs + 4x<1*128 + 00>];
+--:-:3:-:1      LDS c4, [readCs + 4x<2*128 + 00>];
+--:-:4:-:a      LDS c6, [readCs + 4x<3*128 + 00>];
+
+[+
+    our $determ;
+    if ($determ)    # $determ set: plain STG stores; otherwise RED.E.ADD reductions into F
+    {
+        return q{
+01:-:-:-:1  @P0 STG.E.CG [track00F], c0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 STG.E.CG [track04F], c2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 STG.E.CG [track08F], c4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 STG.E.CG [track12F], c6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
+    else
+    {
+        return q{
+01:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F], c0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F], c2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F], c4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F], c6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
++]
+
+--:-:1:-:1      LDS c1, [readCs + 4x<0*128 + 64>]; // c1/c3/c5/c7: values for the +64 column, written under P0..P3 && P6
+--:-:2:-:1      LDS c3, [readCs + 4x<1*128 + 64>];
+--:-:3:-:1      LDS c5, [readCs + 4x<2*128 + 64>];
+--:-:4:-:a      LDS c7, [readCs + 4x<3*128 + 64>];
+
+[+
+    our $determ;
+    if ($determ)    # same STG / RED.E.ADD choice as above, for the +64 column
+    {
+        return q{
+01:1:-:-:1  @P0 STG.E.CG [track00F + 4x<64>], c1;
+02:2:-:-:1  @P1 STG.E.CG [track04F + 4x<64>], c3;
+04:3:-:-:1  @P2 STG.E.CG [track08F + 4x<64>], c5;
+08:4:-:-:1  @P3 STG.E.CG [track12F + 4x<64>], c7;
+        };
+    }
+    else
+    {
+        return q{
+01:1:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<64>], c1;
+02:2:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<64>], c3;
+04:3:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<64>], c5;
+08:4:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<64>], c7;
+        };
+    }
++]
+
+01:-:-:-:6      IADD   track00F0.CC, track00F0, K1; // advance every output pointer by K1 (= K * 4) for the next call
+--:-:-:-:1      IADD.X track00F1,    track00F1, RZ;
+02:-:-:-:6      IADD   track04F0.CC, track04F0, K1;
+--:-:-:-:1      IADD.X track04F1,    track04F1, RZ;
+04:-:-:-:6      IADD   track08F0.CC, track08F0, K1;
+--:-:-:-:1      IADD.X track08F1,    track08F1, RZ;
+08:-:-:-:6      IADD   track12F0.CC, track12F0, K1;
+--:-:-:-:0      IADD.X track12F1,    track12F1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/Convolution/Pascal/sconv_updat_C128_K64.sass b/Kernel/Convolution/Pascal/sconv_updat_C128_K64.sass
new file mode 100644
index 0000000..26cc64c
--- /dev/null
+++ b/Kernel/Convolution/Pascal/sconv_updat_C128_K64.sass
@@ -0,0 +1,818 @@
+# Kernel: sconv_updat_C128_K64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*2 + (64*16 + 32)*2>
+    szShareI  : (128*16 + 32)
+    szShareE  : (64*16  + 32)
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_offset_K     : c[0x0][0x15c]
+    param_N            : c[0x0][0x160]
+    param_K            : c[0x0][0x164]
+    param_D            : c[0x0][0x168]
+    param_H            : c[0x0][0x16c]
+    param_W            : c[0x0][0x170]
+    param_WN           : c[0x0][0x174]
+    param_HWN          : c[0x0][0x178]
+    param_DHWN         : c[0x0][0x17c]
+    param_C            : c[0x0][0x180]
+    param_CRST         : c[0x0][0x184]
+    param_RST          : c[0x0][0x188]
+    param_magic_RST    : c[0x0][0x18c]
+    param_shift_RST    : c[0x0][0x190]
+    param_RS           : c[0x0][0x194]
+    param_magic_RS     : c[0x0][0x198]
+    param_shift_RS     : c[0x0][0x19c]
+    param_S            : c[0x0][0x1a0]
+    param_magic_S      : c[0x0][0x1a4]
+    param_shift_S      : c[0x0][0x1a8]
+    param_pad_d        : c[0x0][0x1ac]
+    param_pad_h        : c[0x0][0x1b0]
+    param_pad_w        : c[0x0][0x1b4]
+    param_str_d        : c[0x0][0x1b8]
+    param_str_h        : c[0x0][0x1bc]
+    param_str_w        : c[0x0][0x1c0]
+    param_dil_d        : c[0x0][0x1c4]
+    param_dil_h        : c[0x0][0x1c8]
+    param_dil_w        : c[0x0][0x1cc]
+    param_P            : c[0x0][0x1d0]
+    param_Q            : c[0x0][0x1d4]
+    param_PQ           : c[0x0][0x1d8]
+    param_QN           : c[0x0][0x1dc]
+    param_PQN          : c[0x0][0x1e0]
+    param_MPQN         : c[0x0][0x1e4]
+    param_magic_Q      : c[0x0][0x1e8]
+    param_shift_Q      : c[0x0][0x1ec]
+    param_magic_PQ     : c[0x0][0x1f0]
+    param_shift_PQ     : c[0x0][0x1f4]
+    param_grid_P       : c[0x0][0x1f8]
+    param_grid_Q       : c[0x0][0x1fc]
+    param_grid_PQ      : c[0x0][0x200]
+    param_CRSTK        : c[0x0][0x204]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-67   ~ tid, blkI, blkE, one
+    68-99   ~ blkMPQ, tidX, tid1, shiftX, magicPQ, magicQ, negQ, negPQ, pq, div1, div2, div3
+
+    64-72   ~ c<0-1>, z<0-1>, y<0-1>, x<0-1>, Q
+    73-99   ~ mt, pr, qs, r<0-1>, s<0-1>, t<0-1>, rst<0-1>, rs<0-1>
+    73-99   ~ te, ti<0-1>, xw<0-1>, xW<0-1>, yh<0-1>, yH<0-1>, zd<0-1>, zD<0-1>, cC<0-1>, nextP, nextQ
+
+    64-79   : j0Ex<0-7>, j0Iy<0-7>
+    80-95   : j1Ex<0-7>, j1Iy<0-7>
+
+    100-147 : load0I<00-15>, load1I<00-15>, loadE<00-15>
+    148-153 : track0I<0-1>,  track1I<0-1>,  trackE<0-1>
+
+    154-164 ~ writeIs, writeEs, loopN, m, p, q, qq, k, crst<0-1>, tidY
+    165-167 ~ readIs, readEs, swapBuf
+
+     68-83  : f<0-7>, track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
+     84-164 ~ K, K4, K1, K60, tid31, tid96, kk, tf, writeCs, readCs, crst<00|04|08|12>, alpha, blk_MPQ, CRSTK, xmad_determ
+
+</REGISTER_MAPPING>
+
+--:-:-:-:0      MOV one, 1; // Select which blockIdx axis feeds blkMPQ/blkI/blkE, depending on RST == 1
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT; // P0 = (param_RST == 1)
+--:-:-:-:5  @P0 BRA.U CTAID1;
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.X; // RST != 1: blkMPQ <- CTAID.X, blkI <- CTAID.Y, blkE <- CTAID.Z
+--:-:3:-:1      S2R blkI,   SR_CTAID.Y;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Z;
+--:-:-:-:5      BRA.U END_CTAID1;
+CTAID1:
+--:-:2:-:1      S2R blkMPQ, SR_CTAID.Z; // RST == 1: blkI <- CTAID.X, blkE <- CTAID.Y, blkMPQ <- CTAID.Z
+--:-:3:-:1      S2R blkI,   SR_CTAID.X;
+--:-:4:-:1      S2R blkE,   SR_CTAID.Y;
+END_CTAID1:
+
+<SCHEDULE_BLOCK>
+// tidX   = tid >> 1
+// tidY   = (tid & 1) << 2
+// shiftX = (tid & 1) << 4
+01:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      SHR.U32 tidX,   tid,  1;
+--:-:-:-:1      SHL     tidY,   tid1, 2;
+--:-:-:-:1      SHL     shiftX, tid1, 4;
+// Zero the accumulators: store zeros at addr_zero, then load them back into czero00..czero60 (128 bits each)
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;
+</CODE>
+
+--:-:-:-:1      MOV  magicPQ,    param_magic_PQ;
+--:-:-:-:1      MOV  magicQ,     param_magic_Q;
+--:-:-:-:1      IADD negQ,  RZ, -param_grid_Q;
+--:-:-:-:1      IADD negPQ, RZ, -param_grid_PQ;
+
+--:-:-:-:1      ISETP.NE.AND P1, PT, magicPQ, 1, PT; // P1 = magicPQ != 1 (otherwise only the shift is applied)
+--:-:-:-:1      ISETP.NE.AND P2, PT, magicQ,  1, PT; // P2 = magicQ  != 1 (otherwise only the shift is applied)
+
+// m = blkMPQ / PQ  (multiply by magicPQ then shift; shift only when magicPQ == 1)
+02:-:-:-:1  @P1 XMAD     div1, blkMPQ,    magicPQ,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blkMPQ,    magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blkMPQ.H1, magicPQ.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blkMPQ.H1, magicPQ,    div1;
+--:-:-:-:1  @P1 IADD3.RS m, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  m, m,      param_shift_PQ;
+--:-:-:-:1 @!P1 SHR.U32  m, blkMPQ, param_shift_PQ;
+
+// pq = blkMPQ % PQ  (computed as blkMPQ - grid_PQ * m)
+--:-:-:-:1      XMAD.LO2 pq, negPQ, m, blkMPQ;
+
+// p = pq / Q  (multiply by magicQ then shift; shift only when magicQ == 1)
+--:-:-:-:1  @P2 XMAD     div1, pq,    magicQ,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, pq,    magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, pq.H1, magicQ.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, pq.H1, magicQ,    div1;
+--:-:-:-:1  @P2 IADD3.RS p, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  p, p,  param_shift_Q;
+--:-:-:-:1 @!P2 SHR.U32  p, pq, param_shift_Q;
+
+// q = pq % Q  (computed as pq - grid_Q * p)
+--:-:-:-:1      XMAD.S16.S16 q, negQ, p, pq;
+--:-:-:-:1      MOV qq, q; // keep the starting q; the main loop copies it back into q when it advances p
+
+// writeIs = (tidY*128 + tidX + shiftX) * 4 + 4x<szShareI + szShareE>
+--:-:-:-:1      ISCADD writeIs, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeIs, writeIs, shiftX;
+--:-:-:-:1      ISCADD writeIs, writeIs, 4x<szShareI + szShareE>, 2;
+
+// writeEs = (tidY*64 + tidX + shiftX) * 4 + 4x<szShareI*2 + szShareE>
+--:-:-:-:1      ISCADD writeEs, tidY, tidX, 6;
+--:-:-:-:1      IADD   writeEs, writeEs, shiftX;
+--:-:-:-:1      ISCADD writeEs, writeEs, 4x<szShareI*2 + szShareE>, 2;
+
+// readIs  = (((tid & -16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND readIs, tid,   -16;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      LOP.OR  readIs, readIs, tid1;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+// readEs = (((tid >> 1) & 7) << 4) + 4x<szShareI>;
+--:-:-:-:1      BFE.U32 readEs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readEs, readEs, 4x<szShareI>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareI + szShareE>; // signed byte offset used to flip the read/write pointers between buffers
+
+// crst0 = blkI*128 + tidX, crst1 = crst0 + 64
+04:-:-:-:1      ISCADD crst0, blkI, tidX, 7;
+--:-:-:-:1      IADD   crst1, crst0, 64;
+
+// k = blkE*64 + tidX + offset_K
+08:-:-:-:1      ISCADD k, blkE, tidX, 6;
+--:-:-:-:1      IADD   k, k, param_offset_K;
+
+--:-:-:-:1      MOV loopN, RZ;
+
+--:-:-:-:1      PSETP.AND.AND P0, PT, PT, PT, PT; // P0 = true: the first pass branches to FIRST_LOAD
+</SCHEDULE_BLOCK>
+
+NEXT_PQ:
+// Per-(p,q) setup: split crst0/crst1 into c,t,r,s and derive the input coordinates z,y,x of both I tracks.
+<SCHEDULE_BLOCK>
+// Zigzag q but only if grid_P < P
+--:-:-:-:1      LOP.AND.NZ P1, RZ, p, 1; // P1 = p is odd
+--:-:-:-:1      MOV Q, param_grid_P;
+--:-:-:-:1      ISETP.LT.AND P1, PT, Q, param_P, P1; // P1 = (grid_P < P) && p is odd
+--:-:-:-:1      MOV Q, -1;
+--:-:-:-:1  @P1 IADD3 Q, -q, param_Q, Q; // reversed column: Q = param_Q - q - 1
+--:-:-:-:1 @!P1 MOV Q, q;
+// c   = crst / RST
+// rst = crst % RST
+--:-:-:-:1      XMAD.LO2C  c0, crst0, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32    c0, c0, param_shift_RST;
+--:-:-:-:1      XMAD rst0, c0, param_RST, RZ;
+--:-:-:-:1      IADD rst0, -rst0, crst0;
+--:-:-:-:1      XMAD.LO2C  c1, crst1, param_magic_RST, RZ;
+--:-:-:-:1      SHR.U32    c1, c1, param_shift_RST;
+--:-:-:-:1      XMAD rst1, c1, param_RST, RZ;
+--:-:-:-:1      IADD rst1, -rst1, crst1;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C  t0, rst0, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32    t0, t0, param_shift_RS;
+--:-:-:-:1      XMAD  rs0, t0, param_RS, RZ;
+--:-:-:-:1      IADD  rs0, -rs0, rst0;
+--:-:-:-:1      XMAD.LO2C  t1, rst1, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32    t1, t1, param_shift_RS;
+--:-:-:-:1      XMAD  rs1, t1, param_RS, RZ;
+--:-:-:-:1      IADD  rs1, -rs1, rst1;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C  r0, rs0, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32    r0, r0, param_shift_S;
+--:-:-:-:1      XMAD   s0, r0, param_S, RZ;
+--:-:-:-:1      IADD   s0, -s0, rs0;
+--:-:-:-:1      XMAD.LO2C  r1, rs1, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32    r1, r1, param_shift_S;
+--:-:-:-:1      XMAD   s1, r1, param_S, RZ;
+--:-:-:-:1      IADD   s1, -s1, rs1;
+// z = m * w - pad_d + (t * dil_d)    (w = str_d)
+// y = p * u - pad_h + (r * dil_h)    (u = str_h)
+// x = q * v - pad_w + (s * dil_w)    (v = str_w; q here is the zigzagged Q)
+--:-:-:-:1      XMAD  mt, m,   param_str_d, RZ;
+--:-:-:-:1      XMAD  pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD  qs, Q,   param_str_w, RZ;
+--:-:-:-:1      XMAD  z1, t1,  param_dil_d, mt;
+--:-:-:-:1      XMAD  y1, r1,  param_dil_h, pr;
+--:-:-:-:1      XMAD  x1, s1,  param_dil_w, qs;
+--:-:-:-:1      XMAD  z0, t0,  param_dil_d, mt;
+--:-:-:-:1      XMAD  y0, r0,  param_dil_h, pr; // track 0 result goes to y0, which the pad subtraction below reads
+--:-:-:-:1      XMAD  x0, s0,  param_dil_w, qs; // track 0 result goes to x0; s is scaled by the dilation, as for x1
+--:-:-:-:1      IADD  z1, z1, -param_pad_d;
+--:-:-:-:1      IADD  y1, y1, -param_pad_h;
+--:-:-:-:1      IADD  x1, x1, -param_pad_w;
+--:-:-:-:1      IADD  z0, z0, -param_pad_d;
+--:-:-:-:1      IADD  y0, y0, -param_pad_h;
+--:-:-:-:1      IADD  x0, x0, -param_pad_w;
+</SCHEDULE_BLOCK>
+
+// Split blocks to fit inside of 36 registers
+<SCHEDULE_BLOCK>
+// trackI = c*DHWN + z*HWN + y*WN + x*N + tidY
+--:-:-:-:1      XMAD.LO2C ti0, c0, param_DHWN, tidY;
+--:-:-:-:1      XMAD.LO2C ti0, z0, param_HWN,  ti0;
+--:-:-:-:1      XMAD.LO2C ti0, y0, param_WN,   ti0;
+--:-:-:-:1      XMAD      ti0, x0, param_N,    ti0;
+--:-:-:-:1      XMAD.LO2C ti1, c1, param_DHWN, tidY;
+--:-:-:-:1      XMAD.LO2C ti1, z1, param_HWN,  ti1;
+--:-:-:-:1      XMAD.LO2C ti1, y1, param_WN,   ti1;
+--:-:-:-:1      XMAD      ti1, x1, param_N,    ti1;
+--:-:-:-:1      LEA      track0I0.CC, ti0, param_I[0],     2; // track0I = param_I + ti0*4 (low word, carry out)
+--:-:-:-:1      LEA.HI.X track0I1,    ti0, param_I[1], RZ, 2; // high word with carry in
+--:-:-:-:1      LEA      track1I0.CC, ti1, param_I[0],     2;
+--:-:-:-:1      LEA.HI.X track1I1,    ti1, param_I[1], RZ, 2;
+
+// trackE = k*MPQN + m*PQN + p*QN + Q*N + tidY
+--:-:-:-:1      XMAD.LO2C te, k, param_MPQN, tidY;
+--:-:-:-:1      XMAD.LO2C te, m, param_PQN,  te;
+--:-:-:-:1      XMAD.LO2C te, p, param_QN,   te;
+--:-:-:-:1      XMAD      te, Q, param_N,    te;
+--:-:-:-:1      LEA      trackE0.CC, te, param_E[0],     2;
+--:-:-:-:0      LEA.HI.X trackE1,    te, param_E[1], RZ, 2;
+
+// Bounds check x,y,z,c for each I track.
+// If out of bounds, this will set the low word of the track address (track?I0) to -1
+--:-:-:-:1      ISET.GE.AND cC0, c0, param_C, PT;
+--:-:-:-:1      ISET.LT.AND zd0, z0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD0, z0, param_D, PT;
+--:-:-:-:1      ISET.LT.AND yh0, y0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH0, y0, param_H, PT;
+--:-:-:-:1      ISET.LT.AND xw0, x0, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW0, x0, param_W, PT;
+--:-:-:-:1      LOP.OR   track0I0, track0I0, cC0;
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, zd0, zD0, 0xfe; // LUT 0xfe = a | b | c
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, yh0, yH0, 0xfe;
+--:-:-:-:1      LOP3.LUT track0I0, track0I0, xw0, xW0, 0xfe;
+
+--:-:-:-:1      ISET.GE.AND cC1, c1, param_C, PT;
+--:-:-:-:1      ISET.LT.AND zd1, z1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND zD1, z1, param_D, PT;
+--:-:-:-:1      ISET.LT.AND yh1, y1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND yH1, y1, param_H, PT;
+--:-:-:-:1      ISET.LT.AND xw1, x1, RZ, PT;
+--:-:-:-:1      ISET.GE.AND xW1, x1, param_W, PT;
+--:-:-:-:1      LOP.OR   track1I0, track1I0, cC1;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, zd1, zD1, 0xfe;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, yh1, yH1, 0xfe;
+--:-:-:-:1      LOP3.LUT track1I0, track1I0, xw1, xW1, 0xfe;
+
+--:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+
+--:-:-:-:1      ISETP.NE.AND P2, PT, track0I0, -1, PT; // P2 = I track 0 is in bounds
+--:-:-:-:1      ISETP.NE.AND P3, PT, track1I0, -1, PT; // P3 = I track 1 is in bounds
+--:-:-:-:1      ISETP.LT.AND P4, PT, k, param_K, PT; // P4 = k < K
+--:-:-:-:1      ISETP.LT.AND P5, PT, nextQ, param_Q, PT; // P5 = q + grid_Q < Q
+--:-:-:-:1      ISETP.LT.AND P6, PT, nextP, param_P, PT; // P6 = p + grid_P < P
+
+--:-:-:-:1      IADD loopN, loopN, param_N;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:5  @P0 BRA.U FIRST_LOAD; // P0 is set true at the end of the setup block and cleared at FIRST_LOAD
+
+INIT_LOOP:
+
+--:-:1:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*64  + 00>]; // preload the step-0 E and I operands (j0*) from shared memory
+--:-:1:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*64  + 32>];
+--:-:1:-:2      LDS.U.128 j0Iy4, [readIs + 4x<0*128 + 64>];
+
+NEXT_16N:
+
+<CODE>
+    # Generates the unrolled loop body: 16 steps (j), each with 64 FFMAs (c), plus interleaved extra instructions.
+    my %insert =
+    (
+        j0c8   => "--:-:-:-:1      IADD loopN, loopN, -16;\n",
+
+        # p0 = (N & 16) == 0
+        # p1 = N >= 32 && p0
+        j0c14  => "--:-:-:-:1      LOP.AND.NZ P0, RZ, loopN, 16;\n",
+        j0c28  => "--:-:-:-:1      ISETP.GE.AND P1, PT, loopN, 32, P0;\n",
+
+        # I track 0: store one half of the prefetched registers to shared (selected by P0), then prefetch again
+        j1c8   => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 0*128 +  0 +  0>], load0I08;\n",
+        j1c10  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 1*128 +  0 +  0>], load0I09;\n",
+        j1c12  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 2*128 +  0 +  0>], load0I10;\n",
+        j1c14  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 3*128 +  0 +  0>], load0I11;\n",
+        j1c16  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 8*128 +  0 + 16>], load0I12;\n",
+        j1c18  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 9*128 +  0 + 16>], load0I13;\n",
+        j1c20  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<10*128 +  0 + 16>], load0I14;\n",
+        j1c22  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<11*128 +  0 + 16>], load0I15;\n",
+
+        j2c8   => "02:-:-:-:1 \@!P0 STS [writeIs + 4x< 0*128 +  0 +  0>], load0I00;\n",
+        j2c10  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 1*128 +  0 +  0>], load0I01;\n",
+        j2c12  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 2*128 +  0 +  0>], load0I02;\n",
+        j2c14  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 3*128 +  0 +  0>], load0I03;\n",
+        j2c16  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 8*128 +  0 + 16>], load0I04;\n",
+        j2c18  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 9*128 +  0 + 16>], load0I05;\n",
+        j2c20  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x<10*128 +  0 + 16>], load0I06;\n",
+        j2c22  => "--:2:-:-:1 \@!P0 STS [writeIs + 4x<11*128 +  0 + 16>], load0I07;\n",
+
+        j2c24  => "--:-:-:-:1      ISETP.NE.AND P2, PT, track0I0, -1, P1;\n",
+        j2c26  => "--:-:-:-:1      ISETP.EQ.AND P3, PT, track0I0, -1, P1;\n",
+
+        j3c8   => "02:-:-:-:1  \@P2 LDG.E.CI.128 load0I00, [track0I + 4x< 0>];\n",
+        j3c10  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load0I04, [track0I + 4x< 8>];\n",
+        j3c12  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load0I08, [track0I + 4x<16>];\n",
+        j3c14  => "--:5:2:-:1  \@P2 LDG.E.CI.128 load0I12, [track0I + 4x<24>];\n",
+
+        j4c8   => "--:-:-:-:1  \@P3 LDS.U.128 load0I00, [addr_zero];\n",
+        j4c10  => "--:-:-:-:1  \@P3 LDS.U.128 load0I04, [addr_zero];\n",
+        j5c8   => "--:-:-:-:1  \@P3 LDS.U.128 load0I08, [addr_zero];\n",
+        j5c10  => "--:-:-:-:1  \@P3 LDS.U.128 load0I12, [addr_zero];\n",
+
+        j5c57  => "10:-:-:-:1  \@P2 IADD   track0I0.CC, track0I0, 4x<32>;\n",
+        j5c62  => "--:-:-:-:1  \@P2 IADD.X track0I1,    track0I1, RZ;\n",
+
+        j6c8   => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 0*128 + 64 +  0>], load1I08;\n",
+        j6c10  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 1*128 + 64 +  0>], load1I09;\n",
+        j6c12  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 2*128 + 64 +  0>], load1I10;\n",
+        j6c14  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 3*128 + 64 +  0>], load1I11;\n",
+        j6c16  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I12;\n",
+        j6c18  => "--:-:-:-:1  \@P0 STS [writeIs + 4x< 9*128 + 64 + 16>], load1I13;\n",
+        j6c20  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<10*128 + 64 + 16>], load1I14;\n",
+        j6c22  => "--:-:-:-:1  \@P0 STS [writeIs + 4x<11*128 + 64 + 16>], load1I15;\n",
+
+        j7c8   => "04:-:-:-:1 \@!P0 STS [writeIs + 4x< 0*128 + 64 +  0>], load1I00;\n",
+        j7c10  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 1*128 + 64 +  0>], load1I01;\n",
+        j7c12  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 2*128 + 64 +  0>], load1I02;\n",
+        j7c14  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 3*128 + 64 +  0>], load1I03;\n",
+        j7c16  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 8*128 + 64 + 16>], load1I04;\n",
+        j7c18  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x< 9*128 + 64 + 16>], load1I05;\n",
+        j7c20  => "--:-:-:-:1 \@!P0 STS [writeIs + 4x<10*128 + 64 + 16>], load1I06;\n",
+        j7c22  => "--:3:-:-:1 \@!P0 STS [writeIs + 4x<11*128 + 64 + 16>], load1I07;\n",
+
+        j7c24  => "--:-:-:-:1      ISETP.NE.AND P2, PT, track1I0, -1, P1;\n",
+        j7c26  => "--:-:-:-:1      ISETP.EQ.AND P3, PT, track1I0, -1, P1;\n",
+
+        j8c8   => "04:-:-:-:1  \@P2 LDG.E.CI.128 load1I00, [track1I + 4x< 0>];\n",
+        j8c10  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load1I04, [track1I + 4x< 8>];\n",
+        j8c12  => "--:-:-:-:1  \@P2 LDG.E.CI.128 load1I08, [track1I + 4x<16>];\n",
+        j8c14  => "--:5:3:-:1  \@P2 LDG.E.CI.128 load1I12, [track1I + 4x<24>];\n",
+
+        j9c8   => "--:-:-:-:1  \@P3 LDS.U.128 load1I00, [addr_zero];\n",
+        j9c10  => "--:-:-:-:1  \@P3 LDS.U.128 load1I04, [addr_zero];\n",
+        j10c8  => "--:-:-:-:1  \@P3 LDS.U.128 load1I08, [addr_zero];\n",
+        j10c10 => "--:-:-:-:1  \@P3 LDS.U.128 load1I12, [addr_zero];\n",
+
+        j10c57 => "10:-:-:-:1  \@P2 IADD   track1I0.CC, track1I0, 4x<32>;\n",
+        j10c62 => "--:-:-:-:1  \@P2 IADD.X track1I1,    track1I1, RZ;\n",
+
+        # E track: same store / prefetch pattern, gated on k < K instead of an address check
+        j11c8   => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 0*64 +  0>], loadE08;\n",
+        j11c10  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 1*64 +  0>], loadE09;\n",
+        j11c12  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 2*64 +  0>], loadE10;\n",
+        j11c14  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 3*64 +  0>], loadE11;\n",
+        j11c16  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 8*64 + 16>], loadE12;\n",
+        j11c18  => "--:-:-:-:1  \@P0 STS [writeEs + 4x< 9*64 + 16>], loadE13;\n",
+        j11c20  => "--:-:-:-:1  \@P0 STS [writeEs + 4x<10*64 + 16>], loadE14;\n",
+        j11c22  => "--:-:-:-:1  \@P0 STS [writeEs + 4x<11*64 + 16>], loadE15;\n",
+
+        j12c8   => "08:-:-:-:1 \@!P0 STS [writeEs + 4x< 0*64 +  0>], loadE00;\n",
+        j12c10  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 1*64 +  0>], loadE01;\n",
+        j12c12  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 2*64 +  0>], loadE02;\n",
+        j12c14  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 3*64 +  0>], loadE03;\n",
+        j12c16  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 8*64 + 16>], loadE04;\n",
+        j12c18  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x< 9*64 + 16>], loadE05;\n",
+        j12c20  => "--:-:-:-:1 \@!P0 STS [writeEs + 4x<10*64 + 16>], loadE06;\n",
+        j12c22  => "--:4:-:-:1 \@!P0 STS [writeEs + 4x<11*64 + 16>], loadE07;\n",
+
+        j12c24  => "--:-:-:-:1      ISETP.LT.AND P2, PT, k, param_K,  P1;\n",
+
+        j13c8   => "08:-:-:-:1  \@P2 LDG.E.CI.128 loadE00, [trackE + 4x< 0>];\n",
+        j13c10  => "--:-:-:-:1  \@P2 LDG.E.CI.128 loadE04, [trackE + 4x< 8>];\n",
+        j13c12  => "--:-:-:-:1  \@P2 LDG.E.CI.128 loadE08, [trackE + 4x<16>];\n",
+        j13c14  => "--:5:4:-:1  \@P2 LDG.E.CI.128 loadE12, [trackE + 4x<24>];\n",
+
+        j15c57  => "10:-:-:-:1  \@P2 IADD   trackE0.CC, trackE0, 4x<32>;\n",
+        j15c62  => "--:-:-:-:1  \@P2 IADD.X trackE1,    trackE1, RZ;\n",
+
+        # p0 = N >= 16 and not (N == 32 and (p or q))
+        j14c8   => "--:-:-:-:1      ISETP.EQ.AND  P0, PT, loopN, 32, PT;\n",
+        j14c10  => "--:-:-:-:1      ISETP.GE.AND  P1, PT, loopN, 16, PT;\n",
+        j14c22  => "--:-:-:-:1      PSETP.OR.AND  P0, PT, P5, P6, P0;\n",
+        j14c35  => "--:-:-:-:1      PSETP.AND.AND P0, PT, !P0, P1, PT;\n",
+
+        j14c63 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                  "20:-:-:-:1      IADD readEs,  readEs, -swapBuf;\n" .
+                  "--:-:-:-:1      IADD readIs,  readIs, -swapBuf;\n" .
+                  "--:-:-:-:1      IADD writeEs, writeEs, swapBuf;\n" .
+                  "--:-:-:-:1      IADD writeIs, writeIs, swapBuf;\n" .
+                  "--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j15c63 => "--:-:-:Y:5  \@P0 BRA.U NEXT_16N;\n" .
+                  "--:-:-:-:0  \@P5 IADD q, q, param_grid_Q;\n" .
+                  "01:-:-:Y:5  \@P5 BRA.U NEXT_PQ;\n" .
+                  "--:-:-:-:1  \@P6 MOV  q, qq;\n" .
+                  "--:-:-:-:0  \@P6 IADD p, p, param_grid_P;\n" .
+                  "--:-:-:Y:5  \@P6 BRA.U NEXT_PQ;\n" .
+                  "--:-:-:Y:5      BRA.U FINISH;\n",
+    );
+
+    my @cOrder;                               # visiting order of the 64 (x,y) accumulators within one step
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);    # (dx,dy) offsets applied around each base (x,y)
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;                      # walk the y bases in the opposite direction for the next x pair
+    }
+
+    my $out;
+    foreach my $j (0 .. 15)
+    {
+        my $odd      = $j & 1;                # register set (j0*/j1*) read by this step's FFMAs
+        my $nOdd     = 1 - $odd;              # the other set, filled by this step's LDS for the next step
+        my $rsOffset = ($j + 1) & 15;         # shared-memory row for the next step (wraps to 0 after step 15)
+        my $rsPred   = $j == 15 ? '@P0' : '   ';   # the wrap-around loads are only issued when P0 is set
+        my $shift    = $rsOffset < 4 ? 0 : $rsOffset < 12 ? 1 : 2;   # extra 16-word offset per row group
+        my $barrier  = $j == 14 ? '6' : '-';  # step 14's last LDS sets barrier 6; j14c63 waits on mask 20
+
+        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64  + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dIy0, [readIs + 4x<%d*128 + 00 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64  + 32 + %d*16>];\n", $rsPred, $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c6"} = sprintf "--:%s:1:-:1  %s LDS.U.128 j%dIy4, [readIs + 4x<%d*128 + 64 + %d*16>];\n", $barrier, $rsPred, $nOdd, $rsOffset, $shift;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';   # extra text emitted right after this FFMA, if any
+
+            my $stall  = $ins =~ /LDS|I2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;   # stall 0 when one of these follows
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';   # yield flag at the midpoint unless stall is 0
+
+            my $wait   = $c == 0 ? '01' : '--';   # first FFMA of each step carries wait mask 01
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+FIRST_LOAD:
+// First pass only: fetch the initial I/E tiles (zeros where a track is out of range) and store them to shared.
+--:-:-:-:0      PSETP.AND.AND P0, PT, PT, PT, !PT; // P0 = false
+
+--:-:-:-:1  @P2 LDG.E.CI.128 load0I00, [track0I + 4x< 0>];
+--:-:-:-:1  @P2 LDG.E.CI.128 load0I04, [track0I + 4x< 8>];
+--:-:-:-:1  @P2 LDG.E.CI.128 load0I08, [track0I + 4x<16>];
+--:-:1:-:1  @P2 LDG.E.CI.128 load0I12, [track0I + 4x<24>];
+--:-:-:-:1 @!P2 LDS.U.128    load0I00, [addr_zero];
+--:-:-:-:1 @!P2 LDS.U.128    load0I04, [addr_zero];
+--:-:-:-:1 @!P2 LDS.U.128    load0I08, [addr_zero];
+--:-:4:-:1 @!P2 LDS.U.128    load0I12, [addr_zero];
+
+// p1 = N == 32 and (p or q)
+--:-:-:-:0      ISETP.EQ.AND  P1, PT, loopN, 32, PT;
+
+--:-:-:-:1  @P3 LDG.E.CI.128 load1I00, [track1I + 4x< 0>];
+--:-:-:-:1  @P3 LDG.E.CI.128 load1I04, [track1I + 4x< 8>];
+--:-:-:-:1  @P3 LDG.E.CI.128 load1I08, [track1I + 4x<16>];
+--:-:2:-:1  @P3 LDG.E.CI.128 load1I12, [track1I + 4x<24>];
+--:-:-:-:1 @!P3 LDS.U.128    load1I00, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.128    load1I04, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.128    load1I08, [addr_zero];
+--:-:5:-:1 @!P3 LDS.U.128    load1I12, [addr_zero];
+
+--:-:-:-:1  @P4 LDG.E.CI.128 loadE00, [trackE + 4x< 0>];
+--:-:-:-:1  @P4 LDG.E.CI.128 loadE04, [trackE + 4x< 8>];
+--:-:-:-:1  @P4 LDG.E.CI.128 loadE08, [trackE + 4x<16>];
+--:-:3:-:1  @P4 LDG.E.CI.128 loadE12, [trackE + 4x<24>];
+--:-:-:-:1 @!P4 LDS.U.128    loadE00, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128    loadE04, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128    loadE08, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128    loadE12, [addr_zero];
+
+--:-:-:-:0      PSETP.OR.AND  P1, PT, P5, P6, P1; // P1 = (P5 || P6) && loopN == 32
+
+09:-:-:-:1      STS [writeIs + 4x< 0*128 +  0 +  0>], load0I00; // wait mask 09 = barriers 1 and 4 (the load0I LDG / LDS above)
+--:-:-:-:1      STS [writeIs + 4x< 1*128 +  0 +  0>], load0I01;
+--:-:-:-:1      STS [writeIs + 4x< 2*128 +  0 +  0>], load0I02;
+--:-:-:-:1      STS [writeIs + 4x< 3*128 +  0 +  0>], load0I03;
+--:-:-:-:1      STS [writeIs + 4x< 8*128 +  0 + 16>], load0I04;
+--:-:-:-:1      STS [writeIs + 4x< 9*128 +  0 + 16>], load0I05;
+--:-:-:-:1      STS [writeIs + 4x<10*128 +  0 + 16>], load0I06;
+--:-:-:-:1      STS [writeIs + 4x<11*128 +  0 + 16>], load0I07;
+
+--:-:-:-:6  @P2 IADD   track0I0.CC, track0I0, 4x<32>; // advance the I track 0 pointer by 32 words
+--:-:-:-:0  @P2 IADD.X track0I1,    track0I1, RZ;
+
+12:-:-:-:1      STS [writeIs + 4x< 0*128 + 64 +  0>], load1I00; // wait mask 12 = barriers 2 and 5 (the load1I LDG / LDS above)
+--:-:-:-:1      STS [writeIs + 4x< 1*128 + 64 +  0>], load1I01;
+--:-:-:-:1      STS [writeIs + 4x< 2*128 + 64 +  0>], load1I02;
+--:-:-:-:1      STS [writeIs + 4x< 3*128 + 64 +  0>], load1I03;
+--:-:-:-:1      STS [writeIs + 4x< 8*128 + 64 + 16>], load1I04;
+--:-:-:-:1      STS [writeIs + 4x< 9*128 + 64 + 16>], load1I05;
+--:-:-:-:1      STS [writeIs + 4x<10*128 + 64 + 16>], load1I06;
+--:-:-:-:1      STS [writeIs + 4x<11*128 + 64 + 16>], load1I07;
+
+--:-:-:-:3  @P3 IADD   track1I0.CC, track1I0, 4x<32>;
+--:-:-:-:2      PSETP.AND.AND P5, PT, P5, P1, PT; // keep P5/P6 only when P1 holds
+--:-:-:-:1      PSETP.AND.AND P6, PT, P6, P1, PT;
+--:-:-:-:0  @P3 IADD.X track1I1,    track1I1, RZ;
+
+24:-:-:-:1      STS [writeEs + 4x< 0*64 +  0>], loadE00; // wait mask 24 = barriers 3 and 6 (the loadE LDG / LDS above)
+--:-:-:-:1      STS [writeEs + 4x< 1*64 +  0>], loadE01;
+--:-:-:-:1      STS [writeEs + 4x< 2*64 +  0>], loadE02;
+--:-:-:-:1      STS [writeEs + 4x< 3*64 +  0>], loadE03;
+--:-:-:-:1      STS [writeEs + 4x< 8*64 + 16>], loadE04;
+--:-:-:-:1      STS [writeEs + 4x< 9*64 + 16>], loadE05;
+--:-:-:-:1      STS [writeEs + 4x<10*64 + 16>], loadE06;
+--:1:-:-:1      STS [writeEs + 4x<11*64 + 16>], loadE07;
+
+--:-:-:-:6  @P4 IADD   trackE0.CC, trackE0, 4x<32>;
+--:-:-:-:1  @P4 IADD.X trackE1,    trackE1, RZ;
+
+--:-:-:-:1      IADD readEs,  readEs, -swapBuf; // flip read and write pointers to the other shared buffer
+--:-:-:-:0      IADD readIs,  readIs, -swapBuf;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeEs, writeEs, swapBuf;
+--:-:-:-:1      IADD writeIs, writeIs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;
+
+--:-:-:-:1      IADD nextQ, q, param_grid_Q;
+--:-:-:-:1      IADD nextP, p, param_grid_P;
+
+--:-:-:-:0  @P5 IADD q, q, param_grid_Q;
+--:-:-:Y:5  @P5 BRA.U NEXT_PQ;
+--:-:-:-:0  @P6 IADD p, p, param_grid_P;
+--:-:-:Y:5  @P6 BRA.U NEXT_PQ;
+
+--:-:-:-:2      ISETP.LT.AND P5, PT, nextQ, param_Q, PT; // neither branch taken: recompute P5/P6 before entering the loop
+--:-:-:-:0      ISETP.LT.AND P6, PT, nextP, param_P, PT;
+
+--:-:-:Y:5      BRA.U INIT_LOOP;
+
+
+FINISH:
+// Epilogue setup: re-read the thread/block indices and build the output (F) addressing for STORE_C.
+--:-:-:-:0      MOV one, 1;
+--:-:1:-:6      S2R tid, SR_TID.X;
+--:-:-:Y:d      ISETP.EQ.AND P0, PT, one, param_RST, PT; // P0 = (param_RST == 1), same axis choice as at kernel entry
+--:-:-:-:5  @P0 BRA.U CTAID2;
+--:-:2:-:1      S2R blkI,    SR_CTAID.Y;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Z;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.X;
+--:-:-:-:5      BRA.U END_CTAID2;
+CTAID2:
+--:-:2:-:1      S2R blkI,    SR_CTAID.X;
+--:-:3:-:1      S2R blkE,    SR_CTAID.Y;
+--:-:4:-:1      S2R blk_MPQ, SR_CTAID.Z;
+END_CTAID2:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT; // P0 = swapBuf > 0
+--:-:-:-:1      IADD readEs,  readEs, -4x<szShareI>; // remove the 4x<szShareI> base that was added to readEs during setup
+--:-:-:-:1  @P0 IADD readIs,  readIs, -swapBuf;
+--:-:-:-:1  @P0 IADD readEs,  readEs, -swapBuf;
+
+// writeCs = (readIs / 4) * 64 + readEs;
+--:-:-:-:1      ISCADD  writeCs, readIs, readEs, 4;
+
+
+// readCs = (((tid & 96) << 3) | (tid & 31)) << 2
+01:-:-:-:1      LOP.AND tid31, tid, 31;
+01:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 3;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+
+// kk = blkE*64 + tid31 + offset_K;
+04:-:-:-:1      ISCADD kk, blkE, tid31, 6;
+--:-:-:-:1      IADD   kk, kk, param_offset_K;
+
+
+// crst = blkI*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 crst00, tid96,  1;
+02:-:-:-:1      ISCADD  crst00, blkI,   crst00, 7;
+--:-:-:-:1      IADD    crst04, crst00, 4;
+--:-:-:-:1      IADD    crst08, crst00, 8;
+--:-:-:-:1      IADD    crst12, crst00, 12;
+
+--:-:-:-:1      MOV K, param_K;
+--:-:-:-:1      SHL K1, K, 2; // K1  = 4*K   : byte stride of 1 crst row
+--:-:-:-:1      SHL K4, K, 4; // K4  = 16*K  : byte stride of 4 crst rows
+--:-:-:-:1      ISCADD K60, K, -K4, 8; // K60 = 256*K - 16*K = 240*K : byte stride of 60 crst rows
+
+// trackF += crst*K + k;
+--:-:-:-:1      VMAD.U16.U16 tf, crst00, K, kk;
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+--:-:-:-:1      MOV CRSTK, param_CRSTK;
+08:-:-:-:1      XMAD.LO tf, blk_MPQ, CRSTK, tf, xmad_determ;
+        };
+    }
+    return '';
++]
+--:-:-:-:1      LEA      track00F0.CC, tf, param_F[0], 0x2;
+--:-:-:-:1      LEA.HI.X track00F1,    tf, param_F[1], RZ, 0x2;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+// kk < K
+--:-:-:-:1      ISETP.LT.AND P5, PT, kk, param_K, PT; // P5 = kk < K       (first 32 columns)
+--:-:-:-:1      IADD kk, kk, 32;
+--:-:-:-:1      ISETP.LT.AND P6, PT, kk, param_K, PT; // P6 = kk + 32 < K  (second 32 columns)
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:6      IADD   track04F0.CC, track00F0, K4; // track04F/08F/12F = previous track + 4 rows
+--:-:-:-:1      IADD.X track04F1,    track00F1, RZ;
+--:-:-:-:6      IADD   track08F0.CC, track04F0, K4;
+--:-:-:-:1      IADD.X track08F1,    track04F1, RZ;
+--:-:-:-:6      IADD   track12F0.CC, track08F0, K4;
+--:-:-:-:1      IADD.X track12F1,    track08F1, RZ;
+
+<CODE>
+
+    # Epilogue driver: for each of the 8 accumulator rows, scale cx0-cx7 of
+    # that row by alpha into f0-f7 and call STORE_C.
+    my @chunks;
+    for my $row (0 .. 7)
+    {
+        # Ahead of row 4, advance all four F pointers by K60 and crst by 60.
+        push @chunks,
+            "--:-:-:-:5      IADD   track00F0.CC, track00F0, K60;\n",
+            "--:-:-:-:1      IADD   crst00,       crst00,     60;\n",
+            "--:-:-:-:1      IADD.X track00F1,    track00F1,  RZ;\n",
+            "--:-:-:-:5      IADD   track04F0.CC, track04F0, K60;\n",
+            "--:-:-:-:1      IADD   crst04,       crst04,     60;\n",
+            "--:-:-:-:1      IADD.X track04F1,    track04F1,  RZ;\n",
+            "--:-:-:-:5      IADD   track08F0.CC, track08F0, K60;\n",
+            "--:-:-:-:1      IADD   crst08,       crst08,     60;\n",
+            "--:-:-:-:1      IADD.X track08F1,    track08F1,  RZ;\n",
+            "--:-:-:-:5      IADD   track12F0.CC, track12F0, K60;\n",
+            "--:-:-:-:1      IADD   crst12,       crst12,     60;\n",
+            "--:-:-:-:1      IADD.X track12F1,    track12F1,  RZ;\n\n"
+            if $row == 4;
+        my $scale =
+            "--:-:-:-:1      FMUL f0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f2, cx2y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f3, cx3y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f4, cx4y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f5, cx5y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL f6, cx6y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL f7, cx7y%d, alpha;\n";
+        push @chunks, sprintf($scale, ($row) x 8), "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return join '', @chunks;
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Subroutine: write one scaled accumulator row (f0-f7) to F, then step each F track pointer by one row (K1).
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5; // crst00 < CRST && k < K
+--:-:-:-:1      IADD         crst00, crst00, 1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5; // crst04 < CRST && k < K
+--:-:-:-:1      IADD         crst04, crst04, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5; // crst08 < CRST && k < K
+--:-:-:-:1      IADD         crst08, crst08, 1;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5; // crst12 < CRST && k < K
+--:-:-:-:0      IADD         crst12, crst12, 1;
+
+// Shuffle the row through shared memory (writeCs -> readCs) to drop the awkward readIs/readEs mapping
+--:-:-:-:1      STS.128 [writeCs+4x<00>], f0;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], f4;
+
+--:-:1:-:1      LDS f0, [readCs + 4x<0*64 + 00>];
+--:-:2:-:1      LDS f2, [readCs + 4x<1*64 + 00>];
+--:-:3:-:1      LDS f4, [readCs + 4x<2*64 + 00>];
+--:-:4:-:1      LDS f6, [readCs + 4x<3*64 + 00>];
+
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+01:-:-:-:1  @P0 STG.E.CG [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 STG.E.CG [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 STG.E.CG [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 STG.E.CG [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
+    else
+    {
+        return q{
+01:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F], f0;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P6, PT;
+02:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F], f2;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P1, P6, PT;
+04:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F], f4;
+--:-:-:-:1      PSETP.AND.AND P2, PT, P2, P6, PT;
+08:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F], f6;
+--:-:-:-:1      PSETP.AND.AND P3, PT, P3, P6, PT;
+        };
+    }
++]
+
+--:-:1:-:1      LDS f1, [readCs + 4x<0*64 + 32>]; // second half: columns +32, gated by P0-P3 after they were ANDed with P6
+--:-:2:-:1      LDS f3, [readCs + 4x<1*64 + 32>];
+--:-:3:-:1      LDS f5, [readCs + 4x<2*64 + 32>];
+--:-:4:-:1      LDS f7, [readCs + 4x<3*64 + 32>];
+
+[+
+    our $determ;
+    if ($determ)
+    {
+        return q{
+01:1:-:-:1  @P0 STG.E.CG [track00F + 4x<32>], f1;
+02:2:-:-:1  @P1 STG.E.CG [track04F + 4x<32>], f3;
+04:3:-:-:1  @P2 STG.E.CG [track08F + 4x<32>], f5;
+08:4:-:-:1  @P3 STG.E.CG [track12F + 4x<32>], f7;
+        };
+    }
+    else
+    {
+        return q{
+01:1:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [track00F + 4x<32>], f1;
+02:2:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [track04F + 4x<32>], f3;
+04:3:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [track08F + 4x<32>], f5;
+08:4:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [track12F + 4x<32>], f7;
+        };
+    }
++]
+
+01:-:-:-:6      IADD   track00F0.CC, track00F0, K1; // step every F track to the next crst row (K1 = 4*K bytes)
+--:-:-:-:1      IADD.X track00F1,    track00F1, RZ;
+02:-:-:-:6      IADD   track04F0.CC, track04F0, K1;
+--:-:-:-:1      IADD.X track04F1,    track04F1, RZ;
+04:-:-:-:6      IADD   track08F0.CC, track08F0, K1;
+--:-:-:-:1      IADD.X track08F1,    track08F1, RZ;
+08:-:-:-:6      IADD   track12F0.CC, track12F0, K1;
+--:-:-:-:0      IADD.X track12F1,    track12F1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/Convolution/Pascal/sconv_xprop_X128_N128.sass b/Kernel/Convolution/Pascal/sconv_xprop_X128_N128.sass
new file mode 100644
index 0000000..8f91aba
--- /dev/null
+++ b/Kernel/Convolution/Pascal/sconv_xprop_X128_N128.sass
@@ -0,0 +1,233 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $prefix = 's';
+    our $shareI = 128;
+    our $shareF = 128;
+    our $stepI  = 32;
+    our $stepF  = 64;
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<128*8*2 + 128*8*2 + 0>
+    szShareF  : (128*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<128*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<128*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<128*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<128*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<128*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<128*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<128*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<128*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne
+     72-111 ~ tid1, tid128, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-107 : loadI<0-3>,  loadF<0-3>
+
+    108-111 ~ offsetF, offsetI, offsetFc, offsetIc
+
+    112-113 : sliceI, sliceF
+    112-113 : sliceIF<0-1>
+
+    114-122 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset
+    123-127 ~ readFs, readIs, tid, idx_N
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-122  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidX = (tid & 31) << 2
+// tidY = tid >> 5
+--:-:-:-:1      LOP.AND tidX, tid,  31;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  5;
+
+// offsetFk = idx_K*128 + tidX (later summed into offsetF, which forms trackF in the loop)
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 7;
+
+// offsetIn = idx_N*128 + tidX (later summed into offsetI, which forms trackI in the loop)
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 7;
+
+// writeS = (128*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+
+// readFs = (((tid & 112) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    112;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = ((((tid & 128) >> 3) | ((tid >> 1) & 7)) << 4) + 4x<szShareF>
+--:-:-:-:1      LOP.AND tid128, tid,    128;
+--:-:-:-:1      SHR.U32 tid128, tid128, 3;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid128;
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF, [trackF];
+--:-:5:-:1 @!P1 LDS.U.128    loadF, [addr_zero];
+
+--:-:3:-:1  @P1 LDG.E.128    loadI, [trackI];
+--:-:6:-:1 @!P1 LDS.U.128    loadI, [addr_zero];
+
+12:-:-:-:1      STS.128 [writeS], loadF;
+24:1:-:-:1      STS.128 [writeS + 4x<szShareF>], loadI;
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.128 loadF, [trackF];
+--:-:3:-:1  @P1 LDG.E.128    loadI, [trackI];
+
+[-
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+
+        j1c40 => "02:2:-:-:1  \@P0 STS.128 [writeS], loadF;\n",
+
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j2c29 => "20:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     2;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;\n",
+
+        j2c40 => "02:-:2:-:1  \@P1 LDG.E.CI.128 loadF, [trackF];\n",
+
+
+        j6c8  => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<szShareF>], loadI;\n",
+
+        j6c54 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     2;\n",
+        j6c59 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;\n",
+
+        j6c61 => "04:-:3:-:1  \@P1 LDG.E.128 loadI, [trackI];\n",
+
+        j6c62 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<szShareF + szShareI>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<szShareF + szShareI>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<szShareF + szShareI>;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+
+<SCHEDULE_BLOCK>
+
+// tidOX = ((tid & 7) << 2) | ((tid & 128) >> 1)
+// tidOY = (tid & 127) >> 3
+--:-:-:-:1      LOP.AND tidOX,  tid,    7;
+--:-:-:-:1      SHL     tidOX,  tidOX,  2;
+--:-:-:-:1      LOP.AND tidOX2, tid,    128;
+--:-:-:-:1      SHR.U32 tidOX2, tidOX2, 1;
+--:-:-:-:1      LOP.OR  tidOX,  tidOX,  tidOX2;
+--:-:-:-:1      LOP.AND tidOY,  tid,    127;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY,  3;
+
+--:-:-:-:1      LOP.AND readIs, readIs, 0x1ff;
+--:-:-:-:1      LOP.AND readFs, readFs, 0x0ff;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs  (ISCADD shift of 5 == readFs * 32)
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = idx_N*128 + tidOX
+--:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = idx_K*128 + tidOY*4  (tidOY itself is overwritten with tidOY*4)
+--:-:-:-:1      SHL tidOY, tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 7;
+
+[+ output_setup(63, 1, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
\ No newline at end of file
diff --git a/Kernel/Convolution/Pascal/sconv_xprop_X128_N64.sass b/Kernel/Convolution/Pascal/sconv_xprop_X128_N64.sass
new file mode 100644
index 0000000..d7bd0a1
--- /dev/null
+++ b/Kernel/Convolution/Pascal/sconv_xprop_X128_N64.sass
@@ -0,0 +1,246 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $prefix = 's';
+    our $shareI = 64;
+    our $shareF = 128;
+    our $stepI  = 32;
+    our $stepF  = 64;
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<64*8*2 + 128*8*2 + 0>
+    szShareF  : (128*8)
+    szShareI  : (64*8)
+
+    addr_zero  : 4x<64*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-111 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-111 : loadI<0-3>, loadF<0-7>
+
+    112-113 : sliceI, sliceF
+    112-113 : sliceIF<0-1>
+
+    104-107 ~ offsetF, offsetIc, offsetFc
+
+    114-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI
+    125-127 ~ readFs, readIs, swapBuf
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-124  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidX = (tid & 15) << 2
+// tidY = tid >> 4
+--:-:-:-:1      LOP.AND tidX, tid,  15;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  4;
+
+// offsetFk = idx_K*128 + tidX (later summed into offsetF, which forms trackF in the loop)
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 7;
+
+// offsetIn = idx_N*64 + tidX (later summed into offsetI, which forms trackI in the loop)
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 6;
+
+// writeFs = (128*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeFs, tidY, tidX, 7;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// writeIs = (64*tidY + tidX) * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD  writeIs, tidY, tidX, 6;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = (((tid >> 1) & 7) << 4) + 4x<szShareF>
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>];
+--:-:3:-:1  @P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>];
+--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero];
+--:-:5:-:1 @!P1 LDS.U.128 loadF4, [addr_zero];
+
+--:-:4:-:1  @P1 LDG.E.128 loadI, [trackI];
+--:-:6:-:1 @!P1 LDS.U.128 loadI, [addr_zero];
+
+12:-:-:-:1      STS.128 [writeFs + 4x<00>], loadF0;
+04:-:-:-:1      STS.128 [writeFs + 4x<64>], loadF4;
+
+28:1:-:-:1      STS.128 [writeIs], loadI;
+
+[+ loop_setup() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>];
+--:5:3:-:1  @P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>];
+--:-:4:-:1  @P1 LDG.E.128 loadI, [trackI];
+
+[-
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+
+        j1c40 => "02:2:-:-:1  \@P0 STS.128 [writeFs + 4x<00>], loadF0;\n",
+
+        j2c10 => "02:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j2c29 => "20:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "10:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     2;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;\n",
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.128 loadF0, [trackF + 4x<00>];\n",
+
+        j4c8  => "04:3:-:-:1  \@P0 STS.128 [writeFs + 4x<64>], loadF4;\n",
+
+        j4c60 => "04:5:3:-:1  \@P1 LDG.E.CI.128 loadF4, [trackF + 4x<64>];\n",
+
+        j6c8  => "08:4:-:-:1  \@P0 STS.128 [writeIs], loadI;\n",
+
+        j6c55 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     2;\n",
+        j6c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;\n",
+
+        j6c62 => "08:5:4:-:1  \@P1 LDG.E.128 loadI, [trackI];\n",
+
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                   "--:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+--:-:2:-:1      S2R tid,   SR_TID.X;
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = (tid & 7) << 2
+// tidOY = tid >> 3
+02:-:-:-:1      LOP.AND tidOX,  tid,   7;
+--:-:-:-:1      SHL     tidOX,  tidOX, 2;
+--:-:-:-:1      SHR.U32 tidOY,  tid,   3;
+
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readIs,  readIs, -4x<szShareF>;
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf;
+--:-:-:-:1  @P2 IADD readIs,  readIs, -swapBuf;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 64 + readIs  (ISCADD shift of 4 == readFs * 16)
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 4;
+
+// readCs  = 4 * (tidOX + (tidOY * 64))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = idx_N*64 + tidOX
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 6;
+
+// Mul by 4 here expands k stride back out
+// k = idx_K*128 + tidOY*4  (tidOY itself is overwritten with tidOY*4)
+--:-:-:-:1      SHL    tidOY,   tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 7;
+
+[+ output_setup(63, 0, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
\ No newline at end of file
diff --git a/Kernel/Convolution/Pascal/sconv_xprop_X32_N128.sass b/Kernel/Convolution/Pascal/sconv_xprop_X32_N128.sass
new file mode 100644
index 0000000..568e714
--- /dev/null
+++ b/Kernel/Convolution/Pascal/sconv_xprop_X32_N128.sass
@@ -0,0 +1,262 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $prefix = 's';
+    our $shareI = 128;
+    our $shareF = 32;
+    our $stepI  = 32;
+    our $stepF  = 16;
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<32*8*2 + 128*8*2 + 0>
+    szShareF  : (32*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<32*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<32*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<32*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<32*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<32*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<32*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<32*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<32*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-69 : m, p, q
+      64-69 : idx_M, idx_P, idx_Q, idx_K, tidY, negOne
+     70-113 ~ tid1, tid32, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     70-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-119 : loadI<00-15>,  loadF<0-3>
+
+    120-121 : sliceI, sliceF
+    120-121 : sliceIF<0-1>
+
+    122-140 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI, offsetF, offsetIc, offsetFc
+    141-155 ~ readFs, readIs, swapBuf, tid, idx_N
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-140  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+
+[+ get_mpqk() +]
+
+// tidX = (tid & 7) << 2
+// tidY = tid >> 3
+--:-:-:-:1      LOP.AND tidX, tid,  7;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  3;
+
+// offsetFk = idx_K*32 + tidX (later summed into offsetF, which forms trackF in the loop)
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 5;
+
+// offsetIn = idx_N*128 + tidX (later summed into offsetI, which forms trackI in the loop)
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 7;
+
+// writeFs = (32*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeFs, tidY, tidX, 5;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// writeIs = (128*tidY + tidX) * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD  writeIs, tidY, tidX, 7;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+
+// readFs  = (((tid & 16) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:0      SHL     readFs, readFs, 4;
+
+// readIs = ((((tid & 32) >> 1) | ((tid >> 1) & 7)) << 4) + 4x<szShareF>
+--:-:-:-:1      LOP.AND tid32, tid,  32;
+--:-:-:-:1      SHR.U32 tid32, tid32, 1;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid32;
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:1:-:1  @P1 LDG.E.CI.128 loadF0, [trackF];
+--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero];
+
+--:-:2:-:1  @P1 LDG.E.128 loadI00, [trackI + 4x< 0>];
+--:-:3:-:1  @P1 LDG.E.128 loadI04, [trackI + 4x<32>];
+--:-:4:-:1  @P1 LDG.E.128 loadI08, [trackI + 4x<64>];
+--:-:5:-:1  @P1 LDG.E.128 loadI12, [trackI + 4x<96>];
+--:-:-:-:1 @!P1 LDS.U.128 loadI00, [addr_zero];
+--:-:-:-:1 @!P1 LDS.U.128 loadI04, [addr_zero];
+--:-:-:-:1 @!P1 LDS.U.128 loadI08, [addr_zero];
+--:-:6:-:2 @!P1 LDS.U.128 loadI12, [addr_zero];
+
+21:-:-:-:1      STS.128 [writeFs], loadF0;
+
+02:-:-:-:1      STS.128 [writeIs + 4x< 0>], loadI00;
+04:-:-:-:1      STS.128 [writeIs + 4x<32>], loadI04;
+08:-:-:-:1      STS.128 [writeIs + 4x<64>], loadI08;
+10:1:-:-:1      STS.128 [writeIs + 4x<96>], loadI12;
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.128 loadF0, [trackF];
+--:-:-:-:1  @P1 LDG.E.128 loadI00, [trackI + 4x< 0>];
+--:-:3:-:1  @P1 LDG.E.128 loadI04, [trackI + 4x<32>];
+--:-:-:-:1  @P1 LDG.E.128 loadI08, [trackI + 4x<64>];
+--:5:4:-:1  @P1 LDG.E.128 loadI12, [trackI + 4x<96>];
+
+[-
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c40 => "02:-:-:-:1  \@P0 STS.128 [writeFs], loadF0;\n",
+
+        j1c62 => "--:-:2:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j2c29 => "02:-:-:-:1  \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     2;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;\n",
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.128 loadF0, [trackF];\n",
+
+        j3c8  => "04:-:-:-:1  \@P0 STS.128 [writeIs + 4x< 0>], loadI00;\n",
+        j3c10 => "--:3:-:-:1  \@P0 STS.128 [writeIs + 4x<32>], loadI04;\n",
+
+        j3c55 => "10:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     2;\n",
+        j3c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;\n",
+
+        j4c8  => "04:-:-:-:1  \@P1 LDG.E.128 loadI00, [trackI + 4x< 0>];\n",
+        j4c10 => "--:-:3:-:1  \@P1 LDG.E.128 loadI04, [trackI + 4x<32>];\n",
+
+        j6c8  => "08:-:-:-:1  \@P0 STS.128 [writeIs + 4x<64>], loadI08;\n",
+        j6c10 => "--:4:-:-:1  \@P0 STS.128 [writeIs + 4x<96>], loadI12;\n",
+
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                   "08:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c8  => "--:-:-:-:1  \@P1 LDG.E.128 loadI08, [trackI + 4x<64>];\n",
+        j7c10 => "--:5:4:-:1  \@P1 LDG.E.128 loadI12, [trackI + 4x<96>];\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+
+<SCHEDULE_BLOCK>
+
+// tidOX = ((tid & 7) << 2) + ((tid & 32) << 1)
+// tidOY = (tid & 31) >> 3
+--:-:-:-:1      LOP.AND tidOX,  tid,    7;
+--:-:-:-:1      LOP.AND tidOX2, tid,    32;
+--:-:-:-:1      SHL     tidOX,  tidOX,  2;
+--:-:-:-:1      ISCADD  tidOX,  tidOX2, tidOX, 1;
+--:-:-:-:1      LOP.AND tidOY,  tid,    31;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY,  3;
+
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readIs,  readIs, -4x<szShareF>;
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf;
+--:-:-:-:1  @P2 IADD readIs,  readIs, -swapBuf;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs  (ISCADD shift of 5 == readFs * 32)
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = idx_N*128 + tidOX
+--:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = idx_K*32 + tidOY*4  (tidOY itself is overwritten with tidOY*4)
+--:-:-:-:1      SHL       tidOY, tidOY, 2;
+--:-:-:-:1      ISCADD k, idx_K, tidOY, 5;
+
+[+ output_setup(63, 1, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
\ No newline at end of file
diff --git a/Kernel/Convolution/Pascal/sconv_xprop_X64_N128.sass b/Kernel/Convolution/Pascal/sconv_xprop_X64_N128.sass
new file mode 100644
index 0000000..b782b8a
--- /dev/null
+++ b/Kernel/Convolution/Pascal/sconv_xprop_X64_N128.sass
@@ -0,0 +1,253 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $prefix = 's';
+    our $shareI = 128;
+    our $shareF = 64;
+    our $stepI  = 64;
+    our $stepF  = 32;
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<64*8*2 + 128*8*2 + 0>
+    szShareF  : (64*8)
+    szShareI  : (128*8)
+
+    addr_zero  : 4x<64*8*2 + 128*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 128*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 128*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 128*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 128*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 128*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 128*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 128*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-67 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-111 ~ tid1, tid64, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-111 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-111 : loadI<0-7>,  loadF<0-3>
+
+    112-113 : sliceI, sliceF
+    112-113 : sliceIF<0-1>
+
+    108-111 ~ offsetF, offsetIc, offsetFc
+
+    114-124 ~ writeFs, writeIs, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI
+    125-127 ~ readFs, readIs, swapBuf
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-124  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, tidOX2, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;
+
+[+ load_zeros() +]
+[+ get_mpqk() +]
+
+// tidX = (tid & 15) << 2
+// tidY = tid >> 4
+--:-:-:-:1      LOP.AND tidX, tid,  15;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  4;
+
+// offsetFk = idx_K*64 + tidX (later summed into offsetF, which forms trackF in the loop)
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 6;
+
+// offsetIn = idx_N*128 + tidX (later summed into offsetI)
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 7;
+
+// writeFs = (64*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeFs, tidY, tidX, 6;
+--:-:-:-:1      SHL     writeFs, writeFs, 2;
+
+// writeIs = (128*tidY + tidX) * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD  writeIs, tidY, tidX, 7;
+--:-:-:-:1      ISCADD  writeIs, writeIs, 4x<szShareF>, 2;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+// readFs = (((tid & 48) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    48;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:1      SHL     readFs, readFs, 4;
+
+// readIs = ((((tid & 64) >> 3) | ((tid >> 1) & 7)) << 4) + 4x<szShareF>
+--:-:-:-:1      LOP.AND tid64,  tid,    64;
+--:-:-:-:1      SHR.U32 tid64,  tid64,  3;
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid64;
+--:-:-:-:0      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF0, [trackF];
+--:-:5:-:1 @!P1 LDS.U.128    loadF0, [addr_zero];
+
+--:-:3:-:1  @P1 LDG.E.128 loadI0, [trackI + 4x<00>];
+--:-:4:-:1  @P1 LDG.E.128 loadI4, [trackI + 4x<64>];
+--:-:-:-:1 @!P1 LDS.U.128 loadI0, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 loadI4, [addr_zero];
+
+12:-:-:-:1      STS.128 [writeFs], loadF0;
+
+24:-:-:-:1      STS.128 [writeIs + 4x<00>], loadI0;
+08:1:-:-:1      STS.128 [writeIs + 4x<64>], loadI4;
+
+[+ loop_setup() +]
+
+--:-:2:-:2  @P1 LDG.E.CI.128 loadF0, [trackF];
+--:-:3:-:1  @P1 LDG.E.128 loadI0, [trackI + 4x<00>];
+--:5:4:-:1  @P1 LDG.E.128 loadI4, [trackI + 4x<64>];
+
+[-
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c17 => "--:-:6:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+
+        j1c40 => "02:2:-:-:1  \@P0 STS.128 [writeFs], loadF0;\n",
+
+        j2c10 => "02:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j2c29 => "20:-:-:-:1  \@P1 IADD3    offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     2;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3    offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;\n",
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.128 loadF0, [trackF];\n",
+
+        j3c8  => "04:3:-:-:1  \@P0 STS.128 [writeIs + 4x<00>], loadI0;\n",
+
+        j3c55 => "10:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     2;\n",
+        j3c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;\n",
+
+        j4c8  => "04:-:3:-:1  \@P1 LDG.E.128 loadI0, [trackI + 4x<00>];\n",
+
+        j6c8  => "08:4:-:-:1  \@P0 STS.128 [writeIs + 4x<64>], loadI4;\n",
+
+        j6c63   => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                   "08:-:-:-:1  \@P0 IADD readIs,  readIs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeIs, writeIs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
+                   "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c8  => "--:5:4:-:1  \@P1 LDG.E.128 loadI4, [trackI + 4x<64>];\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+--:-:2:-:1      S2R tid,   SR_TID.X;
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = ((tid & 7) << 2) | ((tid & 64) >> 1)
+// tidOY = (tid & 63) >> 3
+02:-:-:-:1      LOP.AND tidOX,  tid,    7;
+--:-:-:-:1      SHL     tidOX,  tidOX,  2;
+--:-:-:-:1      LOP.AND tidOX2, tid,    64;
+--:-:-:-:1      SHR.U32 tidOX2, tidOX2, 1;
+--:-:-:-:1      LOP.OR  tidOX,  tidOX,  tidOX2;
+--:-:-:-:1      LOP.AND tidOY,  tid,    63;
+--:-:-:-:1      SHR.U32 tidOY,  tidOY,  3;
+
+--:-:-:-:1      ISETP.GT.AND P2, PT, swapBuf, RZ, PT; // P2 = (swapBuf > 0)
+--:-:-:-:1      IADD readIs,  readIs, -4x<szShareF>; // remove the szShareF base added to readIs during setup
+--:-:-:-:1  @P2 IADD readFs,  readFs, -swapBuf; // presumably rebases onto the first buffer - confirm against the loop's swap
+--:-:-:-:1  @P2 IADD readIs,  readIs, -swapBuf;
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 128 + readIs  (computed as readFs*32 + readIs)
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 5;
+
+// readCs  = 4 * (tidOX + (tidOY * 128))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 7;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = idx_N*128 + tidOX
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 7;
+
+// Mul by 4 here expands k stride back out
+// k = idx_K*64 + tidOY * 4
+--:-:-:-:1      SHL    tidOY,   tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 6;
+
+[+ output_setup(31, 1, 5) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
\ No newline at end of file
diff --git a/Kernel/Convolution/Pascal/sconv_xprop_X64_N64.sass b/Kernel/Convolution/Pascal/sconv_xprop_X64_N64.sass
new file mode 100644
index 0000000..b42fbea
--- /dev/null
+++ b/Kernel/Convolution/Pascal/sconv_xprop_X64_N64.sass
@@ -0,0 +1,240 @@
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    our $prefix = 's';
+    our $shareI = 64;
+    our $shareF = 64;
+    our $stepI  = 32;
+    our $stepF  = 32;
+-]
+
+<INCLUDE file="xconv_xprop_common.sass"/>
+
+// Shared-memory layout: tile sizes first, then scratch addresses that start after 2*(szShareF + szShareI) entries.
+<CONSTANT_MAPPING>
+    szShareF  : (64*8)
+    szShareI  : (64*8)
+
+    addr_zero  : 4x<64*8*2 + 64*8*2 + 0>
+    addr_mpqk  : 4x<64*8*2 + 64*8*2 + 4>
+    addr_m     : 4x<64*8*2 + 64*8*2 + 4>
+    addr_p     : 4x<64*8*2 + 64*8*2 + 5>
+    addr_q     : 4x<64*8*2 + 64*8*2 + 6>
+    addr_k     : 4x<64*8*2 + 64*8*2 + 7>
+    addr_szLut : 4x<64*8*2 + 64*8*2 + 8>
+    addr_lut   : 4x<64*8*2 + 64*8*2 + 10>
+
+[+ params() +]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+
+      64-67 : mpqk<0-3>
+      64-66 : m, p, q
+      64-71 : idx_M, idx_P, idx_Q, idx_K, idx_N, tid, tidY, negOne
+     72-113 ~ tid1, tidX, idx_MPQk, idx_PQk, idx_Qk, idx_k, magic_PQk, magic_Qk, neg_PQk, neg_Qk, neg_k, div1, div2, div3, idx_P2, idx_Q2, q1, q2
+     72-113 ~ mask_shr, rst, lutStore, lutStore2, warp_count, mt, pr, qs, dep_thd_mask, dep_thd_bits, dep_thd_cnt, t, r, s, rs, x, y, z, ballot, warp_slices, partial, endCRST, str_d, str_h, str_w, rst_prime, x_prime, y_prime, z_prime
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+
+      96-99 : trackI<0-1>, trackF<0-1>
+    100-115 : loadI<0-7>,  loadF<0-7>
+
+    108-113 ~ offsetF, offsetIc, offsetFc
+    114-115 : sliceI, sliceF
+    114-115 : sliceIF<0-1>
+
+    116-125 ~ writeS, offsetIn, offsetFk, posCRST, posCRSTf, channel, lutSize, lutSizeRcp, lutOffset, offsetI
+    126-127 ~ readFs, readIs
+
+    72-91   : cs<0-7>, c<0-3>, b<0-7>
+    72-83   ~ x<0-7>
+    92-99   : out<0-7>
+   100-101  : Out<0-1>
+   102-103  : Sum<0-1>
+   104-125  ~ writeCs, readCs, alpha, k, n, sum<0-3>, offset, out_offset, bsum_offset, tidOX, tidOY, preds, one
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X;
+--:-:2:-:1      S2R idx_MPQk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,    SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,    SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT; // P0 = (tid >= 32)
+
+[+ load_zeros() +]
+
+[+ get_mpqk() +]
+
+// tidX = (tid & 7) << 2
+// tidY = tid >> 3
+--:-:-:-:1      LOP.AND tidX, tid,  7;
+--:-:-:-:1      SHL     tidX, tidX, 2;
+--:-:-:-:1      SHR.U32 tidY, tid,  3;
+
+// offsetFk = idx_K*64 + tidX  (filter offset, later folded into trackF)
+--:-:-:-:1      ISCADD  offsetFk, idx_K, tidX, 6;
+
+// offsetIn = idx_N*64 + tidX  (input offset, later folded into trackI)
+08:-:-:-:1      ISCADD  offsetIn, idx_N, tidX, 6;
+
+// writeS = (64*tidY + tidX) * 4
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 6;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+
+// readFs = (((tid & -16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   -16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+--:-:-:-:0      SHL     readFs, readFs, 4;
+
+// readIs = (((tid >> 1) & 7) << 4) + 4x<szShareF>
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readIs, readIs, 4x<szShareF>, 4;
+</SCHEDULE_BLOCK>
+
+[+ load_lut() +]
+
+--:-:1:-:1  @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>];
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>];
+--:-:-:-:1 @!P1 LDS.U.128 loadF0, [addr_zero];
+--:-:5:-:2 @!P1 LDS.U.128 loadF4, [addr_zero];
+
+--:-:3:-:1  @P1 LDG.E.128 loadI0, [trackI + 4x< 0>];
+--:-:4:-:1  @P1 LDG.E.128 loadI4, [trackI + 4x<32>];
+--:-:-:-:1 @!P1 LDS.U.128 loadI0, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 loadI4, [addr_zero];
+
+11:-:-:-:1      STS.128 [writeS + 4x<0*64 +  0>], loadF0;
+02:-:-:-:1      STS.128 [writeS + 4x<0*64 + 32>], loadF4;
+
+24:-:-:-:1      STS.128 [writeS + 4x<8*64 +  0>], loadI0;
+08:1:-:-:1      STS.128 [writeS + 4x<8*64 + 32>], loadI4;
+
+[+ loop_setup() +]
+
+--:-:2:-:1  @P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>];
+--:-:3:-:1  @P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>];
+--:-:4:-:1  @P1 LDG.E.128 loadI0, [trackI + 4x< 0>];
+--:-:5:-:1  @P1 LDG.E.128 loadI4, [trackI + 4x<32>];
+
+[-
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P1, PT, posCRST,  RZ, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, posCRST, -8, PT;\n",
+
+        j0c13 => "--:-:6:-:1  \@P1 I2F.F32.S32 posCRSTf, posCRST;\n",
+
+        j0c39 => "20:-:-:-:1  \@P1 FMUL channel, posCRSTf, lutSizeRcp;\n",
+        j0c44 => "--:-:-:-:1  \@P1 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c46 => "--:-:6:-:1  \@P1 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        j1c8  => "20:-:-:-:1  \@P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;\n",
+        j1c13 => "--:-:-:-:1  \@P1 SHL lutOffset, lutOffset, 3;\n",
+
+        j1c37 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*64 +  0>], loadF0;\n",
+        j1c39 => "04:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*64 + 32>], loadF4;\n",
+
+        j1c62 => "02:-:2:-:1  \@P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];\n",
+
+        j2c10 => "--:-:-:-:1  \@P1 XMAD     offsetFc, channel, param_KRST, RZ;\n",
+        j2c15 => "--:-:-:-:1  \@P1 XMAD     offsetIc, channel, param_DHWN,    RZ;\n",
+        j2c20 => "--:-:-:-:1  \@P1 XMAD.PSL offsetIc, channel, param_DHWN.H1, offsetIc;\n",
+        j2c22 => "--:-:-:-:1      IADD posCRST, posCRST, -8;\n",
+
+        j2c29 => "02:-:-:-:1  \@P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;\n",
+        j2c34 => "--:-:-:-:1  \@P1 LEA      trackF0.CC, offsetF, param_F[0],     2;\n",
+        j2c36 => "--:-:-:-:1  \@P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;\n",
+        j2c38 => "--:-:-:-:1  \@P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, 2;\n",
+
+        j2c40 => "--:-:2:-:1  \@P1 LDG.E.CI.128 loadF0, [trackF + 4x< 0>];\n",
+        j2c42 => "--:-:3:-:1  \@P1 LDG.E.CI.128 loadF4, [trackF + 4x<32>];\n",
+
+        j6c8  => "08:-:-:-:1  \@P0 STS.128 [writeS + 4x<8*64 +  0>], loadI0;\n",
+        j6c10 => "10:4:-:-:1  \@P0 STS.128 [writeS + 4x<8*64 + 32>], loadI4;\n",
+
+        j6c55 => "--:-:-:-:1  \@P1 LEA      trackI0.CC, offsetI, param_I[0],     2;\n",
+        j6c60 => "--:-:-:-:1  \@P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, 2;\n",
+
+        j6c62 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "08:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<64*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<64*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<64*8*2>;\n",
+
+        j7c8  => "--:-:4:-:1  \@P1 LDG.E.128 loadI0, [trackI + 4x< 0>];\n",
+        j7c10 => "--:-:5:-:1  \@P1 LDG.E.128 loadI4, [trackI + 4x<32>];\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+-]
+
+LOOP:
+
+[+ main_loop() +]
+
+--:-:1:-:1      LDS.U.128 mpqk, [addr_mpqk];
+--:-:2:-:1      S2R tid,   SR_TID.X;
+--:-:3:-:1      S2R idx_N, SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+// tidOX = (tid & 7) << 2
+// tidOY = tid >> 3
+02:-:-:-:1      LOP.AND tidOX, tid,   7;
+--:-:-:-:1      SHL     tidOX, tidOX, 2;
+--:-:-:-:1      SHR.U32 tidOY, tid,   3;
+
+--:-:-:-:1      LOP.AND readIs, readIs, 0x7ff; // keep low 11 bits: drops the szShareF base and the buffer-toggle bit (assuming 4x<n> = 4*n)
+--:-:-:-:1      LOP.AND readFs, readFs, 0x7ff; // keep low 11 bits: drops the buffer-toggle bit
+
+// Div by 4 here collapses k stride
+// writeCs = (readFs / 4) * 64 + readIs  (computed as readFs*16 + readIs)
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 4;
+
+// readCs  = 4 * (tidOX + (tidOY * 64))
+--:-:-:-:1      ISCADD readCs, tidOY, tidOX, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = idx_N*64 + tidOX
+04:-:-:-:1      ISCADD n, idx_N, tidOX, 6;
+
+// Mul by 4 here expands k stride back out
+// k = idx_K*64 + tidOY * 4
+--:-:-:-:1      SHL    tidOY,   tidOY, 2;
+01:-:-:-:1      ISCADD k, idx_K, tidOY, 6;
+
+[+ output_setup(63, 0, 6) +]
+
+</SCHEDULE_BLOCK>
+
+[+ output() +]
\ No newline at end of file
diff --git a/Kernel/Convolution/Pascal/xconv_direct_updat_64x32.sass b/Kernel/Convolution/Pascal/xconv_direct_updat_64x32.sass
new file mode 100644
index 0000000..803487e
--- /dev/null
+++ b/Kernel/Convolution/Pascal/xconv_direct_updat_64x32.sass
@@ -0,0 +1,1077 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+our ($type, $SN, $D);              # set outside this template; $type eq 'h' selects the F16 variants
+my $half = $type eq 'h';           # evaluated once and reused for every type-dependent setting
+our ($determ, $largeN) = ($D, !$SN);
+our $dtype       = $half ? '.U16'        : '';
+our $convert_in  = $half ? 'F2F.F32.F16' : '';
+our $convert_out = $half ? 'F2F.F16.F32' : '';
+our $vec_size    = $half ? '64'          : '128';
+our $dtype_shift = $half ? '1'           : '2';
+our $dtype_size  = $half ? '2'           : '4';
+sub dtype       { return $dtype }          # load/store type suffix
+sub dtype_shift { return $dtype_shift }    # log2 of the element size in bytes
+sub vec_size    { return $vec_size }       # bit width used by the vector loads
+sub output_op   { return 'STG.E.CG' if $determ; return 'RED.E.ADD.F32.FTZ.RN'; }
+-]
+
+<CONSTANT_MAPPING>
+
+    addr_zero  : 4x<(32 + 64)*33*2>
+    szShareI   : (64*33)
+    szShareE   : (32*33)
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_C            : c[0x0][0x15c]
+    param_D            : c[0x0][0x160]
+    param_H            : c[0x0][0x164]
+    param_W            : c[0x0][0x168]
+    param_N            : c[0x0][0x16c]
+    param_K            : c[0x0][0x170]
+    param_M            : c[0x0][0x174]
+    param_P            : c[0x0][0x178]
+    param_Q            : c[0x0][0x17c]
+    param_str_d        : c[0x0][0x180]
+    param_str_h        : c[0x0][0x184]
+    param_str_w        : c[0x0][0x188]
+    param_pad_d        : c[0x0][0x18c]
+    param_pad_h        : c[0x0][0x190]
+    param_pad_w        : c[0x0][0x194]
+    param_dil_d        : c[0x0][0x198]
+    param_dil_h        : c[0x0][0x19c]
+    param_dil_w        : c[0x0][0x1a0]
+    param_DHWN         : c[0x0][0x1a4]
+    param_HWN          : c[0x0][0x1a8]
+    param_WN           : c[0x0][0x1ac]
+    param_MPQN16p      : c[0x0][0x1b0]
+    param_MPQN         : c[0x0][0x1b4]
+    param_PQN          : c[0x0][0x1b8]
+    param_QN           : c[0x0][0x1bc]
+    param_PQkc         : c[0x0][0x1c0]
+    param_Qkc          : c[0x0][0x1c4]
+    param_kc           : c[0x0][0x1c8]
+    param_c            : c[0x0][0x1cc]
+    param_k            : c[0x0][0x1d0]
+    param_magic_PQkc   : c[0x0][0x1d4]
+    param_shift_PQkc   : c[0x0][0x1d8]
+    param_magic_Qkc    : c[0x0][0x1dc]
+    param_shift_Qkc    : c[0x0][0x1e0]
+    param_magic_kc     : c[0x0][0x1e4]
+    param_shift_kc     : c[0x0][0x1e8]
+    param_magic_c      : c[0x0][0x1ec]
+    param_shift_c      : c[0x0][0x1f0]
+    param_CTRSK        : c[0x0][0x1f4]
+    param_CTRS         : c[0x0][0x1f8]
+    param_TRS          : c[0x0][0x1fc]
+    param_RS           : c[0x0][0x200]
+    param_S            : c[0x0][0x204]
+    param_magic_TRS    : c[0x0][0x208]
+    param_shift_TRS    : c[0x0][0x20c]
+    param_magic_RS     : c[0x0][0x210]
+    param_shift_RS     : c[0x0][0x214]
+    param_magic_S      : c[0x0][0x218]
+    param_shift_S      : c[0x0][0x21c]
+    param_superM       : c[0x0][0x220]
+    param_superP       : c[0x0][0x224]
+    param_superQ       : c[0x0][0x228]
+    param_superN       : c[0x0][0x22c]
+    param_shiftM       : c[0x0][0x230]
+    param_shiftP       : c[0x0][0x234]
+    param_shiftQ       : c[0x0][0x238]
+    param_strideP      : c[0x0][0x23c]
+    param_strideQ      : c[0x0][0x240]
+    param_stridePQ     : c[0x0][0x244]
+    param_gridP        : c[0x0][0x248]
+    param_gridQ        : c[0x0][0x24c]
+    param_loopX        : c[0x0][0x250]
+    param_loopXp       : c[0x0][0x254]
+    param_loopQ        : c[0x0][0x258]
+    param_loopQp       : c[0x0][0x25c]
+    param_loopN        : c[0x0][0x260]
+    param_loopNp       : c[0x0][0x264]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+       0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+
+      64-79 : j0Ex<0-7>, j0Iy<0-7>
+      80-95 : j1Ex<0-7>, j1Iy<0-7>
+
+     96-119 : I0<0-3>, I1<0-3>, I2<0-3>, I3<0-3>, E0<0-3>, E1<0-3>
+    120-131 : track0I<0-1>,  track1I<0-1>, track2I<0-1>,  track3I<0-1>, track0E<0-1>, track1E<0-1>
+
+     64-131 ~ tid, idx_MPQkc, idx_PQkc, idx_Qkc, idx_kc, idx_k, idx_c, magic_PQkc, magic_Qkc, neg_PQkc, neg_Qkc, neg_kc, neg_c, div1, div2, div3, tidX, tidX4, tidY, tid1, readEs2, tid32, tid32_2, neg_TRS, neg_RS, neg_S, super_m, m, mt, k, k16, ctrs<0-3>, trs<0-3>, rs<0-3>, c<0-3>, t<0-3>, z<0-3>
+
+      80-81 : super_p, super_q
+      80-81 : pr, qs
+      82-95 ~ p, te, pIn, qIn, predEt, ti<0-3>, y<0-3>
+      80-95 ~ loopN, N
+
+    132-167 ~ tid7, q, n, idx_K, idx_C, idx_M, idx_P, start_P, idx_Q, start_Q, writeIs, writeEs, readIs, readEs, swapBuf, writeFs, predI, predE, init, x<0-3>, czOffset<0-3>, r<0-3>, s<0-3>, kmOffset
+
+     96-103 : track00F<0-1>, track04F<0-1>, track08F<0-1>, track12F<0-1>
+    104-119 ~ f00_<0-3>, f04_<0-3>, f08_<0-3>, f12_<0-3>
+    104-119 ~ Tid, tid_31, tid_32, K, K16, tf, idx_MPQ, xmad_determ
+    120-131 ~ alpha, readFs, K1, kk, crst<00|04|08|12>
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,       SR_TID.X;
+--:-:2:-:1      S2R idx_MPQkc, SR_CTAID.X;
+--:-:3:-:1      S2R idx_C,     SR_CTAID.Y;
+--:-:4:-:1      S2R idx_K,     SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// idx_M = idx_MPQkc / blk_PQkc
+--:-:-:-:1      MOV  magic_PQkc, param_magic_PQkc;
+--:-:-:-:1      ISETP.NE.AND P0, PT,   magic_PQkc, 1, PT;
+02:-:-:-:1  @P0 XMAD     div1, idx_MPQkc,    magic_PQkc,    RZ;
+--:-:-:-:1  @P0 XMAD     div2, idx_MPQkc,    magic_PQkc.H1, RZ;
+--:-:-:-:1  @P0 XMAD     div3, idx_MPQkc.H1, magic_PQkc.H1, RZ;
+--:-:-:-:1  @P0 XMAD.CHI div1, idx_MPQkc.H1, magic_PQkc,    div1;
+--:-:-:-:1  @P0 IADD3.RS idx_M, div1, div2, div3;
+--:-:-:-:1  @P0 SHR.U32  idx_M, idx_M,     param_shift_PQkc;
+--:-:-:-:1 @!P0 SHR.U32  idx_M, idx_MPQkc, param_shift_PQkc;
+
+// idx_PQkc = idx_MPQkc % blk_PQkc
+--:-:-:-:1      IADD neg_PQkc, RZ, -param_PQkc;
+--:-:-:-:1      XMAD.LO2 idx_PQkc, neg_PQkc, idx_M, idx_MPQkc;
+
+// idx_P = idx_PQkc / blk_Qkc
+--:-:-:-:1      MOV  magic_Qkc, param_magic_Qkc;
+--:-:-:-:1      ISETP.NE.AND P1, PT,  magic_Qkc, 1, PT;
+--:-:-:-:1  @P1 XMAD     div1, idx_PQkc,    magic_Qkc,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, idx_PQkc,    magic_Qkc.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, idx_PQkc.H1, magic_Qkc.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, idx_PQkc.H1, magic_Qkc,    div1;
+--:-:-:-:1  @P1 IADD3.RS idx_P, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  idx_P, idx_P,    param_shift_Qkc;
+--:-:-:-:1 @!P1 SHR.U32  idx_P, idx_PQkc, param_shift_Qkc;
+
+// idx_Qkc = idx_PQkc % blk_Qkc
+--:-:-:-:1      IADD neg_Qkc, RZ, -param_Qkc;
+--:-:-:-:1      XMAD.LO2 idx_Qkc, neg_Qkc, idx_P, idx_PQkc;
+
+// idx_Q  = idx_Qkc / kc
+--:-:-:-:1      XMAD.LO2C idx_Q, idx_Qkc, param_magic_kc, RZ;
+--:-:-:-:1      SHR.U32   idx_Q, idx_Q,   param_shift_kc;
+// idx_kc = idx_Qkc % kc
+--:-:-:-:1      IADD neg_kc, RZ, -param_kc;
+--:-:-:-:1      XMAD.S16.U16  idx_kc, neg_kc, idx_Q, idx_Qkc;
+
+// idx_k = idx_kc / c
+--:-:-:-:1      XMAD    idx_k,  idx_kc, param_magic_c, RZ;
+--:-:-:-:1      SHR.U32 idx_k,  idx_k,  param_shift_c;
+// idx_c = idx_kc % c
+--:-:-:-:1      IADD neg_c, RZ, -param_c;
+--:-:-:-:1      XMAD.S16.U16 idx_c, neg_c, idx_k, idx_kc;
+
+// idx_C = idx_C * blk_c + idx_c
+// idx_K = idx_K * blk_k + idx_k
+04:-:-:-:1      XMAD idx_C, idx_C, param_c, idx_c;
+08:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+--:-:-:-:1      MOV start_P, idx_P;
+--:-:-:-:1      MOV start_Q, idx_Q;
+
+// tidX   = tid >> 3
+// tidY   = (tid & 7) << 2
+// shiftX = tidY
+01:-:-:-:1      SHR.U32 tidX,   tid,  3;
+--:-:-:-:1      LOP.AND tid7,   tid,  7;
+--:-:-:-:1      SHL     tidY,   tid7, 2;
+
+// writeIs = (tidY*64 + tidX + shiftX) * 4
+--:-:-:-:1      ISCADD writeIs, tidY, tidX, 6;
+--:-:-:-:1      IADD   writeIs, writeIs, tidY;
+--:-:-:-:1      SHL    writeIs, writeIs, 2;
+
+// writeEs = (tidY*32 + tidX + shiftX) * 4
+--:-:-:-:1      ISCADD writeEs, tidY, tidX, 5;
+--:-:-:-:1      IADD   writeEs, writeEs, tidY;
+--:-:-:-:1      ISCADD writeEs, writeEs, 4x<szShareI>, 2;
+
+// readEs  = ((tid >> 1) & 3) << 4
+--:-:-:-:1      BFE.U32 readEs, tid, 0x201; // 2 bits at position 1
+
+// readIs = (((tid & 24) >> 2) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readIs, tid,   24;
+--:-:-:-:1      SHR.U32 readIs, readIs, 2;
+--:-:-:-:1      LOP.OR  readIs, readIs, tid1;
+
+// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5)
+// tid32 = tid & -32
+--:-:-:-:1      LOP.AND tid32, tid, -32;
+
+// readEs2 = readEs + (tid32 >> 2) + (readIs << 2)
+--:-:-:-:1      SHR.U32 tid32_2, tid32, 2;
+--:-:-:-:1      IADD    readEs2, tid32_2, readEs;
+--:-:-:-:1      ISCADD  readEs2, readIs, readEs2, 2;
+
+--:-:-:-:1      SHL readIs,  readIs,  4;
+--:-:-:-:1      SHL readEs,  readEs,  4;
+--:-:-:-:1      SHL readEs2, readEs2, 4;
+
+// writeFs = readIs*32*4 + readEs2
+--:-:-:-:1      ISCADD writeFs, readIs, readEs2, 7;
+
+// Each block of 32 threads works on 8 lines,
+// Also shift over each 8 lines by 8 (cumulative)
+// readIs += tid32/4 * 64 * 4 + tid32/4 * 4
+// readEs += tid32/4 * 32 * 4 + tid32/4 * 4 + 4x<szShareI>
+--:-:-:-:1      ISCADD readIs, tid32,  readIs, 6;
+--:-:-:-:1      ISCADD readEs, tid32,  readEs, 5;
+--:-:-:-:1      IADD   readIs, readIs, tid32;
+--:-:-:-:1      IADD3  readEs, readEs, 4x<szShareI>, tid32;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareI + szShareE>;
+
+// Remap ctrs for better L1 cache performance with small N
+// Maximize the amount of overlapping data requested within a warp.
+// The L1 is partitioned into 2 groups of 2 warps.
+// ctrs = idx_C*64 + tidX*4
+--:-:-:-:1      SHL    tidX4, tidX,  2;
+--:-:-:-:1      ISCADD ctrs0, idx_C, tidX4, 6;
+--:-:-:-:1      IADD   ctrs1, ctrs0, 1;
+--:-:-:-:1      IADD   ctrs2, ctrs0, 2;
+--:-:-:-:1      IADD   ctrs3, ctrs0, 3;
+
+// c   = ctrs / TRS
+--:-:-:-:1      XMAD.LO2C c0, ctrs0, param_magic_TRS, RZ;
+--:-:-:-:1      XMAD.LO2C c1, ctrs1, param_magic_TRS, RZ;
+--:-:-:-:1      XMAD.LO2C c2, ctrs2, param_magic_TRS, RZ;
+--:-:-:-:1      XMAD.LO2C c3, ctrs3, param_magic_TRS, RZ;
+--:-:-:-:1      SHR.U32   c0,    c0, param_shift_TRS;
+--:-:-:-:1      SHR.U32   c1,    c1, param_shift_TRS;
+--:-:-:-:1      SHR.U32   c2,    c2, param_shift_TRS;
+--:-:-:-:1      SHR.U32   c3,    c3, param_shift_TRS;
+// trs = ctrs % TRS
+--:-:-:-:1      IADD neg_TRS, RZ, -param_TRS;
+--:-:-:-:1      XMAD.S16.U16 trs0, neg_TRS, c0, ctrs0;
+--:-:-:-:1      XMAD.S16.U16 trs1, neg_TRS, c1, ctrs1;
+--:-:-:-:1      XMAD.S16.U16 trs2, neg_TRS, c2, ctrs2;
+--:-:-:-:1      XMAD.S16.U16 trs3, neg_TRS, c3, ctrs3;
+
+// t =  trs / RS
+--:-:-:-:1      XMAD    t0, trs0, param_magic_RS, RZ;
+--:-:-:-:1      XMAD    t1, trs1, param_magic_RS, RZ;
+--:-:-:-:1      XMAD    t2, trs2, param_magic_RS, RZ;
+--:-:-:-:1      XMAD    t3, trs3, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32 t0,   t0, param_shift_RS;
+--:-:-:-:1      SHR.U32 t1,   t1, param_shift_RS;
+--:-:-:-:1      SHR.U32 t2,   t2, param_shift_RS;
+--:-:-:-:1      SHR.U32 t3,   t3, param_shift_RS;
+// rs = trs % RS
+--:-:-:-:1      IADD neg_RS, RZ, -param_RS;
+--:-:-:-:1      XMAD.S16.U16 rs0, neg_RS, t0, trs0;
+--:-:-:-:1      XMAD.S16.U16 rs1, neg_RS, t1, trs1;
+--:-:-:-:1      XMAD.S16.U16 rs2, neg_RS, t2, trs2;
+--:-:-:-:1      XMAD.S16.U16 rs3, neg_RS, t3, trs3;
+
+// r = rs / S
+--:-:-:-:1      XMAD    r0, rs0, param_magic_S, RZ;
+--:-:-:-:1      XMAD    r1, rs1, param_magic_S, RZ;
+--:-:-:-:1      XMAD    r2, rs2, param_magic_S, RZ;
+--:-:-:-:1      XMAD    r3, rs3, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32 r0,  r0, param_shift_S;
+--:-:-:-:1      SHR.U32 r1,  r1, param_shift_S;
+--:-:-:-:1      SHR.U32 r2,  r2, param_shift_S;
+--:-:-:-:1      SHR.U32 r3,  r3, param_shift_S;
+// s = rs % S
+--:-:-:-:1      IADD neg_S, RZ, -param_S;
+--:-:-:-:1      XMAD.S16.U16 s0, neg_S, r0, rs0;
+--:-:-:-:1      XMAD.S16.U16 s1, neg_S, r1, rs1;
+--:-:-:-:1      XMAD.S16.U16 s2, neg_S, r2, rs2;
+--:-:-:-:1      XMAD.S16.U16 s3, neg_S, r3, rs3;
+
+--:-:-:-:1      LOP.AND n, tid, param_superN;
+--:-:-:-:1      SHL n, n, 2;
+
+// M,C,K are static coords so compute offsets and predicates once
+--:-:-:-:1      SHL m, idx_M, param_shiftM;
+--:-:-:-:1      BFE.U32 super_m, tid7, param_superM;
+--:-:-:-:1      IADD m, m, super_m;
+
+// z = m * str_d - pad_d + (t * dil_d)
+--:-:-:-:1      XMAD  mt, m,   param_str_d, RZ;
+
+--:-:-:-:1      XMAD  z0, t0,  param_dil_d, mt;
+--:-:-:-:1      XMAD  z1, t1,  param_dil_d, mt;
+--:-:-:-:1      XMAD  z2, t2,  param_dil_d, mt;
+--:-:-:-:1      XMAD  z3, t3,  param_dil_d, mt;
+--:-:-:-:1      IADD  z0, z0, -param_pad_d;
+--:-:-:-:1      IADD  z1, z1, -param_pad_d;
+--:-:-:-:1      IADD  z2, z2, -param_pad_d;
+--:-:-:-:1      IADD  z3, z3, -param_pad_d;
+
+// czOffset = c*DHWN + z*HWN
+--:-:-:-:1      XMAD.LO2C czOffset0, c0, param_DHWN, RZ;
+--:-:-:-:1      XMAD.LO2C czOffset1, c1, param_DHWN, RZ;
+--:-:-:-:1      XMAD.LO2C czOffset2, c2, param_DHWN, RZ;
+--:-:-:-:1      XMAD.LO2C czOffset3, c3, param_DHWN, RZ;
+--:-:-:-:1      XMAD.S16.U16.LO2C czOffset0, z0, param_HWN,  czOffset0;
+--:-:-:-:1      XMAD.S16.U16.LO2C czOffset1, z1, param_HWN,  czOffset1;
+--:-:-:-:1      XMAD.S16.U16.LO2C czOffset2, z2, param_HWN,  czOffset2;
+--:-:-:-:1      XMAD.S16.U16.LO2C czOffset3, z3, param_HWN,  czOffset3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, c0, param_C, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, c1, param_C, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, c2, param_C, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, c3, param_C, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, z0, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_D, P1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_D, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_D, P3;
+--:-:-:-:1      ISETP.GE.AND P0, PT, z0, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;
+--:-:-:-:1      P2R predI, PR, RZ, 0x0f;
+--:-:-:-:1      SHL predI, predI, 8;
+
+// k = idx_K*32 + tidX
+--:-:-:-:1      ISCADD k, idx_K, tidX, 5;
+
+// kmOffset = k*MPQN + m*PQN
+--:-:-:-:1      XMAD.LO2C kmOffset, k, param_MPQN, RZ;
+--:-:-:-:1      XMAD.LO2C kmOffset, m, param_PQN,  kmOffset;
+
+--:-:-:-:1      IADD k16, k, 16;
+--:-:-:-:1      ISETP.LT.AND P4, PT, m,   param_M, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, k,   param_K, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k16, param_K, P4;
+--:-:-:-:1      P2R predE, PR, RZ, 0x03;
+--:-:-:-:1      SHL predE, predE, 2;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      CAL CALC_OFFSETS;
+--:-:-:-:5      CAL DO_LOADS;
+--:-:-:-:5      CAL CALC_OFFSETS;
+
+[+
+    our $convert_in;   # 'F2F.F32.F16' when $type eq 'h', otherwise '' so nothing is emitted
+    return $convert_in ? qq{
+02:-:-:-:1      $convert_in I03, I01.H1;
+--:-:-:-:1      $convert_in I02, I01.H0;
+--:-:-:-:1      $convert_in I01, I00.H1;
+--:-:-:-:1      $convert_in I00, I00.H0;
+
+--:-:-:-:1      $convert_in I13, I11.H1;
+--:-:-:-:1      $convert_in I12, I11.H0;
+--:-:-:-:1      $convert_in I11, I10.H1;
+--:-:2:-:1      $convert_in I10, I10.H0;
+
+04:-:-:-:1      $convert_in I23, I21.H1;
+--:-:-:-:1      $convert_in I22, I21.H0;
+--:-:-:-:1      $convert_in I21, I20.H1;
+--:-:-:-:1      $convert_in I20, I20.H0;
+
+--:-:-:-:1      $convert_in I33, I31.H1;
+--:-:-:-:1      $convert_in I32, I31.H0;
+--:-:-:-:1      $convert_in I31, I30.H1;
+--:-:3:-:1      $convert_in I30, I30.H0;
+
+08:-:-:-:1      $convert_in E03, E01.H1;
+--:-:-:-:1      $convert_in E02, E01.H0;
+--:-:-:-:1      $convert_in E01, E00.H1;
+--:-:4:-:1      $convert_in E00, E00.H0;
+
+10:-:-:-:1      $convert_in E13, E11.H1;
+--:-:-:-:1      $convert_in E12, E11.H0;
+--:-:-:-:1      $convert_in E11, E10.H1;
+--:-:5:-:1      $convert_in E10, E10.H0;
+        } : '';   # each group converts highest half first, so a source register is read before it is overwritten
++]
+// Store the staged I and E values to shared memory, then prime the first iteration and enter MAIN_LOOP.
+02:-:-:-:1      STS [writeIs + 4x<0*64 + 0*16>], I00;  // NOTE(review): leading field is a wait mask; its producers are issued above this point
+--:-:-:-:1      STS [writeIs + 4x<1*64 + 0*16>], I01;
+--:-:-:-:1      STS [writeIs + 4x<2*64 + 0*16>], I02;
+--:-:-:-:1      STS [writeIs + 4x<3*64 + 0*16>], I03;
+
+--:-:-:-:1      STS [writeIs + 4x<0*64 + 1*16>], I10;
+--:-:-:-:1      STS [writeIs + 4x<1*64 + 1*16>], I11;
+--:-:-:-:1      STS [writeIs + 4x<2*64 + 1*16>], I12;
+--:-:-:-:1      STS [writeIs + 4x<3*64 + 1*16>], I13;
+
+04:-:-:-:1      STS [writeIs + 4x<0*64 + 2*16>], I20;
+--:-:-:-:1      STS [writeIs + 4x<1*64 + 2*16>], I21;
+--:-:-:-:1      STS [writeIs + 4x<2*64 + 2*16>], I22;
+--:-:-:-:1      STS [writeIs + 4x<3*64 + 2*16>], I23;
+
+--:-:-:-:1      STS [writeIs + 4x<0*64 + 3*16>], I30;
+--:-:-:-:1      STS [writeIs + 4x<1*64 + 3*16>], I31;
+--:-:-:-:1      STS [writeIs + 4x<2*64 + 3*16>], I32;
+--:-:-:-:1      STS [writeIs + 4x<3*64 + 3*16>], I33;
+
+08:-:-:-:1      STS [writeEs + 4x<0*32 + 0*16>], E00;
+--:-:-:-:1      STS [writeEs + 4x<1*32 + 0*16>], E01;
+--:-:-:-:1      STS [writeEs + 4x<2*32 + 0*16>], E02;
+--:-:-:-:1      STS [writeEs + 4x<3*32 + 0*16>], E03;
+
+10:-:-:-:1      STS [writeEs + 4x<0*32 + 1*16>], E10;
+--:-:-:-:1      STS [writeEs + 4x<1*32 + 1*16>], E11;
+--:-:-:-:1      STS [writeEs + 4x<2*32 + 1*16>], E12;
+--:-:-:-:1      STS [writeEs + 4x<3*32 + 1*16>], E13;
+
+// init = bNextY ? 1 : 0   (the SEL below yields 1 when P6 is set and RZ otherwise)
+--:-:-:-:0      SEL init, RZ, 1, !P6;
+
+--:-:-:-:5      BAR.SYNC 0;  // all shared-memory stores above must land before the LDS reads below
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD writeIs, writeIs, swapBuf;  // move both write pointers by swapBuf ...
+--:-:-:-:1      IADD writeEs, writeEs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;  // ... then negate swapBuf so the next swap moves back
+
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*64 + 00>];  // first operands for the j0 register set
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy4, [readIs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*32 + 16>];  // sets write barrier 1; MAIN_LOOP's first FFMA waits with mask 01
+</SCHEDULE_BLOCK>
+--:-:-:-:5      CAL DO_LOADS;  // issue the next I/E loads and advance n, idx_Q, idx_P (rewrites P4-P6)
+
+// init += bNextY ? 1 : 0
+--:-:-:-:0  @P6 IADD init, init, 1;
+
+--:-:-:-:5      CAL CALC_OFFSETS;  // rebuild track pointers and bounds bits for the advanced indices
+--:-:-:-:5      BRA.U MAIN_LOOP;
+
+DO_LOADS:
+// Subroutine: reloads I0-I3 and E0-E1 (LDG when the lane predicate is set, LDS from addr_zero otherwise), then advances n, idx_Q and idx_P.
+<SCHEDULE_BLOCK>
+<ORDERED>
+--:-:-:-:1      R2P PR, predI, 0x0f;  // mask 0x0f: P0-P3 come from the low four bits of predI
+--:-:2:-:1  @P0 LDG.E.CI.[+ vec_size() +] I0, [track0I];  // I0/I1 share write barrier 2
+--:-:2:-:1  @P1 LDG.E.CI.[+ vec_size() +] I1, [track1I];
+--:-:3:-:1  @P2 LDG.E.CI.[+ vec_size() +] I2, [track2I];  // I2/I3 share write barrier 3
+--:-:3:-:1  @P3 LDG.E.CI.[+ vec_size() +] I3, [track3I];
+--:-:-:-:1 @!P0 LDS.U.[+ vec_size() +]    I0, [addr_zero];  // NOTE(review): addr_zero presumably holds zeros - confirm
+--:-:-:-:1 @!P1 LDS.U.[+ vec_size() +]    I1, [addr_zero];
+--:-:-:-:1 @!P2 LDS.U.[+ vec_size() +]    I2, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.[+ vec_size() +]    I3, [addr_zero];
+
+--:-:-:-:1      R2P PR, predE, 0x03;  // mask 0x03: only P0-P1 are rewritten, from the low two bits of predE
+--:-:4:-:1  @P0 LDG.E.CI.[+ vec_size() +] E0, [track0E];
+--:6:5:-:1  @P1 LDG.E.CI.[+ vec_size() +] E1, [track1E];  // also sets read barrier 6
+--:-:-:-:1 @!P0 LDS.U.[+ vec_size() +]    E0, [addr_zero];
+--:-:2:-:1 @!P1 LDS.U.[+ vec_size() +]    E1, [addr_zero];
+</ORDERED>
+
+// Advance offset/preds
+--:-:-:-:1      IADD n, n, param_loopN;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;  // P4 = n < param_N
+
+--:-:-:-:1 @!P4 LOP.AND n, tid7, param_superN;  // n ran past param_N: n = (tid7 & param_superN) << 2 ...
+--:-:-:-:1 @!P4 SHL n, n, 2;
+--:-:-:-:1 @!P4 IADD idx_Q, idx_Q, param_strideQ;  // ... and step idx_Q
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, idx_Q, param_gridQ, PT;  // P5 = idx_Q < param_gridQ
+
+--:-:-:-:1 @!P5 MOV  idx_Q, start_Q;  // idx_Q ran past param_gridQ: restart at start_Q and step idx_P
+--:-:-:-:1 @!P5 IADD idx_P, idx_P, param_strideP;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, idx_P, param_gridP, PT;  // P6 = idx_P < param_gridP
+--:-:-:-:0      ISETP.LT.AND P5, PT, idx_Q, param_gridQ, P6;  // P5 = (idx_Q < param_gridQ) && P6
+
+--:-:-:-:1 @!P6 MOV predI, RZ;  // idx_P out of range: clear every load predicate bit
+--:-:-:-:1 @!P6 MOV predE, RZ;
+
+</SCHEDULE_BLOCK>
+--:-:-:-:5      RET;
+
+--:-:-:-:1      NOP;
+--:-:-:-:1      NOP;
+--:-:-:-:1      NOP;
+--:-:-:-:1      NOP;
+
+CALC_OFFSETS:
+// Subroutine: from idx_P/idx_Q recomputes p, q, y0-y3, x0-x3, the I and E track pointers and the predI/predE bounds bits.
+<SCHEDULE_BLOCK>
+// Calc superblock coordinates in m,p,q space
+--:-:-:-:1      SHL p, idx_P, param_shiftP;
+--:-:-:-:1      SHL q, idx_Q, param_shiftQ;
+
+// Calc this thread's offset within the superblock
+--:-:-:-:1      BFE.U32 super_p, tid7, param_superP;
+--:-:-:-:1      BFE.U32 super_q, tid7, param_superQ;
+
+// Combine offsets for final m,p,q coordinate
+--:-:-:-:1      IADD p, p, super_p;
+--:-:-:-:1      IADD q, q, super_q;
+
+// y = p * str_h - pad_h + (r * dil_h)
+// x = q * str_w - pad_w + (s * dil_w)
+--:-:-:-:1      XMAD  pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD  qs, q,   param_str_w, RZ;
+
+--:-:-:-:1      XMAD  y0, r0,  param_dil_h, pr;
+--:-:-:-:1      XMAD  y1, r1,  param_dil_h, pr;
+--:-:-:-:1      XMAD  y2, r2,  param_dil_h, pr;
+--:-:-:-:1      XMAD  y3, r3,  param_dil_h, pr;
+--:-:-:-:1      IADD  y0, y0, -param_pad_h;
+--:-:-:-:1      IADD  y1, y1, -param_pad_h;
+--:-:-:-:1      IADD  y2, y2, -param_pad_h;
+--:-:-:-:1      IADD  y3, y3, -param_pad_h;
+
+--:-:-:-:1      XMAD  x0, s0,  param_dil_w, qs;
+--:-:-:-:1      XMAD  x1, s1,  param_dil_w, qs;
+--:-:-:-:1      XMAD  x2, s2,  param_dil_w, qs;
+--:-:-:-:1      XMAD  x3, s3,  param_dil_w, qs;
+--:-:-:-:1      IADD  x0, x0, -param_pad_w;
+--:-:-:-:1      IADD  x1, x1, -param_pad_w;
+--:-:-:-:1      IADD  x2, x2, -param_pad_w;
+--:-:-:-:1      IADD  x3, x3, -param_pad_w;
+
+// trackI = c*DHWN + z*HWN + y*WN + x*N + n
+--:-:-:-:1      XMAD.S16.U16.LO2C ti0, y0, param_WN, n;
+--:-:-:-:1      XMAD.S16.U16.LO2C ti1, y1, param_WN, n;
+--:-:-:-:1      XMAD.S16.U16.LO2C ti2, y2, param_WN, n;
+--:-:-:-:1      XMAD.S16.U16.LO2C ti3, y3, param_WN, n;
+--:-:-:-:1      XMAD.S16.U16 ti0, x0, param_N,  ti0;
+--:-:-:-:1      XMAD.S16.U16 ti1, x1, param_N,  ti1;
+--:-:-:-:1      XMAD.S16.U16 ti2, x2, param_N,  ti2;
+--:-:-:-:1      XMAD.S16.U16 ti3, x3, param_N,  ti3;
+--:-:-:-:1      IADD ti0, ti0, czOffset0;
+--:-:-:-:1      IADD ti1, ti1, czOffset1;
+--:-:-:-:1      IADD ti2, ti2, czOffset2;
+--:-:-:-:1      IADD ti3, ti3, czOffset3;
+
+20:-:-:-:1      LEA    track0I0.CC, ti0, param_I[0], [+ dtype_shift() +];  // low word of the 64-bit address, carry out in CC
+--:-:-:-:1      ISET.LT.AND    ti0, ti0, RZ, PT;  // NOTE(review): presumably all-ones when ti0 < 0, i.e. its sign extension - confirm
+--:-:-:-:1      IADD.X track0I1,    ti0, param_I[1];  // high word = param_I[1] + ti0 + carry
+--:-:-:-:1      LEA    track1I0.CC, ti1, param_I[0], [+ dtype_shift() +];
+--:-:-:-:1      ISET.LT.AND    ti1, ti1, RZ, PT;
+--:-:-:-:1      IADD.X track1I1,    ti1, param_I[1];
+--:-:-:-:1      LEA    track2I0.CC, ti2, param_I[0], [+ dtype_shift() +];
+--:-:-:-:1      ISET.LT.AND    ti2, ti2, RZ, PT;
+--:-:-:-:1      IADD.X track2I1,    ti2, param_I[1];
+--:-:-:-:1      LEA    track3I0.CC, ti3, param_I[0], [+ dtype_shift() +];
+--:-:-:-:1      ISET.LT.AND    ti3, ti3, RZ, PT;
+--:-:-:-:1      IADD.X track3I1,    ti3, param_I[1];
+
+--:-:-:-:1      SHR.U32 predI, predI, 8;  // bring the bits stored at 8-11 down to bits 0-3
+--:-:-:-:1      R2P PR, predI, 0x0f;  // seed P0-P3 from those bits
+--:-:-:-:1      SHL     predI, predI, 4;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, y0, param_H, P0;  // P0-P3 &= (0 <= y < param_H)
+--:-:-:-:1      ISETP.LT.AND P1, PT, y1, param_H, P1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y2, param_H, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y3, param_H, P3;
+--:-:-:-:1      ISETP.GE.AND P0, PT, y0, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, y1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, y2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, y3, RZ, P3;
+--:-:-:-:1      P2R predI, PR, predI, 0x0f;  // NOTE(review): presumably merges P0-P3 into the low four bits of predI - confirm
+--:-:-:-:1      SHL predI, predI, 4;  // keep the y-tested bits one nibble up before the x tests
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, x0, param_W, P0;  // P0-P3 &= (0 <= x < param_W)
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_W, P1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_W, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_W, P3;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x0, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      P2R predI, PR, predI, 0x0f;
+
+// trackE = k*MPQN + m*PQN + p*QN + q*N + n
+--:-:-:-:1      XMAD.LO2C te, p, param_QN,   n;
+--:-:-:-:1      XMAD      te, q, param_N,    te;
+--:-:-:-:1      IADD      te, te, kmOffset;
+
+--:-:-:-:1      LEA      track0E0.CC, te, param_E[0],     [+ dtype_shift() +];
+--:-:-:-:1      LEA.HI.X track0E1,    te, param_E[1], RZ, [+ dtype_shift() +];
+--:-:-:-:1      IADD     track1E0.CC, track0E0, param_MPQN16p;  // track1E = track0E + param_MPQN16p
+--:-:-:-:0      IADD.X   track1E1,    track0E1, RZ;
+
+--:-:-:-:1      ISET.LT.AND qIn, p, param_P, PT;  // NOTE(review): qIn tests p and pIn tests q; both are ANDed below, so the swapped names are harmless
+--:-:-:-:1      ISET.LT.AND pIn, q, param_Q, PT;
+--:-:-:-:1      SHR.U32  predEt, predE, 2;
+--:-:-:-:1      LOP3.LUT predEt, predEt, pIn, qIn, 0x80;  // LUT 0x80 is the three-input AND
+--:-:-:-:1      BFI predE, predEt, 0x200, predE;  // NOTE(review): presumably inserts two bits of predEt at bit 0 of predE - confirm the 0x200 encoding
+
+</SCHEDULE_BLOCK>
+--:-:-:-:5      RET;
+
+
+MAIN_LOOP:
+[+
+    our ($vec_size, $convert_in, $largeN);
+    my %insert = (
+        # Key "j<J>c<C>": text emitted right after FFMA number C of unrolled step J (see the loops below).
+        j0c8  => "--:-:-:-:1      R2P PR, predI, 0x0f;\n",
+
+        $convert_in ? (
+            j1c5  => "--:-:-:-:1      DEPBAR.LE SB1, 1;\n",
+            j1c8  => "--:-:-:-:1      $convert_in I03, I01.H1;\n",
+            j1c10 => "--:-:-:-:1      $convert_in I02, I01.H0;\n",
+            j1c12 => "--:-:-:-:1      $convert_in I01, I00.H1;\n",
+            j1c14 => "--:-:6:-:1      $convert_in I00, I00.H0;\n",
+
+            j2c5  => "--:-:-:-:1      DEPBAR.LE SB1, 1;\n",
+            j2c8  => "--:-:-:-:1      $convert_in I13, I11.H1;\n",
+            j2c10 => "--:-:-:-:1      $convert_in I12, I11.H0;\n",
+            j2c12 => "--:-:-:-:1      $convert_in I11, I10.H1;\n",
+            j2c14 => "--:-:6:-:1      $convert_in I10, I10.H0;\n",
+
+            j3c5  => "--:-:-:-:1      DEPBAR.LE SB2, 1;\n",
+            j3c8  => "--:-:-:-:1      $convert_in I23, I21.H1;\n",
+            j3c10 => "--:-:-:-:1      $convert_in I22, I21.H0;\n",
+            j3c12 => "--:-:-:-:1      $convert_in I21, I20.H1;\n",
+            j3c14 => "--:-:6:-:1      $convert_in I20, I20.H0;\n",
+
+            j4c5  => "--:-:-:-:1      DEPBAR.LE SB2, 1;\n",
+            j4c8  => "--:-:-:-:1      $convert_in I33, I31.H1;\n",
+            j4c10 => "--:-:-:-:1      $convert_in I32, I31.H0;\n",
+            j4c12 => "--:-:-:-:1      $convert_in I31, I30.H1;\n",
+            j4c14 => "--:-:6:-:1      $convert_in I30, I30.H0;\n",
+
+            j5c8  => "08:-:-:-:1      $convert_in E03, E01.H1;\n",
+            j5c10 => "--:-:-:-:1      $convert_in E02, E01.H0;\n",
+            j5c12 => "--:-:-:-:1      $convert_in E01, E00.H1;\n",
+            j5c14 => "--:-:4:-:1      $convert_in E00, E00.H0;\n",
+
+            j6c8  => "10:-:-:-:1      $convert_in E13, E11.H1;\n",
+            j6c10 => "--:-:-:-:1      $convert_in E12, E11.H0;\n",
+            j6c12 => "--:-:-:-:1      $convert_in E11, E10.H1;\n",
+            j6c14 => "--:-:5:-:1      $convert_in E10, E10.H0;\n",
+        ) : (   # no conversion: one load wait per step j1..j4, same SB pattern as the branch above
+            j1c27 => "--:-:-:-:1      DEPBAR.LE SB1, 1;\n",
+            j2c27 => "--:-:-:-:1      DEPBAR.LE SB1, 1;\n",
+            j3c27 => "--:-:-:-:1      DEPBAR.LE SB2, 1;\n",   # key must be j3: a repeated j2c27 would replace the SB1 wait
+            j4c27 => "--:-:-:-:1      DEPBAR.LE SB2, 1;\n",
+        ),
+
+        j1c30 => "20:-:-:-:1      STS [writeIs + 4x<0*64 + 0*16>], I00;\n",
+        j1c32 => "--:-:-:-:1      STS [writeIs + 4x<1*64 + 0*16>], I01;\n",
+        j1c34 => "--:-:-:-:1      STS [writeIs + 4x<2*64 + 0*16>], I02;\n",
+        j1c36 => "--:6:-:-:1      STS [writeIs + 4x<3*64 + 0*16>], I03;\n",
+        j1c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero];\n",
+        j1c60 => "20:-:2:-:1  \@P0 LDG.E.CI.$vec_size I0, [track0I];\n",
+
+        j2c30 => "20:-:-:-:1      STS [writeIs + 4x<0*64 + 1*16>], I10;\n",
+        j2c32 => "--:-:-:-:1      STS [writeIs + 4x<1*64 + 1*16>], I11;\n",
+        j2c34 => "--:-:-:-:1      STS [writeIs + 4x<2*64 + 1*16>], I12;\n",
+        j2c36 => "--:6:-:-:1      STS [writeIs + 4x<3*64 + 1*16>], I13;\n",
+        j2c38 => "--:-:-:-:1 \@!P1 LDS.U.$vec_size I1, [addr_zero];\n",
+        j2c60 => "20:-:2:-:1  \@P1 LDG.E.CI.$vec_size I1, [track1I];\n",
+
+        j3c30 => "20:-:-:-:1      STS [writeIs + 4x<0*64 + 2*16>], I20;\n",
+        j3c32 => "--:-:-:-:1      STS [writeIs + 4x<1*64 + 2*16>], I21;\n",
+        j3c34 => "--:-:-:-:1      STS [writeIs + 4x<2*64 + 2*16>], I22;\n",
+        j3c36 => "--:6:-:-:1      STS [writeIs + 4x<3*64 + 2*16>], I23;\n",
+        j3c38 => "--:-:-:-:1 \@!P2 LDS.U.$vec_size I2, [addr_zero];\n",
+        j3c60 => "20:-:3:-:1  \@P2 LDG.E.CI.$vec_size I2, [track2I];\n",
+
+        j4c30 => "20:-:-:-:1      STS [writeIs + 4x<0*64 + 3*16>], I30;\n",
+        j4c32 => "--:-:-:-:1      STS [writeIs + 4x<1*64 + 3*16>], I31;\n",
+        j4c34 => "--:-:-:-:1      STS [writeIs + 4x<2*64 + 3*16>], I32;\n",
+        j4c36 => "--:6:-:-:1      STS [writeIs + 4x<3*64 + 3*16>], I33;\n",
+        j4c38 => "--:-:-:-:1 \@!P3 LDS.U.$vec_size I3, [addr_zero];\n",
+        j4c60 => "20:-:3:-:1  \@P3 LDG.E.CI.$vec_size I3, [track3I];\n",
+
+        j5c7  => "--:-:-:-:1      R2P PR, predE, 0x0f;\n",
+
+        j5c30 => "08:-:-:-:1      STS [writeEs + 4x<0*32 + 0*16>], E00;\n",
+        j5c32 => "--:-:-:-:1      STS [writeEs + 4x<1*32 + 0*16>], E01;\n",
+        j5c34 => "--:-:-:-:1      STS [writeEs + 4x<2*32 + 0*16>], E02;\n",
+        j5c36 => "--:4:-:-:1      STS [writeEs + 4x<3*32 + 0*16>], E03;\n",
+        j5c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size E0, [addr_zero];\n",
+        j5c60 => "08:-:4:-:1  \@P0 LDG.E.CI.$vec_size E0, [track0E];\n",
+
+        j6c30 => "10:-:-:-:1      STS [writeEs + 4x<0*32 + 1*16>], E10;\n",
+        j6c32 => "--:-:-:-:1      STS [writeEs + 4x<1*32 + 1*16>], E11;\n",
+        j6c34 => "--:-:-:-:1      STS [writeEs + 4x<2*32 + 1*16>], E12;\n",
+        j6c36 => "--:5:-:-:1      STS [writeEs + 4x<3*32 + 1*16>], E13;\n",
+        j6c38 => "--:-:-:-:1 \@!P1 LDS.U.$vec_size E1, [addr_zero];\n",
+        j6c60 => "10:6:5:-:1  \@P1 LDG.E.CI.$vec_size E1, [track1E];\n",
+
+        j6c63 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readIs,  readIs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD readEs,  readEs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD writeIs, writeIs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD writeEs, writeEs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c15 => "--:-:-:-:1      PSETP.OR.AND P4, PT, P5, P6, PT;\n",
+        j7c17 => "--:-:-:-:1      IADD n, n, param_loopN;\n",
+        j7c27 => "--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, P4;\n",
+
+        $largeN ? (
+            j7c30 => "20:-:-:-:1      IADD   track0I0.CC, track0I0, param_loopNp;\n",
+            j7c35 => "--:-:-:-:1      IADD.X track0I1,    track0I1, RZ;\n" .
+                     "--:-:-:-:1      IADD   track1I0.CC, track1I0, param_loopNp;\n",
+            j7c40 => "--:-:-:-:1      IADD.X track1I1,    track1I1, RZ;\n" .
+                     "--:-:-:-:1      IADD   track2I0.CC, track2I0, param_loopNp;\n",
+            j7c45 => "--:-:-:-:1      IADD.X track2I1,    track2I1, RZ;\n" .
+                     "--:-:-:-:1      IADD   track3I0.CC, track3I0, param_loopNp;\n",
+            j7c50 => "--:-:-:-:1      IADD.X track3I1,    track3I1, RZ;\n" .
+                     "--:-:-:-:1      IADD   track0E0.CC, track0E0, param_loopNp;\n",
+            j7c55 => "--:-:-:-:1      IADD.X track0E1,    track0E1, RZ;\n" .
+                     "--:-:-:-:1      IADD   track1E0.CC, track1E0, param_loopNp;\n",
+            j7c60 => "--:-:-:-:1      IADD.X track1E1,    track1E1, RZ;\n",
+        ) : (),
+
+        j7c63 => "--:-:-:Y:5  \@P4 BRA.U MAIN_LOOP;\n",
+    );
+    my @cOrder;    # the 64 (x, y) accumulator coordinates, in the order the FFMAs are emitted
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) & 7;
+        my $shift    = ((($j + 1) & 7) >> 2) << 2;
+        # Operand loads for step j+1 go into the other register set (j0 and j1 alternate with the parity of j).
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy0, [readIs + 4x<%d*64 + 00 + %d>];\n", $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1      LDS.U.128 j%dEx0, [readEs + 4x<%d*32 + 00 + %d>];\n", $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy4, [readIs + 4x<%d*64 + 32 + %d>];\n", $nOdd, $rsOffset, $shift;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1      LDS.U.128 j%dEx4, [readEs + 4x<%d*32 + 16 + %d>];\n", $nOdd, $rsOffset, $shift;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # FFMA stall count is 0 when the first inserted line holds one of the listed opcodes, otherwise 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1;
+            # The yield flag is requested once per step, at FFMA 25, and only when that FFMA has stall 1.
+            my $yield  = $c == 25 && $stall ? 'Y' : '-';
+            # The first FFMA of each step waits with mask 01; the c6 load above sets write barrier 1.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+// Advance x/q offsets+preds
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD x0, x0, param_loopX;
+--:-:-:-:1      IADD x1, x1, param_loopX;
+--:-:-:-:1      IADD x2, x2, param_loopX;
+--:-:-:-:1      IADD x3, x3, param_loopX;
+20:-:-:-:1      IADD   track0I0.CC, track0I0, param_loopXp;  // 64-bit pointer += param_loopXp (low word, then carry into high word)
+--:-:-:-:1      IADD.X track0I1,    track0I1, RZ;
+--:-:-:-:1      IADD   track1I0.CC, track1I0, param_loopXp;
+--:-:-:-:1      IADD.X track1I1,    track1I1, RZ;
+--:-:-:-:1      IADD   track2I0.CC, track2I0, param_loopXp;
+--:-:-:-:1      IADD.X track2I1,    track2I1, RZ;
+--:-:-:-:1      IADD   track3I0.CC, track3I0, param_loopXp;
+--:-:-:-:1      IADD.X track3I1,    track3I1, RZ;
+
+--:-:-:-:1      SHR.U32 predI, predI, 4;  // drop the old x-tested bits; the bits stored at 4-7 move down to 0-3
+--:-:-:-:1  @P6 R2P PR, predI, 0x0f;  // P0-P3 are reloaded only while P6 is set
+--:-:-:-:1      SHL     predI, predI, 4;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, x0, param_W, P0;  // P0-P3 &= (0 <= x < param_W) for the advanced x
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_W, P1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_W, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_W, P3;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x0, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      P2R predI, PR, predI, 0x0f;
+
+--:-:-:-:1      IADD q, q, param_loopQ;
+--:-:-:-:1      ISETP.LT.AND P4, PT, q, param_Q, PT;  // P4 = q < param_Q
+--:-:-:-:1 @!P4 LOP.AND predE, predE, 0xc;  // q out of range: clear predE bits 0-1, keep bits 2-3
+
+--:-:-:-:1      IADD   track0E0.CC, track0E0, param_loopQp;
+--:-:-:-:1      IADD.X track0E1,    track0E1, RZ;
+--:-:-:-:1      IADD   track1E0.CC, track1E0, param_loopQp;  // its carry is consumed by the IADD.X after this block
+
+--:-:-:-:1      IADD idx_Q, idx_Q, param_strideQ;
+--:-:-:-:1      ISETP.LT.AND P5, PT, idx_Q, param_gridQ, P6;  // P5 = (idx_Q < param_gridQ) && P6
+
+--:-:-:-:1      LOP.AND n, tid7, param_superN;  // n = (tid7 & param_superN) << 2
+--:-:-:-:1      SHL n, n, 2;
+
+</SCHEDULE_BLOCK>
+--:-:-:-:0      IADD.X track1E1,    track1E1, RZ;
+--:-:-:Y:5  @P5 BRA.U MAIN_LOOP;
+
+// Advance y/p offsets+preds
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV  idx_Q, start_Q;
+--:-:-:-:1      IADD idx_P, idx_P, param_strideP;
+
+--:-:-:-:1      PSETP.AND.AND P5, PT, PT, PT, PT;  // P5 = true
+--:-:-:Y:d      ISETP.LT.AND  P6, PT, idx_P, param_gridP, PT;  // P6 = idx_P < param_gridP
+</SCHEDULE_BLOCK>
+--:-:-:Y:5 @!P6 BRA.U FINISH_LOOP;  // idx_P ran past param_gridP
+--:-:-:-:5      CAL CALC_OFFSETS;
+--:-:-:Y:5  @P6 BRA.U MAIN_LOOP;
+
+// Set n to loop remaining times
+FINISH_LOOP:
+--:-:-:-:1      LOP.AND.NZ P5, RZ, init, 3;  // P5 = (init & 3) != 0
+--:-:-:-:1      MOV predI, RZ;  // no further global loads: clear both predicate words
+--:-:-:-:1      MOV predE, RZ;
+--:-:-:-:1      MOV loopN, param_loopN;
+--:-:-:Y:8      MOV N, param_N;
+--:-:-:-:1      VMAD.U16.U16 n, -init, loopN, N;  // NOTE(review): presumably n = N - init*loopN on 16-bit operands - confirm
+--:-:-:-:0      MOV init, RZ;  // init is cleared, so a later pass through FINISH_LOOP sees P5 false
+01:-:-:Y:5  @P5 BRA.U MAIN_LOOP;
+
+// Epilogue: scale the accumulators by alpha, pass them through shared memory (writeFs/readFs) and store them with four STORE_F calls.
+--:-:1:-:2      S2R Tid, SR_TID.X;
+<SCHEDULE_BLOCK>
+01:-:-:-:1      SHR.U32 tid_32, Tid, 5;
+--:-:-:-:1      LOP.AND tid_31, Tid, 31;
+
+// readFs = ((tid_32 << 7) + tid_31) << 2
+--:-:-:-:1      ISCADD readFs, tid_32, tid_31, 7;
+--:-:-:-:1      SHL    readFs, readFs, 2;
+
+// kk = idx_K*32 + tid_31;
+--:-:-:-:1      ISCADD kk, idx_K, tid_31, 5;
+// kk < K
+--:-:-:-:1      ISETP.LT.AND P4, PT, kk, param_K, PT;
+
+// crst = idx_C*64 + tid_32*4
+--:-:-:-:1      SHL     tid_32, tid_32, 2;
+--:-:-:-:1      ISCADD  crst00, idx_C, tid_32, 6;
+--:-:-:-:1      IADD    crst04, crst00, 16;
+--:-:-:-:1      IADD    crst08, crst00, 32;
+--:-:-:-:1      IADD    crst12, crst00, 48;
+
+--:-:-:-:1      MOV K, param_K;
+--:-:-:-:1      SHL K1,  K, 2;  // K1 = 4*K, the pointer step STORE_F applies after each store
+--:-:-:-:1      SHL K16, K, 6;  // K16 = 64*K, the spacing between the four track pointers below
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+// trackF += crst*K + k;
+--:-:-:-:1      XMAD.LO2 tf, crst00, K, kk;
+[+
+    our $determ;
+    return $determ ? q{
+// idx_MPQ = idx_M * grid_PQ + idx_P * grid_Q + idx_Q
+// trackF += idx_MPQ * CRSTK
+--:-:-:-:1      XMAD      idx_MPQ, start_P, param_strideQ, start_Q;
+--:-:-:-:1      XMAD.LO2C idx_MPQ, idx_M,  param_stridePQ, idx_MPQ;
+--:-:-:-:1      XMAD.LO   tf, idx_MPQ, param_CTRSK, tf, xmad_determ;
+    } : '';
++]
+--:-:-:-:1      LEA      track00F0.CC, tf, param_F[0],     2;  // 64-bit address = param_F + (tf << 2)
+--:-:-:-:1      LEA.HI.X track00F1,    tf, param_F[1], RZ, 2;
+--:-:-:-:1      IADD     track04F0.CC, track00F0, K16;
+--:-:-:-:1      IADD.X   track04F1,    track00F1, RZ;
+--:-:-:-:1      IADD     track08F0.CC, track04F0, K16;
+--:-:-:-:1      IADD.X   track08F1,    track04F1, RZ;
+--:-:-:-:1      IADD     track12F0.CC, track08F0, K16;
+--:-:-:-:1      IADD.X   track12F1,    track08F1, RZ;
+
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:1      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:1      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:1      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:1      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:1      STS.128 [writeFs+4x<0*128 + 00>], shuffle_x0y0;  // rows y0-y3 go to shared memory first
+--:-:-:-:1      STS.128 [writeFs+4x<0*128 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeFs+4x<1*128 + 00>], shuffle_x0y1;
+--:-:-:-:1      STS.128 [writeFs+4x<1*128 + 16>], shuffle_x4y1;
+--:-:-:-:1      STS.128 [writeFs+4x<2*128 + 00>], shuffle_x0y2;
+--:-:-:-:1      STS.128 [writeFs+4x<2*128 + 16>], shuffle_x4y2;
+--:-:-:-:1      STS.128 [writeFs+4x<3*128 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeFs+4x<3*128 + 16>], shuffle_x4y3;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_F;
+--:-:-:-:0      IADD readFs, readFs, 4x<16*128 + 4*16>;  // second call reads from this offset
+--:-:-:-:5      CAL STORE_F;
+
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha;
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:0      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:5      BAR.SYNC 0;  // barrier between the STORE_F reads above and the STS.128 writes below
+<SCHEDULE_BLOCK>
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:1      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:1      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:1      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:1      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:1      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:1      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:1      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:1      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:1      STS.128 [writeFs+4x<0*128 + 00>], shuffle_x0y4;  // rows y4-y7 reuse the same shared-memory slots
+--:-:-:-:1      STS.128 [writeFs+4x<0*128 + 16>], shuffle_x4y4;
+--:-:-:-:1      STS.128 [writeFs+4x<1*128 + 00>], shuffle_x0y5;
+--:-:-:-:1      STS.128 [writeFs+4x<1*128 + 16>], shuffle_x4y5;
+--:-:-:-:1      STS.128 [writeFs+4x<2*128 + 00>], shuffle_x0y6;
+--:-:-:-:1      STS.128 [writeFs+4x<2*128 + 16>], shuffle_x4y6;
+--:-:-:-:1      STS.128 [writeFs+4x<3*128 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeFs+4x<3*128 + 16>], shuffle_x4y7;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:0      IADD readFs, readFs, -4x<16*128 + 4*16>;  // undo the earlier offset before the third call
+--:-:-:-:5      CAL STORE_F;
+--:-:-:-:0      IADD readFs, readFs,  4x<16*128 + 4*16>;
+--:-:-:-:5      CAL STORE_F;
+
+--:-:-:-:5      EXIT;
+
+STORE_F:
+// Subroutine: reads four shared-memory values per output, sums them, writes up to four outputs (guarded by P0-P3) and steps each track pointer by K1.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CTRS, P4; // crst00 < CRST && k < K
+--:-:-:-:1      IADD crst00, crst00, 1;
+--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CTRS, P4; // crst04 < CRST && k < K
+--:-:-:-:1      IADD crst04, crst04, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CTRS, P4; // crst08 < CRST && k < K
+--:-:-:-:1      IADD crst08, crst08, 1;
+--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CTRS, P4; // crst12 < CRST && k < K
+--:-:-:-:1      IADD crst12, crst12, 1;
+<ORDERED>
+--:-:-:-:1      LDS f00_0, [readFs + 4x< 0*128 + 0*32 + 0*16>];
+--:-:-:-:1      LDS f00_1, [readFs + 4x< 0*128 + 1*32 + 0*16>];
+--:-:-:-:1      LDS f00_2, [readFs + 4x< 0*128 + 2*32 + 0*16>];
+--:-:1:Y:1      LDS f00_3, [readFs + 4x< 0*128 + 3*32 + 0*16>];  // the last load of each group sets write barrier 1-4
+--:-:-:-:1      LDS f04_0, [readFs + 4x< 4*128 + 0*32 + 1*16>];
+--:-:-:-:1      LDS f04_1, [readFs + 4x< 4*128 + 1*32 + 1*16>];
+--:-:-:-:1      LDS f04_2, [readFs + 4x< 4*128 + 2*32 + 1*16>];
+--:-:2:Y:1      LDS f04_3, [readFs + 4x< 4*128 + 3*32 + 1*16>];
+--:-:-:-:1      LDS f08_0, [readFs + 4x< 8*128 + 0*32 + 2*16>];
+--:-:-:-:1      LDS f08_1, [readFs + 4x< 8*128 + 1*32 + 2*16>];
+--:-:-:-:1      LDS f08_2, [readFs + 4x< 8*128 + 2*32 + 2*16>];
+--:-:3:Y:1      LDS f08_3, [readFs + 4x< 8*128 + 3*32 + 2*16>];
+--:-:-:-:1      LDS f12_0, [readFs + 4x<12*128 + 0*32 + 3*16>];
+--:-:-:-:1      LDS f12_1, [readFs + 4x<12*128 + 1*32 + 3*16>];
+--:-:-:-:1      LDS f12_2, [readFs + 4x<12*128 + 2*32 + 3*16>];
+--:-:4:Y:1      LDS f12_3, [readFs + 4x<12*128 + 3*32 + 3*16>];
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+01:-:-:-:1      FADD f00_0, f00_0, f00_1;  // pairwise sums; wait masks 01/02/04/08 match the four load groups
+--:-:-:-:1      FADD f00_2, f00_2, f00_3;
+02:-:-:-:1      FADD f04_0, f04_0, f04_1;
+--:-:-:-:1      FADD f04_2, f04_2, f04_3;
+04:-:-:-:1      FADD f08_0, f08_0, f08_1;
+--:-:-:-:1      FADD f08_2, f08_2, f08_3;
+08:-:-:-:1      FADD f12_0, f12_0, f12_1;
+--:-:-:-:1      FADD f12_2, f12_2, f12_3;
+
+--:-:-:-:1      FADD f00_0, f00_0, f00_2;  // fXX_0 now holds the sum of its four loaded values
+--:-:-:-:2      FADD f04_0, f04_0, f04_2;
+--:-:-:-:2      FADD f08_0, f08_0, f08_2;
+--:-:-:-:0      FADD f12_0, f12_0, f12_2;
+
+01:1:-:-:1  @P0 [+ output_op() +] [track00F], f00_0;  // each store sets read barrier 1-4 on its operands
+02:2:-:-:1  @P1 [+ output_op() +] [track04F], f04_0;
+04:3:-:-:1  @P2 [+ output_op() +] [track08F], f08_0;
+08:4:-:-:1  @P3 [+ output_op() +] [track12F], f12_0;
+
+01:-:-:-:6      IADD   track00F0.CC, track00F0, K1;  // waits on the store's read barrier before changing its address
+--:-:-:-:1      IADD.X track00F1,    track00F1, RZ;
+02:-:-:-:6      IADD   track04F0.CC, track04F0, K1;
+--:-:-:-:1      IADD.X track04F1,    track04F1, RZ;
+04:-:-:-:6      IADD   track08F0.CC, track08F0, K1;
+--:-:-:-:1      IADD.X track08F1,    track08F1, RZ;
+08:-:-:-:6      IADD   track12F0.CC, track12F0, K1;
+--:-:-:-:0      IADD.X track12F1,    track12F1, RZ;
+
+--:-:-:-:5      RET;
\ No newline at end of file
diff --git a/Kernel/Convolution/Pascal/xconv_direct_xprop_64x32.sass b/Kernel/Convolution/Pascal/xconv_direct_xprop_64x32.sass
new file mode 100644
index 0000000..4720ab8
--- /dev/null
+++ b/Kernel/Convolution/Pascal/xconv_direct_xprop_64x32.sass
@@ -0,0 +1,2477 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+    # Template parameters: declared here but never assigned in this block, so
+    # they are expected to be set before the template is evaluated.
+    our ($type, $SN, $N2, $N1);
+    # $LN is true only when none of $SN, $N2 or $N1 is set.
+    our $LN = !($SN || $N2 || $N1);
+    # Per-type strings and sizes: type 'h' picks the 16-bit values (U16,
+    # F16 conversions, 2-byte size), anything else the 32-bit ones.
+    our $dtype        = $type eq 'h' ?         'U16' : '32';
+    our $convert_in   = $type eq 'h' ? 'F2F.F32.F16' : '';
+    our $convert_out  = $type eq 'h' ? 'F2F.F16.F32' : '';
+    our $vsize        = $type eq 'h' ?          '64' : '128';
+    our $dshift       = $type eq 'h' ?           '1' : '2';
+    our $dsize        = $type eq 'h' ?           '2' : '4';
+    # Scale is 4 for $N1, 3 for $N2 and 2 otherwise; the offset is
+    # 1 << scale and the load is 8 << scale.
+    our $slice_scale  = $N1 ? 4 : $N2 ? 3 : 2;
+    our $slice_offset = 1 << $slice_scale;
+    our $slice_load   = 8 << $slice_scale;
+    # Function-style accessors for three of the values defined above.
+    sub dtype       { return $dtype;       }
+    sub dshift      { return $dshift; }
+    sub vsize       { return $vsize; }
+    # $vsizeI starts at the widest value for $type and halves for $N2 and
+    # again for $N1.
+    our $vsizeI;
+    if ($type eq 'h')
+        { $vsizeI = $N1 ? 'U16' : $N2 ? '32' : '64';  }
+    else
+        { $vsizeI = $N1 ? '32'  : $N2 ? '64' : '128'; }
+-]
+
+<CONSTANT_MAPPING>
+
+    addr_zero  : 4x<(32 + 64)*32*2>
+    addr_szLut : 4x<(32 + 64)*32*2 + 4>
+    addr_lut4  : 4x<(32 + 64)*32*2 + 4>
+    addr_lut   : 4x<(32 + 64)*32*2 + 6>
+
+    szShareF   : (64*32)
+    szShareI   : (32*32)
+
+    param_Sum[0]       : c[0x0][0x140]
+    param_Sum[1]       : c[0x0][0x144]
+    param_X[0]         : c[0x0][0x148]
+    param_X[1]         : c[0x0][0x14c]
+    param_O[0]         : c[0x0][0x150]
+    param_O[1]         : c[0x0][0x154]
+    param_I[0]         : c[0x0][0x158]
+    param_I[1]         : c[0x0][0x15c]
+    param_F[0]         : c[0x0][0x160]
+    param_F[1]         : c[0x0][0x164]
+    param_alpha        : c[0x0][0x168]
+    param_beta         : c[0x0][0x16c]
+    param_flags        : c[0x0][0x170]
+    param_C            : c[0x0][0x174]
+    param_D            : c[0x0][0x178]
+    param_H            : c[0x0][0x17c]
+    param_W            : c[0x0][0x180]
+    param_N            : c[0x0][0x184]
+    param_K            : c[0x0][0x188]
+    param_M            : c[0x0][0x18c]
+    param_P            : c[0x0][0x190]
+    param_Q            : c[0x0][0x194]
+    param_str_d        : c[0x0][0x198]
+    param_str_h        : c[0x0][0x19c]
+    param_str_w        : c[0x0][0x1a0]
+    param_pad_d        : c[0x0][0x1a4]
+    param_pad_h        : c[0x0][0x1a8]
+    param_pad_w        : c[0x0][0x1ac]
+    param_dil_d        : c[0x0][0x1b0] 
+    param_dil_h        : c[0x0][0x1b4] 
+    param_dil_w        : c[0x0][0x1b8] 
+    param_DHWN         : c[0x0][0x1bc]
+    param_HWN          : c[0x0][0x1c0]
+    param_WN           : c[0x0][0x1c4]
+    param_MPQN         : c[0x0][0x1c8]
+    param_PQN          : c[0x0][0x1cc]
+    param_QN           : c[0x0][0x1d0]
+    param_PQnk         : c[0x0][0x1d4]
+    param_Qnk          : c[0x0][0x1d8]
+    param_nk           : c[0x0][0x1dc]
+    param_n            : c[0x0][0x1e0]
+    param_k            : c[0x0][0x1e4]
+    param_magic_PQnk   : c[0x0][0x1e8]
+    param_shift_PQnk   : c[0x0][0x1ec]
+    param_magic_Qnk    : c[0x0][0x1f0]
+    param_shift_Qnk    : c[0x0][0x1f4]
+    param_magic_nk     : c[0x0][0x1f8]
+    param_shift_nk     : c[0x0][0x1fc]
+    param_magic_k      : c[0x0][0x200]
+    param_shift_k      : c[0x0][0x204]
+    param_Km32         : c[0x0][0x208]
+    param_K32p         : c[0x0][0x20c]
+    param_TRSK         : c[0x0][0x210]
+    param_TRS          : c[0x0][0x214]
+    param_RS           : c[0x0][0x218]
+    param_S            : c[0x0][0x21c]
+    param_magic_RS     : c[0x0][0x220]
+    param_shift_RS     : c[0x0][0x224]
+    param_magic_S      : c[0x0][0x228]
+    param_shift_S      : c[0x0][0x22c]
+    param_gridP2       : c[0x0][0x230]
+    param_gridQ        : c[0x0][0x234]
+    param_gridN        : c[0x0][0x238]
+    param_gridQN       : c[0x0][0x23c]
+    param_gridPQN      : c[0x0][0x240]
+    param_gridMPQN     : c[0x0][0x244]
+    param_superM       : c[0x0][0x248]
+    param_superP       : c[0x0][0x24c]
+    param_superQ       : c[0x0][0x250]
+    param_superN       : c[0x0][0x254]
+    param_shiftM       : c[0x0][0x258]
+    param_shiftP       : c[0x0][0x25c]
+    param_shiftQ       : c[0x0][0x260]
+    param_shiftN       : c[0x0][0x264]
+    param_SuperM       : c[0x0][0x268]
+    param_SuperP       : c[0x0][0x26c]
+    param_SuperQ       : c[0x0][0x270]
+    param_SuperN       : c[0x0][0x274]
+    param_magic_str_d  : c[0x0][0x278]
+    param_shift_str_d  : c[0x0][0x27c]
+    param_magic_str_h  : c[0x0][0x280]
+    param_shift_str_h  : c[0x0][0x284]
+    param_magic_str_w  : c[0x0][0x288]
+    param_shift_str_w  : c[0x0][0x28c]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+       0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+      64-79 : j0Fy<0-7>, j0Ix<0-7>
+      80-95 : j1Fy<0-7>, j1Ix<0-7>
+
+     96-119 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3>, I0<0-3>, I1<0-3>
+    120-131 : track0F<0-1>,  track1F<0-1>, track2F<0-1>,  track3F<0-1>, track0I<0-1>, track1I<0-1>
+
+      64-83 ~ tidY, m, p, q, negOne, trs, lutStore2, lut_size, warp_count, warp_inc, neg_RS, neg_S, dep_thd_mask, qs, pr, mt, neg_str_w, neg_str_h, neg_str_d
+
+     84-131 ~ idx_MPQnk, idx_PQnk, idx_Qnk, idx_nk, idx_n, idx_k, magic_PQnk, magic_Qnk, neg_PQnk, neg_Qnk, neg_nk, neg_k, div1, div2, div3, idx_P2, idx_Q2, super_m, super_p, super_q, super_n, tid1, tid2, tid3, tid7, tid8, tid31, tid32, readIs2, tidX, k<0|1|2|3>, sb, warp_mask, mask_shr, shiftSB, maskSB, q<1|2|3>
+
+     84-131 ~ rs, t, r, s, z, y, x, x<1|2|3>, z_prime, y_prime, x_prime, x_prime<1|2|3>, z_mod, y_mod, x_mod, x_mod<1|2|3>, lutStore, ballot, warp_slices, dep_thd_bits, dep_thd_cnt, tidY1
+
+[+
+    # Returns the variant-specific register mapping text for the slice
+    # registers: the $N1 layout is tried first, then $N2, then $SN, and the
+    # last string is used when none of the three is set.
+    our ($SN, $N2, $N1);
+    return $N1 ? q{
+        132-135 : slice0I<0-3>
+        168-171 : slice1I<0-3>
+        172-183 : track0I<2-3>, track0I<4-5>, track0I<6-7>, track1I<2-3>, track1I<4-5>, track1I<6-7>
+        184-185 ~ predsI
+
+    } : $N2 ? q{
+        132-135 : slice0I<0-1>, slice1I<0-1>
+        168-171 : track0I<2-3>, track1I<2-3>
+
+    } : $SN ? q{
+        132-135 ~ slice0I, slice1I
+
+    } : q{
+        132-133 : sliceI, sliceF
+        132-133 : sliceIF<0-1>
+        132-135 : sliceI0, sliceF0, sliceI1, sliceF1
+        132-135 : slice0IF<0-1>, slice1IF<0-1>
+    };
++]
+
+    136-151 ~ posCTRS, endCTRS, endCTRS32, lutSize, lutSizeRcp, lutSizeM1, posCTRSf, channel, lutOffset0, lutOffset1, offsetIc0, offsetIc1, offsetFc0, offsetFc1, partial
+    152-167 ~ tid, idx_K, idx_M, idx_P, idx_Q, idx_N, k, n, writeFs, writeIs, readFs, readIs, swapBuf, writeOs, preds, sb_offset
+
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+
+      64-95 ~ o00_<0-3>, o04_<0-3>, o08_<0-3>, o12_<0-3>, b<00|04|08|12>, x<00|04|08|12>, bsum<00|04|08|12>
+     96-131 ~ tid_31, tid_32, alpha, readOs, MPQN16, MPQN4, k<00|04|08|12>, offset, one, M, P, Q, N, super_M, super_P, super_Q, super_N, bsum_offset
+        0-7 : Out00_<0-1>, Out04_<0-1>, Out08_<0-1>, Out12_<0-1>
+       8-15 : Sum00_<0-1>, Sum04_<0-1>, Sum08_<0-1>, Sum12_<0-1>
+      16-31 ~ out<00|04|08|12>, sum<00|04|08|12>
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,       SR_TID.X;
+--:-:2:-:1      S2R idx_MPQnk, SR_CTAID.X;
+--:-:3:-:1      S2R idx_K,     SR_CTAID.Y;
+--:-:4:-:1      S2R idx_N,     SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+// tidX = (tid & 7) << 2
+// tidY = tid >> 3 << 1
+01:-:-:-:1      LOP.AND tid7,  tid,  7;
+--:-:-:-:1      SHL     tidX,  tid7, 2;
+--:-:-:-:1      SHR.U32 tid3,  tid,  3;
+--:-:-:-:1      SHL     tidY,  tid3, 1;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// idx_M = idx_MPQnk / blk_PQnk
+--:-:-:-:1      MOV  magic_PQnk, param_magic_PQnk;
+--:-:-:-:1      ISETP.NE.AND P0, PT,   magic_PQnk, 1, PT;
+02:-:-:-:1  @P0 XMAD     div1, idx_MPQnk,    magic_PQnk,    RZ;
+--:-:-:-:1  @P0 XMAD     div2, idx_MPQnk,    magic_PQnk.H1, RZ;
+--:-:-:-:1  @P0 XMAD     div3, idx_MPQnk.H1, magic_PQnk.H1, RZ;
+--:-:-:-:1  @P0 XMAD.CHI div1, idx_MPQnk.H1, magic_PQnk,    div1;
+--:-:-:-:1  @P0 IADD3.RS idx_M, div1, div2, div3;
+--:-:-:-:1  @P0 SHR.U32  idx_M, idx_M,     param_shift_PQnk;
+--:-:-:-:1 @!P0 SHR.U32  idx_M, idx_MPQnk, param_shift_PQnk;
+
+// idx_PQnk = idx_MPQnk % blk_PQnk
+--:-:-:-:1      IADD neg_PQnk, RZ, -param_PQnk;
+--:-:-:-:1      XMAD.LO2 idx_PQnk, neg_PQnk, idx_M, idx_MPQnk;
+
+// idx_P2 = idx_PQnk / blk_Qnk
+--:-:-:-:1      MOV  magic_Qnk, param_magic_Qnk;
+--:-:-:-:1      ISETP.NE.AND P1, PT,  magic_Qnk, 1, PT;
+--:-:-:-:1  @P1 XMAD     div1, idx_PQnk,    magic_Qnk,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, idx_PQnk,    magic_Qnk.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, idx_PQnk.H1, magic_Qnk.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, idx_PQnk.H1, magic_Qnk,    div1;
+--:-:-:-:1  @P1 IADD3.RS idx_P2, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  idx_P2, idx_P2,   param_shift_Qnk;
+--:-:-:-:1 @!P1 SHR.U32  idx_P2, idx_PQnk, param_shift_Qnk;
+
+// idx_Qnk = idx_PQnk % blk_Qnk
+--:-:-:-:1      IADD neg_Qnk, RZ, -param_Qnk;
+--:-:-:-:1      XMAD.LO2 idx_Qnk, neg_Qnk, idx_P2, idx_PQnk;
+
+// idx_Q2  = idx_Qnk / nk
+--:-:-:-:1      XMAD.LO2C idx_Q2, idx_Qnk, param_magic_nk, RZ;
+--:-:-:-:1      SHR.U32   idx_Q2, idx_Q2,   param_shift_nk;
+// idx_nk = idx_Qnk % nk
+--:-:-:-:1      IADD neg_nk, RZ, -param_nk;
+--:-:-:-:1      XMAD.S16.U16  idx_nk, neg_nk, idx_Q2, idx_Qnk;
+
+// idx_n = idx_nk / k
+--:-:-:-:1      XMAD    idx_n,  idx_nk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32 idx_n,  idx_n,  param_shift_k;
+// idx_k = idx_nk % k
+--:-:-:-:1      IADD neg_k, RZ, -param_k;
+--:-:-:-:1      XMAD.S16.U16 idx_k, neg_k, idx_n, idx_nk;
+
+// idx_N = idx_N * blk_n + idx_n
+// idx_K = idx_K * blk_k + idx_k
+08:-:-:-:1      XMAD idx_N, idx_N, param_n, idx_n;
+04:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+--:-:-:-:1      ISCADD k, idx_K, tidX, 6;
+
+
+// Implement a square wave block id remapping (for all but last row (if odd number of rows))
+// idx_P = idx_P2 * 2
+// idx_Q = idx_Q2
+// if idx_P2 != gridP2:
+//     idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1)
+//     idx_Q  = idx_Q2 >> 1
+--:-:-:-:1      ISETP.NE.AND P1, PT, idx_P2, param_gridP2, PT;
+--:-:-:-:1      SHL idx_P, idx_P2, 1;
+--:-:-:-:1  @P1 LOP.AND q1, idx_Q2, 1;
+--:-:-:-:1  @P1 BFE.U32 q2, idx_Q2, 0x101; // 1 bit at position 1
+--:-:-:-:1  @P1 LOP.XOR q1, q1, q2;
+--:-:-:-:1  @P1 IADD idx_P, idx_P, q1;
+--:-:-:-:1  @P1 SHR.U32 idx_Q, idx_Q2, 1;
+--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2;
+
+// Scan backwards on odd rows
+// if idx_P2 & 1:
+//     idx_Q = gridQ - idx_Q - 1
+--:-:-:-:1      LOP.AND.NZ P0, RZ, idx_P2, 1;
+--:-:-:-:1      MOV negOne, -1;
+--:-:-:-:1  @P0 IADD3 idx_Q, -idx_Q, param_gridQ, negOne;
+
+// writeFs = (tidY*64 + tidX) * 4
+--:-:-:-:1      ISCADD writeFs, tidY, tidX, 6;
+--:-:-:-:1      SHL    writeFs, writeFs, 2;
+
+// writeIs = (tidY*32 + tidX) * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD writeIs, tidY, tidX, 5;
+--:-:-:-:1      ISCADD writeIs, writeIs, 4x<szShareF>, 2;
+
+
+// readIs  = ((tid >> 1) & 3) << 4
+--:-:-:-:1      BFE.U32 readIs, tid, 0x201; // 2 bits at position 1
+
+// readFs = (((tid & 24) >> 2) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,   24;
+--:-:-:-:1      SHR.U32 readFs, readFs, 2;
+--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
+
+// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5)
+// tid32 = tid & -32
+--:-:-:-:1      LOP.AND tid32, tid, -32;
+
+// readIs2 = readIs + (tid32 >> 2) + (readFs << 2)
+--:-:-:-:1      SHR.U32 readIs2, tid32, 2;
+--:-:-:-:1      IADD    readIs2, readIs2, readIs;
+--:-:-:-:1      ISCADD  readIs2, readFs, readIs2, 2;
+
+--:-:-:-:1      SHL readFs,  readFs,  4;
+--:-:-:-:1      SHL readIs,  readIs,  4;
+--:-:-:-:1      SHL readIs2, readIs2, 4;
+
+// writeOs = readFs*32*4 + readIs2
+--:-:-:-:1      ISCADD writeOs, readFs, readIs2, 7;
+
+// Each block of 32 threads works on 8 lines,
+// readFs += tid32/4 * 64 * 4
+// readIs += tid32/4 * 32 * 4 + 4x<szShareF>
+--:-:-:-:1      ISCADD readFs, tid32,  readFs, 6;
+--:-:-:-:1      ISCADD readIs, tid32,  readIs, 5;
+--:-:-:-:1      IADD   readIs, readIs, 4x<szShareF>;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareF + szShareI>;
+
+[+
+    # When $K1 is set, emit compares of k+32 .. k+35 and then k .. k+3
+    # against param_K, capturing each group of four predicates into preds
+    # with P2R (the first group is shifted left by 4 before the second is
+    # captured). When $K1 is not set, nothing is emitted.
+    our $K1;
+    return $K1 ? q{
+--:-:-:-:1      IADD k0, k, 32;
+--:-:-:-:1      IADD k1, k, 33;
+--:-:-:-:1      IADD k2, k, 34;
+--:-:-:-:1      IADD k3, k, 35;
+--:-:-:-:1      ISETP.LT.AND P0, PT, k0, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, param_K, PT;
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+--:-:-:-:1      SHL preds, preds, 4;
+
+--:-:-:-:1      IADD k1, k, 1;
+--:-:-:-:1      IADD k2, k, 2;
+--:-:-:-:1      IADD k3, k, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, k,  param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, param_K, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, param_K, PT;
+--:-:-:-:1      P2R preds, PR, preds, 0x0f;
+    } : '';
++]
+
+[+
+    our ($SN, $N2, $N1);
+    return $N1 ? q{
+--:-:-:-:1      SHL m, idx_M, param_shiftM;
+--:-:-:-:1      SHL p, idx_P, param_shiftP;
+--:-:-:-:1      SHL q, idx_Q, param_shiftQ;
+
+--:-:-:-:1      BFE.U32 super_m, tid7, param_superM;
+--:-:-:-:1      BFE.U32 super_p, tid7, param_superP;
+--:-:-:-:1      BFE.U32 super_q, tid7, param_superQ;
+
+--:-:-:-:1      IADD m, m, super_m;
+--:-:-:-:1      IADD p, p, super_p;
+--:-:-:-:1      ISCADD  q, super_q, q, 2;
+--:-:-:-:1      IADD q1, q, 1;
+--:-:-:-:1      IADD q2, q, 2;
+--:-:-:-:1      IADD q3, q, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, m,  param_M, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, p,  param_P, P4;
+--:-:-:-:1      ISETP.LT.AND P0, PT, q,  param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, q1, param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, q2, param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, q3, param_Q, P4;
+--:-:-:-:1      P2R predsI, PR, RZ, 0x0f;
+
+// warp_count = 16
+// warp_inc = 16
+// trs = tid3
+--:-:-:-:1      MOV warp_count, 16;
+--:-:-:-:1      MOV warp_inc,   16;
+--:-:-:-:1      MOV trs, tid3;
+// compute shared memory super-block offset into the lookup table
+// sb_offset = tid7 * TRS * 4 * 4
+--:-:-:-:1      XMAD sb_offset, tid7, param_TRS, RZ;
+--:-:-:-:1      SHL  sb_offset, sb_offset, 4;
+
+    } : $N2 ? q{
+
+--:-:-:-:1      SHL m, idx_M, param_shiftM;
+--:-:-:-:1      SHL p, idx_P, param_shiftP;
+--:-:-:-:1      SHL q, idx_Q, param_shiftQ;
+
+--:-:-:-:1      BFE.U32 super_m, tid7, param_superM;
+--:-:-:-:1      BFE.U32 super_p, tid7, param_superP;
+--:-:-:-:1      BFE.U32 super_q, tid7, param_superQ;
+
+--:-:-:-:1      IADD m, m, super_m;
+--:-:-:-:1      IADD p, p, super_p;
+--:-:-:-:1      ISCADD  q, super_q, q, 1;
+--:-:-:-:1      IADD q1, q, 1;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, m,  param_M, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, p,  param_P, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, q,  param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, q1, param_Q, P4;
+
+// warp_count = 16
+// warp_inc = 16
+// trs = tid3
+--:-:-:-:1      MOV warp_count, 16;
+--:-:-:-:1      MOV warp_inc,   16;
+--:-:-:-:1      MOV trs, tid3;
+// compute shared memory super-block offset into the lookup table
+// sb_offset = tid7 * TRS * 4 * 2
+--:-:-:-:1      XMAD sb_offset, tid7, param_TRS, RZ;
+--:-:-:-:1      SHL  sb_offset, sb_offset, 3;
+
+    } : $SN ? q{
+--:-:-:-:1      SHL m, idx_M, param_shiftM;
+--:-:-:-:1      SHL p, idx_P, param_shiftP;
+--:-:-:-:1      SHL q, idx_Q, param_shiftQ;
+--:-:-:-:1      SHL n, idx_N, param_shiftN;
+
+--:-:-:-:1      BFE.U32 super_m, tid7, param_superM;
+--:-:-:-:1      BFE.U32 super_p, tid7, param_superP;
+--:-:-:-:1      BFE.U32 super_q, tid7, param_superQ;
+--:-:-:-:1      LOP.AND super_n, tid7, param_superN;
+
+--:-:-:-:1      IADD m, m, super_m;
+--:-:-:-:1      IADD p, p, super_p;
+--:-:-:-:1      IADD q, q, super_q;
+--:-:-:-:1      ISCADD  n, super_n, n, 2;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, m, param_M, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, p, param_P, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, q, param_Q, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, n, param_N, P0;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P2;
+
+// sb = tid7 >> (shiftN - 2): 0-1,0-3,0-7
+--:-:-:-:1      MOV  shiftSB, param_shiftN;
+--:-:-:-:1      IADD shiftSB, shiftSB, -2;
+--:-:-:-:1      SHR.U32 sb, tid7, shiftSB;
+// warp_count = 4 << shiftN:  64,32,16
+--:-:-:-:1      MOV warp_count, 4;
+--:-:-:-:1      SHL warp_count, warp_count, param_shiftN;
+--:-:-:-:1      MOV warp_inc,   warp_count;
+// maskSB = (1 << shiftSB) - 1: 3,1,0
+--:-:-:-:1      MOV  maskSB, 1;
+--:-:-:-:1      SHL  maskSB, maskSB, shiftSB;
+--:-:-:-:1      IADD maskSB, maskSB, -1;
+// trs = tid3 << shiftSB + (tid7 & mask)
+--:-:-:-:1      LOP.AND maskSB, tid7, maskSB;
+--:-:-:-:1      SHL  trs, tid3, shiftSB;
+--:-:-:-:1      IADD trs, trs,  maskSB;
+// compute shared memory super-block offset into the lookup table
+// sb_offset = sb * TRS * 4
+--:-:-:-:1      XMAD sb_offset, sb, param_TRS, RZ;
+--:-:-:-:1      SHL  sb_offset, sb_offset, 2;
+
+    } : q{
+--:-:-:-:1      SHL n, idx_N, 5;
+--:-:-:-:1      ISCADD n, tid7, n, 2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+
+--:-:-:-:1      MOV trs,        tid;
+--:-:-:-:1      MOV lutStore2,  RZ;
+--:-:-:-:1      MOV lut_size,   RZ;
+--:-:-:-:1      MOV warp_count, 32;
+--:-:-:-:1      MOV warp_inc,   32;
+
+--:-:-:-:1      IADD    mask_shr, -tid, 32;
+--:-:-:-:1      SHR.U32 dep_thd_mask, negOne, mask_shr;
+
+--:-:-:-:1      ISETP.GE.AND P6, PT, tid, 32, PT;
+
+    };
++]
+--:-:-:-:1      IADD neg_RS, RZ, -param_RS;
+--:-:-:-:1      IADD neg_S,  RZ, -param_S;
+
+[+
+    # Emits the mt/pr/qs setup. In the $LN variant the sources are the
+    # idx_M/idx_P/idx_Q registers; otherwise they are m/p/q.
+    # $prop eq 'f' emits the strided form (source * stride - pad); any other
+    # value emits the pad-only form and also loads the negated strides.
+    our ($LN, $prop);
+    my ($m, $p, $q) = $LN ? qw(idx_M idx_P idx_Q) : qw(m p q);
+    return $prop eq 'f' ? qq{
+// mt = m * str_d - pad_d
+// pr = p * str_h - pad_h
+// qs = q * str_w - pad_w
+--:-:-:-:1      XMAD mt, $m,  param_str_d, RZ;
+--:-:-:-:1      XMAD pr, $p,  param_str_h, RZ;
+--:-:-:-:1      XMAD qs, $q,  param_str_w, RZ;
+--:-:-:-:1      IADD mt, mt, -param_pad_d;
+--:-:-:-:1      IADD pr, pr, -param_pad_h;
+--:-:-:-:1      IADD qs, qs, -param_pad_w;
+    } : qq{
+// mt = m - pad_d
+// pr = p - pad_h
+// qs = q - pad_w
+--:-:-:-:1      IADD mt, $m, -param_pad_d;
+--:-:-:-:1      IADD pr, $p, -param_pad_h;
+--:-:-:-:1      IADD qs, $q, -param_pad_w;
+
+--:-:-:-:1      IADD neg_str_d, RZ, -param_str_d;
+--:-:-:-:1      IADD neg_str_h, RZ, -param_str_h;
+--:-:-:-:1      IADD neg_str_w, RZ, -param_str_w;
+    };
++]
+</SCHEDULE_BLOCK>
+
+[+
+    # In the $LN variant only, emit a branch to END_SETUP predicated on P6;
+    # the other variants emit nothing here.
+    our $LN; return $LN ? q{
+--:-:-:-:5  @P6 BRA.U END_SETUP;
+    } : '';
++]
+
+LUT_LOOP:
+
+<SCHEDULE_BLOCK>
+// warp synchronous loop while warp_count < TRS
+--:-:-:-:1      ISETP.LT.AND P6, PT, warp_count, param_TRS, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, trs, param_TRS, PT;
+
+--:-:-:-:1      IADD warp_count, warp_count, warp_inc;
+// t =  trs / RS
+// rs = trs % RS
+--:-:-:-:1      XMAD.U16.U16 t, trs, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32      t,   t, param_shift_RS;
+--:-:-:-:1      XMAD.U16.S16 rs,  t, neg_RS, trs;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.U16.U16 r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32      r,  r, param_shift_S;
+--:-:-:-:1      XMAD.U16.S16 s,  r, neg_S, rs;
+
+[+
+    our ($SN, $N2, $N1, $prop);
+    if ($prop eq 'f')
+    {
+        return $N1 ? q{
+// x = qs + (s * dil_w)
+// y = pr + (r * dil_h)
+// z = mt + (t * dil_d)
+--:-:-:-:1      XMAD z,  t,  param_dil_d, mt;
+--:-:-:-:1      XMAD y,  r,  param_dil_h, pr;
+--:-:-:-:1      XMAD x,  s,  param_dil_w, qs;
+--:-:-:-:1      IADD x1, x,  param_str_w;
+--:-:-:-:1      IADD x2, x1, param_str_w;
+--:-:-:-:1      IADD x3, x2, param_str_w;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y, RZ, PT;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P5;
+--:-:-:-:1  @P4 R2P PR, predsI, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,    0x0f;
+--:-:-:-:1      ISETP.GE.AND  P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND  P3, PT, x3, RZ, P3;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, x,  param_W, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, x1, param_W, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x2, param_W, P2;
+--:-:-:-:1      ISETP.LT.AND  P3, PT, x3, param_W, P3;
+
+// sliceI = z*HWN + y*WN + x
+01:-:-:-:1      XMAD.LO2C slice0I0, z, param_HWN, x;
+--:-:-:-:1      XMAD.LO2C slice0I0, y, param_WN,  slice0I0;
+--:-:-:-:1      IADD slice0I1, slice0I0, param_str_w;
+--:-:-:-:1      IADD slice0I2, slice0I1, param_str_w;
+--:-:-:-:1      IADD slice0I3, slice0I2, param_str_w;
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I0, -1;
+--:-:-:-:1 @!P1 MOV slice0I1, -1;
+--:-:-:-:1 @!P2 MOV slice0I2, -1;
+--:-:-:-:1 @!P3 MOV slice0I3, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 4;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+--:1:-:-:1  @P5 STS.128 [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : $N2 ? q{
+
+--:-:-:-:1      XMAD z,  t, param_dil_d, mt;
+--:-:-:-:1      XMAD y,  r, param_dil_h, pr;
+--:-:-:-:1      XMAD x,  s, param_dil_w, qs;
+--:-:-:-:1      IADD x1, x, param_str_w;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y, RZ, P5;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P3;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, x1, RZ, P4;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, x,  param_W, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, x1, param_W, P1;
+
+// sliceI = z*HWN + y*WN + x*2
+01:-:-:-:1      XMAD.LO2C slice0I0, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C slice0I0, y, param_WN,  slice0I0;
+--:-:-:-:1      ISCADD slice0I1, x1, slice0I0, 1;
+--:-:-:-:1      ISCADD slice0I0, x,  slice0I0, 1;
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I0, -1;
+--:-:-:-:1 @!P1 MOV slice0I1, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 3;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+--:1:-:-:1  @P5 STS.64 [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : $SN ? q{
+
+--:-:-:-:1      XMAD z, t, param_dil_d, mt;
+--:-:-:-:1      XMAD y, r, param_dil_h, pr;
+--:-:-:-:1      XMAD x, s, param_dil_w, qs;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x, RZ, P5;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x, param_W, P2;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+// sliceI = z*HWN + y*WN + x*N
+01:-:-:-:1      XMAD.LO2C slice0I, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C slice0I, y, param_WN,  slice0I;
+--:-:-:-:1      XMAD      slice0I, x, param_N,   slice0I;
+
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 2;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+
+--:1:-:-:1  @P5 STS [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : q{
+
+--:-:-:-:1      XMAD z, t, param_dil_d, mt;
+--:-:-:-:1      XMAD y, r, param_dil_h, pr;
+--:-:-:-:1      XMAD x, s, param_dil_w, qs;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x, RZ, P5;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x, param_W, P2;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+<ORDERED>
+// sliceI = z*HWN + y*WN + x*N
+01:-:-:-:1      XMAD.LO2C sliceI, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y, param_WN,  sliceI;
+--:-:-:-:1      XMAD      sliceI, x, param_N,   sliceI;
+// sliceF = trs * K
+--:-:-:-:1      XMAD sliceF, trs, param_K, RZ;
+</ORDERED>
+
+<ORDERED>
+// Get a mask of all valid slices in the warp
+--:-:-:-:1      VOTE.ANY ballot, PT, P0;
+// Count the total valid slices
+--:-:2:-:1      POPC warp_slices, ballot;
+// Prepare lutStore for this and next loop
+--:-:-:-:1  @P0 MOV    lutStore, lutStore2;
+02:-:-:-:1      ISCADD lutStore2, warp_slices, lutStore2, 3;
+// Count all the valid slices below this threadid
+--:-:-:-:1  @P0 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
+--:-:3:-:1  @P0 POPC dep_thd_cnt, dep_thd_bits;
+// use the trs increment to space the barrier sync
+--:-:-:-:1      IADD trs, trs, warp_inc;
+// Update the lutStore address from this count
+04:-:-:-:1  @P0 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
+// Store both slice offsets in the lut
+--:1:-:-:1  @P0 STS.64 [lutStore + addr_lut], sliceIF;
+</ORDERED>
+// Keep track of the total size of the lut
+--:-:-:-:1      IADD lut_size, lut_size, warp_slices;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+// Share the lut size with the other warp
+--:1:-:-:2      STS [addr_szLut], lut_size;
+        };
+    }
+    else  # bprop
+    {
+        return $N1 ? q{
+
+// x_prime = qs + (s * dil_w)
+// y_prime = pr + (r * dil_h)
+// z_prime = mt + (t * dil_d)
+--:-:-:-:1      XMAD z_prime, t, param_dil_d, mt;
+--:-:-:-:1      XMAD y_prime, r, param_dil_h, pr;
+--:-:-:-:1      XMAD x_prime, s, param_dil_w, qs;
+--:-:-:-:1      IADD3 x_prime1, qs, 1, s;
+--:-:-:-:1      IADD3 x_prime2, qs, 2, s;
+--:-:-:-:1      IADD3 x_prime3, qs, 3, s;
+
+// z     = z_prime / str_d
+// z_mod = z_prime % str_d
+--:-:-:-:1      XMAD    z, z_prime, param_magic_str_d, RZ;
+--:-:-:-:1      SHR.U32 z, z,       param_shift_str_d;
+--:-:-:-:1      XMAD.U16.S16 z_mod, z, neg_str_d, z_prime;
+// y     = y_prime / str_h
+// y_mod = y_prime % str_h
+--:-:-:-:1      XMAD    y, y_prime, param_magic_str_h, RZ;
+--:-:-:-:1      SHR.U32 y, y,       param_shift_str_h;
+--:-:-:-:1      XMAD.U16.S16 y_mod, y, neg_str_h, y_prime;
+// x     = x_prime / str_w
+// x_mod = x_prime % str_w
+--:-:-:-:1      XMAD    x, x_prime, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x, x,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod, x, neg_str_w, x_prime;
+
+--:-:-:-:1      XMAD    x1, x_prime1, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x1, x1,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod1, x1, neg_str_w, x_prime1;
+
+--:-:-:-:1      XMAD    x2, x_prime2, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x2, x2,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod2, x2, neg_str_w, x_prime2;
+
+--:-:-:-:1      XMAD    x3, x_prime3, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x3, x3,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod3, x3, neg_str_w, x_prime3;
+
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z_prime, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y_prime, RZ, PT;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, z_mod, RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, y_mod, RZ, P1;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P5;
+--:-:-:-:1  @P4 R2P PR, predsI, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,     0x0f;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, x_prime,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, x_prime1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x_prime2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND  P3, PT, x_prime3, RZ, P3;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, x,  param_W, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, x1, param_W, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x2, param_W, P2;
+--:-:-:-:1      ISETP.LT.AND  P3, PT, x3, param_W, P3;
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, x_mod,  RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, x_mod1, RZ, P1;
+--:-:-:-:1      ISETP.EQ.AND  P2, PT, x_mod2, RZ, P2;
+--:-:-:-:1      ISETP.EQ.AND  P3, PT, x_mod3, RZ, P3;
+
+// sliceI = z*HWN + y*WN + x
+01:-:-:-:1      XMAD.LO2C slice0I0, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C slice0I0, y, param_WN,  slice0I0;
+--:-:-:-:1      IADD slice0I1, slice0I0, x1;
+--:-:-:-:1      IADD slice0I2, slice0I0, x2;
+--:-:-:-:1      IADD slice0I3, slice0I0, x3;
+--:-:-:-:1      IADD slice0I0, slice0I0, x;
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I0, -1;
+--:-:-:-:1 @!P1 MOV slice0I1, -1;
+--:-:-:-:1 @!P2 MOV slice0I2, -1;
+--:-:-:-:1 @!P3 MOV slice0I3, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 4;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+--:1:-:-:1  @P5 STS.128 [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : $N2 ? q{
+
+// x_prime = qs + (s * dil_w)
+// y_prime = pr + (r * dil_h)
+// z_prime = mt + (t * dil_d)
+--:-:-:-:1      XMAD  z_prime, t, param_dil_d, mt;
+--:-:-:-:1      XMAD  y_prime, r, param_dil_h, pr;
+--:-:-:-:1      XMAD  x_prime, s, param_dil_w, qs;
+--:-:-:-:1      IADD3 x_prime1, qs, 1, s;
+--:-:-:-:1      IADD3 x_prime2, qs, 2, s;
+--:-:-:-:1      IADD3 x_prime3, qs, 3, s;
+
+// z     = z_prime / str_d
+// z_mod = z_prime % str_d
+--:-:-:-:1      XMAD    z, z_prime, param_magic_str_d, RZ;
+--:-:-:-:1      SHR.U32 z, z,       param_shift_str_d;
+--:-:-:-:1      XMAD.U16.S16 z_mod, z, neg_str_d, z_prime;
+// y     = y_prime / str_h
+// y_mod = y_prime % str_h
+--:-:-:-:1      XMAD    y, y_prime, param_magic_str_h, RZ;
+--:-:-:-:1      SHR.U32 y, y,       param_shift_str_h;
+--:-:-:-:1      XMAD.U16.S16 y_mod, y, neg_str_h, y_prime;
+// x     = x_prime / str_w
+// x_mod = x_prime % str_w
+--:-:-:-:1      XMAD    x, x_prime, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x, x,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod, x, neg_str_w, x_prime;
+
+--:-:-:-:1      XMAD    x1, x_prime1, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x1, x1,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_mod1, x1, neg_str_w, x_prime1;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z_prime, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y_prime, RZ, PT;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, z_mod, RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, y_mod, RZ, P1;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P3;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, x_prime,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, x_prime1, RZ, P4;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, x,  param_W, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, x1, param_W, P1;
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, x_mod,  RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, x_mod1, RZ, P1;
+
+// sliceI = z*HWN + y*WN + x*2
+01:-:-:-:1      XMAD.LO2C slice0I0, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C slice0I0, y, param_WN,  slice0I0;
+--:-:-:-:1      ISCADD slice0I1, x1, slice0I0, 1;
+--:-:-:-:1      ISCADD slice0I0, x,  slice0I0, 1;
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I0, -1;
+--:-:-:-:1 @!P1 MOV slice0I1, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 3;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+--:1:-:-:1  @P5 STS.64 [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : $SN ? q{
+// x_prime = qs + s
+// y_prime = pr + r
+// z_prime = mt + t
+--:-:-:-:1      XMAD z_prime, t, param_dil_d, mt;
+--:-:-:-:1      XMAD y_prime, r, param_dil_h, pr;
+--:-:-:-:1      XMAD x_prime, s, param_dil_w, qs;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z_prime, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y_prime, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x_prime, RZ, P5;
+
+// z       = z_prime / str_d
+// z_prime = z_prime % str_d
+--:-:-:-:1      XMAD    z, z_prime, param_magic_str_d, RZ;
+--:-:-:-:1      SHR.U32 z, z,       param_shift_str_d;
+--:-:-:-:1      XMAD.U16.S16 z_prime, z, neg_str_d, z_prime;
+// y       = y_prime / str_h
+// y_prime = y_prime % str_h
+--:-:-:-:1      XMAD    y, y_prime, param_magic_str_h, RZ;
+--:-:-:-:1      SHR.U32 y, y,       param_shift_str_h;
+--:-:-:-:1      XMAD.U16.S16 y_prime, y, neg_str_h, y_prime;
+// x       = x_prime / str_w
+// x_prime = x_prime % str_w
+--:-:-:-:1      XMAD    x, x_prime, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x, x,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_prime, x, neg_str_w, x_prime;
+
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, z_prime, RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, y_prime, RZ, P1;
+--:-:-:-:1      ISETP.EQ.AND  P2, PT, x_prime, RZ, P2;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x, param_W, P2;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+// sliceI = z*HWN + y*WN + x*N
+01:-:-:-:1      XMAD.LO2C slice0I, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C slice0I, y, param_WN,  slice0I;
+--:-:-:-:1      XMAD      slice0I, x, param_N,   slice0I;
+
+<ORDERED>
+--:-:-:-:1 @!P0 MOV slice0I, -1;
+--:-:-:-:1      ISCADD lutStore, trs, sb_offset, 2;
+--:-:-:-:1      IADD trs, trs, warp_inc;
+</ORDERED>
+
+--:1:-:-:1  @P5 STS [lutStore + addr_lut4], slice0I;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+        } : q{
+// x_prime = qs + s
+// y_prime = pr + r
+// z_prime = mt + t
+--:-:-:-:1      XMAD z_prime, t, param_dil_d, mt;
+--:-:-:-:1      XMAD y_prime, r, param_dil_h, pr;
+--:-:-:-:1      XMAD x_prime, s, param_dil_w, qs;
+
+--:-:-:-:1      ISETP.GE.AND  P0, PT, z_prime, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P1, PT, y_prime, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND  P2, PT, x_prime, RZ, P5;
+
+// z       = z_prime / str_d
+// z_prime = z_prime % str_d
+--:-:-:-:1      XMAD    z, z_prime, param_magic_str_d, RZ;
+--:-:-:-:1      SHR.U32 z, z,       param_shift_str_d;
+--:-:-:-:1      XMAD.U16.S16 z_prime, z, neg_str_d, z_prime;
+// y       = y_prime / str_h
+// y_prime = y_prime % str_h
+--:-:-:-:1      XMAD    y, y_prime, param_magic_str_h, RZ;
+--:-:-:-:1      SHR.U32 y, y,       param_shift_str_h;
+--:-:-:-:1      XMAD.U16.S16 y_prime, y, neg_str_h, y_prime;
+// x       = x_prime / str_w
+// x_prime = x_prime % str_w
+--:-:-:-:1      XMAD    x, x_prime, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x, x,       param_shift_str_w;
+--:-:-:-:1      XMAD.U16.S16 x_prime, x, neg_str_w, x_prime;
+
+--:-:-:-:1      ISETP.EQ.AND  P0, PT, z_prime, RZ, P0;
+--:-:-:-:1      ISETP.EQ.AND  P1, PT, y_prime, RZ, P1;
+--:-:-:-:1      ISETP.EQ.AND  P2, PT, x_prime, RZ, P2;
+--:-:-:-:1      ISETP.LT.AND  P0, PT, z, param_D, P0;
+--:-:-:-:1      ISETP.LT.AND  P1, PT, y, param_H, P1;
+--:-:-:-:1      ISETP.LT.AND  P2, PT, x, param_W, P2;
+--:-:-:-:1      PSETP.AND.AND P0, PT, P0, P1, P2;
+
+<ORDERED>
+// sliceI = z*HWN + y*WN + x*N
+01:-:-:-:1      XMAD.LO2C sliceI, z, param_HWN, RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y, param_WN,  sliceI;
+--:-:-:-:1      XMAD      sliceI, x, param_N,   sliceI;
+// sliceF = trs * K
+--:-:-:-:1      XMAD sliceF, trs, param_K, RZ;
+</ORDERED>
+
+<ORDERED>
+// Get a mask of all valid slices in the warp
+--:-:-:-:1      VOTE.ANY ballot, PT, P0;
+// Count the total valid slices
+--:-:2:-:1      POPC warp_slices, ballot;
+// Prepare lutStore for this and next loop
+--:-:-:-:1  @P0 MOV    lutStore, lutStore2;
+02:-:-:-:1      ISCADD lutStore2, warp_slices, lutStore2, 3;
+// Count all the valid slices below this threadid
+--:-:-:-:1  @P0 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
+--:-:3:-:1  @P0 POPC dep_thd_cnt, dep_thd_bits;
+// use the trs increment to space the barrier sync
+--:-:-:-:1      IADD trs, trs, warp_inc;
+// Update the lutStore address from this count
+04:-:-:-:1  @P0 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
+// Store both slice offsets in the lut
+--:1:-:-:1  @P0 STS.64 [lutStore + addr_lut], sliceIF;
+</ORDERED>
+// Keep track of the total size of the lut
+--:-:-:-:1      IADD lut_size, lut_size, warp_slices;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P6 BRA.U LUT_LOOP;
+
+// Share the lut size with the other warp
+--:1:-:-:2      STS [addr_szLut], lut_size;
+        };
+    }
++]
+
+END_SETUP:
+
+01:-:-:-:5      BAR.SYNC 0;
+
+// Grab the calculated lut size and get its reciprocal (lutSizeRcp = 1 / lutSize)
+// Get the total reduction depth (endCTRS = lutSize * C)
+[+
+    our $LN; return $LN ? q{
+--:-:1:-:2      LDS lutSize, [addr_szLut]; // LN build: read back the lut size that the setup code stored at addr_szLut
+    } : q{
+--:-:-:-:6      MOV lutSize, param_TRS; // other builds: the lut size is simply param_TRS
+    };
++]
+01:-:-:-:0      XMAD endCTRS, lutSize, param_C, RZ; // endCTRS = lutSize * C
+--:-:1:-:2      I2F.F32.S32 lutSizeRcp, lutSize; // float(lutSize) ...
+--:-:-:-:0      IADD lutSizeM1, lutSize, -1; // lutSizeM1 = lutSize - 1
+01:-:1:-:1      MUFU.RCP lutSizeRcp, lutSizeRcp; // ... then its reciprocal
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD endCTRS32, endCTRS, 32; // endCTRS32 = endCTRS + 32
+// posCTRS = tidY (disabled; posCTRS is set to tidY + partial further down)
+//--:-:-:-:1      MOV posCTRS, tidY;
+// If this value is not a multiple of 32 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 32 then make a full 32 line fetch.
+--:-:-:-:1      LOP.AND.Z P5, partial, endCTRS, 31; // partial = endCTRS & 31, P5 = (partial == 0)
+--:-:-:-:1  @P5 MOV partial, 32;
+// channel = posCTRS / lutSize
+// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it
+--:-:2:-:1      I2F.F32.S32 posCTRSf, tidY;
+03:-:-:-:1      FMUL channel, posCTRSf, lutSizeRcp;
+--:-:-:-:1      FFMA channel, channel, 5.9604644775390625e-08, channel; // channel += channel * 2^-24
+--:-:2:-:1      F2I.S32.F32.TRUNC channel, channel;
+// lutOffset = (posCTRS % lutSize) * 8  (the modulo is done here as tidY - channel * lutSize; the scaling happens below)
+02:-:-:-:1      VMAD.U16.U16 lutOffset0, -channel, lutSize, tidY;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT; // P0 = lutOffset0 < lutSize - 1: the following lut entry is in the same channel
+
+// posCTRS = tidY + partial
+--:-:-:-:1      IADD posCTRS, tidY, partial;
+--:-:-:-:1      IADD tidY1, tidY, 1; // tidY1 = tidY + 1, the second row handled by this thread
+[+
+    our ($N1, $N2, $LN, $dshift, $slice_scale, $slice_offset, $slice_load);
+    return $LN ? q{
+// P5 = tidY < partial && lutSize != 0   (P6 is the same test for tidY1)
+--:-:-:-:1      LOP.AND.NZ P6, RZ, lutSize, -1; // P6 = (lutSize != 0)
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidY,  partial, P6;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidY1, partial, P6;
+
+--:-:-:-:1      SHL lutOffset0, lutOffset0, 3; // lut entries are read with LDS.64 below, so 8 bytes each
+
+// offsetFc0 = channel * TRSK
+// offsetIc0 = channel * DHWN
+--:-:-:-:1      XMAD.LO2C offsetIc0, channel, param_DHWN, RZ;
+--:-:-:-:1      XMAD      offsetFc0, channel, param_TRSK, RZ;
+
+--:-:-:-:1  @P0 IADD lutOffset1, lutOffset0, 8; // P0: the second row uses the next lut entry of the same channel
+--:-:-:-:1  @P0 MOV  offsetFc1, offsetFc0;
+--:-:-:-:1  @P0 MOV  offsetIc1, offsetIc0;
+--:-:-:-:1 @!P0 MOV  lutOffset1, RZ; // !P0: the second row wraps to lut entry 0 of the next channel
+--:-:-:-:1 @!P0 IADD offsetFc1, offsetFc0, param_TRSK;
+--:-:-:-:1 @!P0 IADD offsetIc1, offsetIc0, param_DHWN;
+
+--:-:5:-:1  @P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut];
+--:-:6:-:1  @P6 LDS.U.64 slice1IF, [lutOffset1 + addr_lut];
+    } : qq{
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidY,  partial, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidY1, partial, PT;
+
+--:-:-:-:1      XMAD offsetFc0, tidY,  param_K, RZ;
+--:-:-:-:1      XMAD offsetFc1, tidY1, param_K, RZ;
+
+--:-:-:-:1      XMAD partial, partial,  param_K, RZ; // partial = partial * K ...
+--:-:-:-:1      SHL partial, partial, $dshift; // ... shifted left by dshift; the next fetch adds it to the filter pointers
+
+--:-:-:-:1      ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale;
+--:-:-:-:1      XMAD.LO2C offsetIc0, channel, param_DHWN, RZ;
+
+--:-:-:-:1  \@P0 IADD lutOffset1, lutOffset0, $slice_offset;
+--:-:-:-:1  \@P0 MOV  offsetIc1, offsetIc0;
+--:-:-:-:1 \@!P0 MOV  lutOffset1, sb_offset; // !P0: wrap back to the first entry (sb_offset) and move to the next channel
+--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;
+
+--:-:5:-:1  \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4];
+--:-:6:-:1  \@P6 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4];
+    };
++]
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    our ($LN, $dshift);
+    return $LN ? qq{
+10:-:-:-:1      IADD3 offsetFc0, offsetFc0, sliceF0, k; // LN build: filter offset = channel offset + lut slice offset + k
+--:-:-:-:1      LEA      track0F0.CC, offsetFc0, param_F[0],     $dshift; // low word of the filter pointer, offset scaled by dshift
+--:-:-:-:1      LEA.HI.X track0F1,    offsetFc0, param_F[1], RZ, $dshift; // high word, consuming the carry
+
+20:-:-:-:1      IADD3 offsetFc1, offsetFc1, sliceF1, k;
+--:-:-:-:1      LEA      track1F0.CC, offsetFc1, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1F1,    offsetFc1, param_F[1], RZ, $dshift;
+    } : qq{
+--:-:-:-:1      IADD  offsetFc0, offsetFc0, k; // no lut slice offset in this build: just add k
+--:-:-:-:1      IADD  offsetFc1, offsetFc1, k;
+--:-:-:-:1      LEA      track0F0.CC, offsetFc0, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0F1,    offsetFc0, param_F[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1F0.CC, offsetFc1, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1F1,    offsetFc1, param_F[1], RZ, $dshift;
+    };
++]
+[+
+    our ($K1, $dtype, $vsize, $dsize);
+    return $K1 ? qq{
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f; // K1 build: P0-P3 come from the low four bits of preds when P5 is set ...
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f; // ... and are cleared otherwise
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds; // funnel shift with preds as both halves: rotate right by 4 for the next group
+--:-:-:-:1 \@!P0 MOV F00, RZ; // zero the elements that will not be loaded
+--:-:-:-:1 \@!P1 MOV F01, RZ;
+--:-:-:-:1 \@!P2 MOV F02, RZ;
+--:-:-:-:1 \@!P3 MOV F03, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>];
+--:-:1:-:1  \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>];
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;
+--:-:-:-:1      SHF.L.U64 preds, preds, 4, preds; // rotate left by 4: undoes the rotate above
+--:-:-:-:1 \@!P0 MOV F10, RZ;
+--:-:-:-:1 \@!P1 MOV F11, RZ;
+--:-:-:-:1 \@!P2 MOV F12, RZ;
+--:-:-:-:1 \@!P3 MOV F13, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>];
+--:-:2:-:1  \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>];
+
+--:-:-:-:1  \@P6 R2P PR, preds, 0x0f; // same two groups again for the second row, gated by P6 and read through track1F
+--:-:-:-:1 \@!P6 R2P PR,    RZ, 0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+--:-:-:-:1 \@!P0 MOV F20, RZ;
+--:-:-:-:1 \@!P1 MOV F21, RZ;
+--:-:-:-:1 \@!P2 MOV F22, RZ;
+--:-:-:-:1 \@!P3 MOV F23, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>];
+--:-:3:-:1  \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>];
+
+--:-:-:-:1  \@P6 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P6 R2P PR,    RZ, 0x0f;
+--:-:-:-:1      SHF.L.U64 preds, preds, 4, preds;
+--:-:-:-:1 \@!P0 MOV F30, RZ;
+--:-:-:-:1 \@!P1 MOV F31, RZ;
+--:-:-:-:1 \@!P2 MOV F32, RZ;
+--:-:-:-:1 \@!P3 MOV F33, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>];
+--:-:4:-:1  \@P3 LDG.E.CI.$dtype F33, [track1F + ${dsize}x<35>];
+    } : qq{
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, k, param_K,    P5; // vector build: P0/P1 bound-check k for the first row (gated by P5)
+--:-:-:-:1      ISETP.LT.AND P1, PT, k, param_Km32, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k, param_K,    P6; // P2/P3: same checks for the second row (gated by P6)
+--:-:-:-:1      ISETP.LT.AND P3, PT, k, param_Km32, P6;
+
+<ORDERED>
+--:-:1:-:1  \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>];
+--:-:2:-:1  \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>];
+--:-:3:-:1  \@P2 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>];
+--:-:4:-:1  \@P3 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>];
+
+--:-:-:-:1 \@!P0 LDS.U.$vsize F0, [addr_zero]; // masked-off vectors are filled from addr_zero instead (presumably a zeroed shared-memory slot; confirm)
+--:-:-:-:1 \@!P1 LDS.U.$vsize F1, [addr_zero];
+--:-:-:-:1 \@!P2 LDS.U.$vsize F2, [addr_zero];
+--:-:1:-:1 \@!P3 LDS.U.$vsize F3, [addr_zero];
+</ORDERED>
+    };
++]
+</SCHEDULE_BLOCK>
+<SCHEDULE_BLOCK>
+[+
+    our ($N1, $N2, $SN, $dshift, $vsizeI);
+    return $N1 ? qq{
+10:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5; // N1 build: P0-P3 = (slice offset >= 0) and P5, one per slice of the first row
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P2, PT, slice0I2, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice0I3, RZ, P5;
+--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0; // add the channel offset to every slice offset
+--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;
+--:-:-:-:1      IADD slice0I2, slice0I2, offsetIc0;
+--:-:-:-:1      IADD slice0I3, slice0I3, offsetIc0;
+--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift; // one image pointer (register pair) per slice
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I4.CC, slice0I2,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I5,    slice0I2,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I6.CC, slice0I3,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I7,    slice0I3,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];
+--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I01, [track0I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I02, [track0I4];
+--:-:5:-:1  \@P3 LDG.E.CI.$vsizeI I03, [track0I6];
+</ORDERED>
+--:-:-:-:1 \@!P0 MOV I00, RZ; // zero the elements that were not loaded
+--:-:-:-:1 \@!P1 MOV I01, RZ;
+--:-:-:-:1 \@!P2 MOV I02, RZ;
+--:-:-:-:1 \@!P3 MOV I03, RZ;
+
+20:-:-:-:1      ISETP.GE.AND P0, PT, slice1I0, RZ, P6; // same sequence for the second row, gated by P6
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice1I1, RZ, P6;
+--:-:-:-:1      ISETP.GE.AND P2, PT, slice1I2, RZ, P6;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice1I3, RZ, P6;
+--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;
+--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;
+--:-:-:-:1      IADD slice1I2, slice1I2, offsetIc1;
+--:-:-:-:1      IADD slice1I3, slice1I3, offsetIc1;
+--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I4.CC, slice1I2,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I5,    slice1I2,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I6.CC, slice1I3,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I7,    slice1I3,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I10, [track1I0];
+--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I11, [track1I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I12, [track1I4];
+--:-:6:-:1  \@P3 LDG.E.CI.$vsizeI I13, [track1I6];
+</ORDERED>
+--:-:-:-:1 \@!P0 MOV I10, RZ;
+--:-:-:-:1 \@!P1 MOV I11, RZ;
+--:-:-:-:1 \@!P2 MOV I12, RZ;
+--:-:-:-:1 \@!P3 MOV I13, RZ;
+
+    } : $N2 ? qq{
+
+10:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5; // N2 build: two slices per row; the lut setup stores -1 for out-of-bounds slices
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;
+20:-:-:-:1      ISETP.GE.AND P2, PT, slice1I0, RZ, P6;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice1I1, RZ, P6;
+--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0;
+--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;
+--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;
+--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;
+--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];
+--:-:5:-:1  \@P1 LDG.E.CI.$vsizeI I02, [track0I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I10, [track1I0];
+--:-:6:-:1  \@P3 LDG.E.CI.$vsizeI I12, [track1I2];
+--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero]; // masked-off elements are filled from addr_zero (presumably zeroed shared memory; confirm)
+--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero];
+--:-:-:-:1 \@!P2 LDS.U.$vsizeI I10, [addr_zero];
+--:-:5:-:1 \@!P3 LDS.U.$vsizeI I12, [addr_zero];
+</ORDERED>
+
+    } : $SN ? qq{
+
+10:-:-:-:1      ISETP.GE.AND P5, PT, slice0I, RZ, P5; // SN build: one slice per row; P5/P6 are narrowed to valid (>= 0) slices
+20:-:-:-:1      ISETP.GE.AND P6, PT, slice1I, RZ, P6;
+--:-:-:-:1      IADD3 slice0I, slice0I, offsetIc0, n; // image offset = slice offset + channel offset + n
+--:-:-:-:1      IADD3 slice1I, slice1I, offsetIc1, n;
+--:-:-:-:1      LEA      track0I0.CC, slice0I,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, slice1I,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:5:-:1  \@P5 LDG.E.CI.$vsizeI I0, [track0I];
+--:-:6:-:1  \@P6 LDG.E.CI.$vsizeI I1, [track1I];
+--:-:-:-:1 \@!P5 LDS.U.$vsizeI I0, [addr_zero];
+--:-:5:-:1 \@!P6 LDS.U.$vsizeI I1, [addr_zero];
+</ORDERED>
+
+    } : qq{
+--:-:-:-:1      IADD3 offsetIc0, offsetIc0, sliceI0, n; // default build: no per-slice validity test, only P5/P6 gate the loads
+--:-:-:-:1      IADD3 offsetIc1, offsetIc1, sliceI1, n;
+--:-:-:-:1      LEA      track0I0.CC, offsetIc0, param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    offsetIc0, param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, offsetIc1, param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    offsetIc1, param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:5:-:1  \@P5 LDG.E.CI.$vsizeI I0, [track0I];
+--:-:6:-:1  \@P6 LDG.E.CI.$vsizeI I1, [track1I];
+--:-:-:-:1 \@!P5 LDS.U.$vsizeI I0, [addr_zero];
+--:-:5:-:1 \@!P6 LDS.U.$vsizeI I1, [addr_zero];
+</ORDERED>
+    };
++]
+</SCHEDULE_BLOCK>
+
+[+
+    our ($convert_in, $K1);
+    return !$convert_in ? '' : $K1 ? qq{
+01:-:-:-:1      $convert_in F00, F00; // K1 build: one value per register, converted in place (nothing is emitted when convert_in is unset)
+--:-:-:-:1      $convert_in F01, F01;
+--:-:-:-:1      $convert_in F02, F02;
+--:-:1:-:1      $convert_in F03, F03;
+
+02:-:-:-:1      $convert_in F10, F10;
+--:-:-:-:1      $convert_in F11, F11;
+--:-:-:-:1      $convert_in F12, F12;
+--:-:2:-:1      $convert_in F13, F13;
+
+04:-:-:-:1      $convert_in F20, F20;
+--:-:-:-:1      $convert_in F21, F21;
+--:-:-:-:1      $convert_in F22, F22;
+--:-:3:-:1      $convert_in F23, F23;
+
+08:-:-:-:1      $convert_in F30, F30;
+--:-:-:-:1      $convert_in F31, F31;
+--:-:-:-:1      $convert_in F32, F32;
+--:-:4:-:1      $convert_in F33, F33;
+    } : qq{
+01:-:-:-:1      $convert_in F03, F01.H1; // packed build: expand the H0/H1 halves of two registers into four
+--:-:-:-:1      $convert_in F02, F01.H0; // highest destination first, so F01 and F00 are read before they are overwritten
+--:-:-:-:1      $convert_in F01, F00.H1;
+--:-:1:-:1      $convert_in F00, F00.H0;
+
+02:-:-:-:1      $convert_in F13, F11.H1;
+--:-:-:-:1      $convert_in F12, F11.H0;
+--:-:-:-:1      $convert_in F11, F10.H1;
+--:-:2:-:1      $convert_in F10, F10.H0;
+
+04:-:-:-:1      $convert_in F23, F21.H1;
+--:-:-:-:1      $convert_in F22, F21.H0;
+--:-:-:-:1      $convert_in F21, F20.H1;
+--:-:3:-:1      $convert_in F20, F20.H0;
+
+08:-:-:-:1      $convert_in F33, F31.H1;
+--:-:-:-:1      $convert_in F32, F31.H0;
+--:-:-:-:1      $convert_in F31, F30.H1;
+--:-:4:-:1      $convert_in F30, F30.H0;
+    };
++]
+[+
+    our ($convert_in, $N1, $N2);
+    return !$convert_in ? '' : $N1 ? qq{
+10:-:-:-:1      $convert_in I03, I03; // N1 build: one value per register, converted in place (nothing is emitted when convert_in is unset)
+--:-:-:-:1      $convert_in I02, I02;
+--:-:-:-:1      $convert_in I01, I01;
+--:-:5:-:1      $convert_in I00, I00;
+
+20:-:-:-:1      $convert_in I13, I13;
+--:-:-:-:1      $convert_in I12, I12;
+--:-:-:-:1      $convert_in I11, I11;
+--:-:6:-:1      $convert_in I10, I10;
+    } : $N2 ? qq{
+10:-:-:-:1      $convert_in I03, I02.H1; // N2 build: the packed pairs live in I00 and I02 (the registers the N2 loads write)
+--:-:-:-:1      $convert_in I02, I02.H0;
+--:-:-:-:1      $convert_in I01, I00.H1;
+--:-:5:-:1      $convert_in I00, I00.H0;
+
+20:-:-:-:1      $convert_in I13, I12.H1;
+--:-:-:-:1      $convert_in I12, I12.H0;
+--:-:-:-:1      $convert_in I11, I10.H1;
+--:-:6:-:1      $convert_in I10, I10.H0;
+    } : qq{
+10:-:-:-:1      $convert_in I03, I01.H1; // other builds: the packed pairs live in I00 and I01
+--:-:-:-:1      $convert_in I02, I01.H0; // highest destination first, so I01 and I00 are read before they are overwritten
+--:-:-:-:1      $convert_in I01, I00.H1;
+--:-:5:-:1      $convert_in I00, I00.H0;
+
+20:-:-:-:1      $convert_in I13, I11.H1;
+--:-:-:-:1      $convert_in I12, I11.H0;
+--:-:-:-:1      $convert_in I11, I10.H1;
+--:-:6:-:1      $convert_in I10, I10.H0;
+    };
++]
+
+01:-:-:-:1      STS.128 [writeFs + 4x<0*32>], F0; // store the four filter vectors to shared memory at writeFs
+02:-:-:-:1      STS.128 [writeFs + 4x<1*32>], F1;
+04:-:-:-:1      STS.128 [writeFs + 4x<2*32>], F2;
+08:-:-:-:1      STS.128 [writeFs + 4x<3*32>], F3;
+
+10:-:-:-:1      STS.128 [writeIs + 4x<0*32>], I0; // store the two image vectors to shared memory at writeIs
+20:-:-:-:1      STS.128 [writeIs + 4x<1*32>], I1;
+
+--:-:-:-:0      ISETP.LT.AND P5, PT, posCTRS, endCTRS, PT; // P5 = posCTRS < endCTRS: there is another fetch to make
+--:-:5:-:1      I2F.F32.S32 posCTRSf, posCTRS; // float copy of posCTRS for the channel estimate that follows
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD writeFs, writeFs, swapBuf; // move both shared write pointers by swapBuf ...
+--:-:-:-:1      IADD writeIs, writeIs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf; // ... and negate swapBuf so the next swap moves them back
+
+<ORDERED>
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*64 + 00>]; // first shared-memory reads into the j0 operand registers
+--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*64 + 32>];
+--:-:1:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*32 + 16>];
+</ORDERED>
+
+10:-:-:-:1      FMUL channel, posCTRSf, lutSizeRcp; // channel = posCTRS / lutSize, same estimate-plus-epsilon scheme as the first lookup
+--:-:-:-:1      FFMA channel, channel, 5.9604644775390625e-08, channel; // channel += channel * 2^-24
+--:-:5:-:1      F2I.S32.F32.TRUNC channel, channel;
+
+10:-:-:-:1      VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS; // lutOffset0 = posCTRS - channel * lutSize
+--:-:-:-:1      ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT; // P0 = the following lut entry is in the same channel
+[+
+    our ($N1, $N2, $LN, $dshift, $slice_scale, $slice_offset, $slice_load);
+    return $LN ? q{
+
+--:-:-:-:1      SHL lutOffset0, lutOffset0, 3; // lut entries are read with LDS.64 below, so 8 bytes each
+--:-:-:-:1      XMAD.LO2C offsetIc0, channel, param_DHWN, RZ;
+--:-:-:-:1      XMAD      offsetFc0, channel, param_TRSK, RZ;
+
+--:-:-:-:1  @P0 IADD lutOffset1, lutOffset0, 8; // P0: the second row uses the next lut entry of the same channel
+--:-:-:-:1  @P0 MOV  offsetFc1, offsetFc0;
+--:-:-:-:1  @P0 MOV  offsetIc1, offsetIc0;
+--:-:-:-:1 @!P0 MOV  lutOffset1, RZ; // !P0: the second row wraps to lut entry 0 of the next channel
+--:-:-:-:1 @!P0 IADD offsetFc1, offsetFc0, param_TRSK;
+--:-:-:-:1 @!P0 IADD offsetIc1, offsetIc0, param_DHWN;
+
+--:-:-:-:1      IADD posCTRS, posCTRS, 32; // advance the position by 32 for the following fetch
+--:-:5:-:1  @P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut]; // both rows are gated by P5 here (the first lookup used P5 and P6)
+--:-:6:-:1  @P5 LDS.U.64 slice1IF, [lutOffset1 + addr_lut];
+
+    } : qq{
+
+--:-:-:-:1      ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale;
+--:-:-:-:1      XMAD.LO2C offsetIc0, channel, param_DHWN, RZ;
+
+--:-:-:-:1  \@P0 IADD lutOffset1, lutOffset0, $slice_offset;
+--:-:-:-:1  \@P0 MOV  offsetIc1, offsetIc0;
+--:-:-:-:1 \@!P0 MOV  lutOffset1, sb_offset; // !P0: wrap back to the first entry (sb_offset) and move to the next channel
+--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;
+
+--:-:-:-:1      IADD posCTRS, posCTRS, 32; // advance the position by 32 for the following fetch
+--:-:5:-:1  \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4];
+--:-:6:-:1  \@P5 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4];
+    };
++]
+
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    our ($LN, $dshift);
+    return $LN ? qq{
+10:-:-:-:1      IADD3 offsetFc0, offsetFc0, sliceF0, k; // LN build: rebuild both filter pointers from the freshly looked-up slice offsets
+--:-:-:-:1      LEA      track0F0.CC, offsetFc0, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0F1,    offsetFc0, param_F[1], RZ, $dshift;
+
+20:-:-:-:1      IADD3 offsetFc1, offsetFc1, sliceF1, k;
+--:-:-:-:1      LEA      track1F0.CC, offsetFc1, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1F1,    offsetFc1, param_F[1], RZ, $dshift;
+    } : qq{
+--:-:-:-:1      IADD   track0F0.CC, track0F0, partial; // other builds: advance the existing pointer by partial (low word, carry out)
+--:-:-:-:1      IADD.X track0F1,    track0F1, RZ; // high word plus carry
+--:-:-:-:1      IADD   track1F0.CC, track1F0, partial;
+--:-:-:-:1      IADD.X track1F1,    track1F1, RZ;
+    };
++]
+<ORDERED>
+[+
+    our ($K1, $dtype, $vsize, $dsize);
+    return $K1 ? qq{
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f; // K1 build: P0-P3 from the low four bits of preds when P5 is set, cleared otherwise
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;
+--:-:-:-:1  \@P5 SHF.R.U64 preds, preds, 4, preds; // rotate preds right by 4 for the next group (here only when P5 is set)
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>]; // NOTE(review): unlike the first fetch, no zero fill is emitted here for masked-off elements
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>];
+--:-:2:-:1  \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>];
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;
+--:-:-:-:1  \@P5 SHF.L.U64 preds, preds, 4, preds; // rotate left by 4: undoes the rotate above
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>];
+--:-:2:-:1  \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>];
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f; // second row, read through track1F and also gated by P5
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;
+--:-:-:-:1  \@P5 SHF.R.U64 preds, preds, 4, preds;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>];
+--:-:2:-:1  \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>];
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;
+--:-:-:-:1  \@P5 SHF.L.U64 preds, preds, 4, preds;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>];
+--:-:-:-:1  \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>];
+--:-:2:-:1  \@P3 LDG.E.CI.$dtype F33, [track1F + ${dsize}x<35>];
+    } : qq{
+--:-:-:-:1      ISETP.LT.AND P0, PT, k, param_K,    P5; // vector build: one pair of k bound checks, gated by P5 ...
+--:-:-:-:1      ISETP.LT.AND P1, PT, k, param_Km32, P5;
+
+--:-:2:-:1  \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>]; // ... shared by the loads of both rows
+--:-:2:-:1  \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>];
+--:-:2:-:1  \@P0 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>];
+--:-:2:-:1  \@P1 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>];
+    };
++]
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    our ($N1, $N2, $SN, $dshift, $vsizeI);
+    return $N1 ? qq{
+<ORDERED>
+10:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5; // N1 build: P0-P3 = (slice offset >= 0) and P5, one per slice of the first row
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P2, PT, slice0I2, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice0I3, RZ, P5;
+</ORDERED>
+--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0; // add the channel offset to every slice offset
+--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;
+--:-:-:-:1      IADD slice0I2, slice0I2, offsetIc0;
+--:-:-:-:1      IADD slice0I3, slice0I3, offsetIc0;
+--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift; // one image pointer (register pair) per slice
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I4.CC, slice0I2,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I5,    slice0I2,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I6.CC, slice0I3,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I7,    slice0I3,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];
+--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I01, [track0I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I02, [track0I4];
+--:-:2:-:1  \@P3 LDG.E.CI.$vsizeI I03, [track0I6];
+</ORDERED>
+--:-:-:-:1 \@!P0 MOV I00, RZ; // zero the elements that were not loaded
+--:-:-:-:1 \@!P1 MOV I01, RZ;
+--:-:-:-:1 \@!P2 MOV I02, RZ;
+--:-:-:-:1 \@!P3 MOV I03, RZ;
+
+<ORDERED>
+20:-:-:-:1      ISETP.GE.AND P0, PT, slice1I0, RZ, P5; // second row: same sequence, also gated by P5 in this fetch
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice1I1, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P2, PT, slice1I2, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice1I3, RZ, P5;
+</ORDERED>
+--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;
+--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;
+--:-:-:-:1      IADD slice1I2, slice1I2, offsetIc1;
+--:-:-:-:1      IADD slice1I3, slice1I3, offsetIc1;
+--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I4.CC, slice1I2,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I5,    slice1I2,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I6.CC, slice1I3,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I7,    slice1I3,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I10, [track1I0];
+--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I11, [track1I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I12, [track1I4];
+--:3:2:-:1  \@P3 LDG.E.CI.$vsizeI I13, [track1I6];
+</ORDERED>
+--:-:-:-:1 \@!P0 MOV I10, RZ;
+--:-:-:-:1 \@!P1 MOV I11, RZ;
+--:-:-:-:1 \@!P2 MOV I12, RZ;
+--:-:-:-:1 \@!P3 MOV I13, RZ;
+
+    } : $N2 ? qq{
+<ORDERED>
+10:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5; // N2 build: two slices per row, all four tests gated by P5
+--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;
+20:-:-:-:1      ISETP.GE.AND P2, PT, slice1I0, RZ, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, slice1I1, RZ, P5;
+</ORDERED>
+--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0;
+--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;
+--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;
+--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;
+--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];
+--:-:2:-:1  \@P1 LDG.E.CI.$vsizeI I02, [track0I2];
+--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I10, [track1I0];
+--:3:2:-:1  \@P3 LDG.E.CI.$vsizeI I12, [track1I2];
+--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero]; // masked-off elements are filled from addr_zero (presumably zeroed shared memory; confirm)
+--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero];
+--:-:-:-:1 \@!P2 LDS.U.$vsizeI I10, [addr_zero];
+--:-:-:-:1 \@!P3 LDS.U.$vsizeI I12, [addr_zero];
+</ORDERED>
+
+    } : $SN ? qq{
+
+10:-:-:-:1      ISETP.GE.AND P0, PT, slice0I, RZ, P5; // SN build: P0/P1 hold the per-row validity here, so P5 itself is left unchanged
+20:-:-:-:1      ISETP.GE.AND P1, PT, slice1I, RZ, P5;
+--:-:-:-:1      IADD3 slice0I, slice0I, offsetIc0, n; // image offset = slice offset + channel offset + n
+--:-:-:-:1      IADD3 slice1I, slice1I, offsetIc1, n;
+--:-:-:-:1      LEA      track0I0.CC, slice0I,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    slice0I,   param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, slice1I,   param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    slice1I,   param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:2:-:1  \@P0 LDG.E.CI.$vsizeI I0, [track0I];
+--:3:2:-:1  \@P1 LDG.E.CI.$vsizeI I1, [track1I];
+--:-:-:-:1 \@!P0 LDS.U.$vsizeI I0, [addr_zero];
+--:-:-:-:1 \@!P1 LDS.U.$vsizeI I1, [addr_zero];
+</ORDERED>
+
+    } : qq{
+--:-:-:-:1      IADD3 offsetIc0, offsetIc0, sliceI0, n; // default build: no per-slice validity test; both loads are gated by P5 only
+--:-:-:-:1      IADD3 offsetIc1, offsetIc1, sliceI1, n;
+--:-:-:-:1      LEA      track0I0.CC, offsetIc0, param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track0I1,    offsetIc0, param_I[1], RZ, $dshift;
+--:-:-:-:1      LEA      track1I0.CC, offsetIc1, param_I[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1I1,    offsetIc1, param_I[1], RZ, $dshift;
+<ORDERED>
+--:-:2:-:1  \@P5 LDG.E.CI.$vsizeI I0, [track0I]; // NOTE(review): unlike the first fetch, no addr_zero fallback is emitted in this branch
+--:3:2:-:1  \@P5 LDG.E.CI.$vsizeI I1, [track1I];
+</ORDERED>
+    };
++]
+</SCHEDULE_BLOCK>
+
+LOOP:
+[+
+    our ($N1, $N2, $SN, $LN, $K1, $dtype, $dshift, $dsize, $vsize, $vsizeI,
+         $convert_in, $slice_scale, $slice_offset, $slice_load);
+    # %insert maps a slot key "j<J>c<C>" to the text emitted right after FFMA number C of unroll step J.
+    my %insert = (
+        j0c1  => "--:-:5:-:1      I2F.F32.S32 posCTRSf, posCTRS;\n",
+        j0c3  => "--:-:-:-:1      ISETP.LT.AND P5, PT, posCTRS, endCTRS,   PT;\n",
+        j0c5  => "--:-:-:-:1      ISETP.LT.AND P6, PT, posCTRS, endCTRS32, PT;\n",
+
+        j0c15 => "10:-:-:-:1  \@P5 FMUL channel, posCTRSf, lutSizeRcp;\n",
+        j0c20 => "--:-:-:-:1  \@P5 FFMA channel, channel, 5.9604644775390625e-08, channel;\n",
+        j0c22 => "--:-:5:-:1  \@P5 F2I.S32.F32.TRUNC channel, channel;\n",
+
+        $LN ? (
+            j0c36 => "10:-:-:-:1  \@P5 VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS;\n" .
+                     "--:-:-:-:1  \@P5 XMAD offsetIc0, channel, param_DHWN, RZ;\n" .
+                     "--:-:-:-:1  \@P5 XMAD offsetFc0, channel, param_TRSK, RZ;\n" .
+                     "--:-:-:-:1      IADD posCTRS, posCTRS, 32;\n",
+
+            j0c38 => "--:-:-:-:1  \@P5 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT;\n" .
+                     "--:-:-:-:1  \@P5 XMAD.PSL offsetIc0, channel, param_DHWN.H1, offsetIc0;\n" .
+                     "--:-:-:-:1  \@P5 SHL lutOffset0, lutOffset0, 3;\n",
+
+            j0c42 => "--:-:5:-:1  \@P5 LDS.U.64 slice0IF, [lutOffset0 + addr_lut];\n",
+
+            j0c49 => "--:-:-:-:1  \@P0 I2I.U32.U32 offsetFc1, offsetFc0;\n" .
+                     "--:-:-:-:1 \@!P0 IADD offsetFc1, offsetFc0, param_TRSK;\n",
+
+            j0c50 => "--:-:-:-:1  \@P0 I2I.U32.U32 offsetIc1, offsetIc0;\n" .
+                     "--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;\n",
+
+            j0c51 => "--:-:4:-:1 \@!P0 I2I.U32.U32 lutOffset1, RZ;\n" .
+                     "--:-:-:-:1  \@P0 IADD lutOffset1, lutOffset0, 8;\n",
+
+            j1c44 => "10:-:-:-:1  \@P5 IADD3 offsetFc0, offsetFc0, sliceF0, k;\n",
+            j1c49 => "04:-:-:-:1  \@P5 LEA      track0F0.CC, offsetFc0, param_F[0],     $dshift;\n",
+            j1c54 => "--:-:-:-:1  \@P5 LEA.HI.X track0F1,    offsetFc0, param_F[1], RZ, $dshift;\n",
+
+            j2c16 => "08:-:5:-:1  \@P5 LDS.U.64 slice1IF, [lutOffset1 + addr_lut];\n",
+
+            j3c44 => "10:-:-:-:1  \@P5 IADD3 offsetFc1, offsetFc1, sliceF1, k;\n",
+            j3c49 => "--:-:-:-:1  \@P5 LEA      track1F0.CC, offsetFc1, param_F[0],     $dshift;\n",
+            j3c54 => "--:-:-:-:1  \@P5 LEA.HI.X track1F1,    offsetFc1, param_F[1], RZ, $dshift;\n",
+
+            j5c44 => "--:-:-:-:1  \@P5 IADD3 offsetIc0, offsetIc0, sliceI0, n;\n",
+            j5c49 => "--:-:-:-:1  \@P5 LEA      track0I0.CC, offsetIc0, param_I[0],     $dshift;\n",
+            j5c54 => "--:-:-:-:1  \@P5 LEA.HI.X track0I1,    offsetIc0, param_I[1], RZ, $dshift;\n",
+            j5c60 => "20:-:2:-:1  \@P5 LDG.E.CI.$vsize I0, [track0I];\n",
+
+            j6c44 => "--:-:-:-:1  \@P5 IADD3 offsetIc1, offsetIc1, sliceI1, n;\n",
+            j6c49 => "--:-:-:-:1  \@P5 LEA      track1I0.CC, offsetIc1, param_I[0],     $dshift;\n",
+            j6c54 => "--:-:-:-:1  \@P5 LEA.HI.X track1I1,    offsetIc1, param_I[1], RZ, $dshift;\n",
+            j6c60 => "20:3:2:-:1  \@P5 LDG.E.CI.$vsize I1, [track1I];\n",
+
+        ) : (
+            j0c36 => "10:-:-:-:1  \@P5 VMAD.U16.U16 lutOffset0, -channel, lutSize, posCTRS;\n" .
+                     "--:-:-:-:1  \@P5 XMAD offsetIc0, channel, param_DHWN, RZ;\n" .
+                     "--:-:-:-:1      IADD posCTRS, posCTRS, 32;\n",
+
+            j0c39 => "--:-:-:-:1  \@P5 ISETP.LT.AND P0, PT, lutOffset0, lutSizeM1, PT;\n" .
+                     "--:-:-:-:1  \@P5 XMAD.PSL offsetIc0, channel, param_DHWN.H1, offsetIc0;\n" .
+                     "--:-:-:-:1  \@P5 ISCADD lutOffset0, lutOffset0, sb_offset, $slice_scale;\n",
+
+            j0c43 => "--:-:-:-:1  \@P5 LDS.U.$slice_load slice0I, [lutOffset0 + addr_lut4];\n",
+
+            j0c50 => "--:-:-:-:1  \@P0 I2I.U32.U32 offsetIc1, offsetIc0;\n" .
+                     "--:-:-:-:1 \@!P0 IADD offsetIc1, offsetIc0, param_DHWN;\n",
+
+            j0c51 => "--:-:4:-:1 \@!P0 I2I.U32.U32 lutOffset1, sb_offset;\n" .
+                     "--:-:-:-:1  \@P0 IADD lutOffset1, lutOffset0, $slice_offset;\n",
+
+            j2c16 => "08:-:-:-:1  \@P5 LDS.U.$slice_load slice1I, [lutOffset1 + addr_lut4];\n",
+
+            j1c49 => "04:-:-:-:1  \@P5 IADD   track0F0.CC, track0F0, param_K32p;\n",
+            j1c54 => "--:-:-:-:1  \@P5 IADD.X track0F1,    track0F1, RZ;\n",
+
+            j3c49 => "--:-:-:-:1  \@P5 IADD   track1F0.CC, track1F0, param_K32p;\n",
+            j3c54 => "--:-:-:-:1  \@P5 IADD.X track1F1,    track1F1, RZ;\n",
+        ),
+        # NOTE(review): later lists reuse keys of earlier ones (e.g. j5c60 under $LN, $N1 and $N2); the last one wins - presumably those flags are exclusive, confirm.
+        $N1 ? (
+
+            j5c31 => "--:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P2, PT, slice0I2, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I2, slice0I2, offsetIc0;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P3, PT, slice0I3, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I3, slice0I3, offsetIc0;\n",
+
+            j5c32 => "--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift;\n",
+            j5c37 => "--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;\n",
+            j5c42 => "--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track0I4.CC, slice0I2,   param_I[0],     $dshift;\n",
+            j5c47 => "--:-:-:-:1      LEA.HI.X track0I5,    slice0I2,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track0I6.CC, slice0I3,   param_I[0],     $dshift;\n",
+            j5c52 => "--:-:-:-:1      LEA.HI.X track0I7,    slice0I3,   param_I[1], RZ, $dshift;\n",
+
+            j5c55 => "20:-:-:-:1 \@!P0 I2I.U32.U32 I00, RZ;\n",
+            j5c57 => "--:-:-:-:1 \@!P1 I2I.U32.U32 I01, RZ;\n",
+            j5c59 => "--:-:-:-:1 \@!P2 I2I.U32.U32 I02, RZ;\n",
+            j5c61 => "--:-:-:-:1 \@!P3 I2I.U32.U32 I03, RZ;\n",
+
+            j5c56 => "--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];\n",
+            j5c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I01, [track0I2];\n",
+            j5c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I02, [track0I4];\n",
+            j5c62 => "--:-:2:-:1  \@P3 LDG.E.CI.$vsizeI I03, [track0I6];\n",
+
+            j6c31 => "--:-:-:-:1      ISETP.GE.AND P0, PT, slice1I0, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P1, PT, slice1I1, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P2, PT, slice1I2, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I2, slice1I2, offsetIc1;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P3, PT, slice1I3, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I3, slice1I3, offsetIc1;\n",
+
+            j6c32 => "--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;\n",
+            j6c37 => "--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;\n",
+            j6c42 => "--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track1I4.CC, slice1I2,   param_I[0],     $dshift;\n",
+            j6c47 => "--:-:-:-:1      LEA.HI.X track1I5,    slice1I2,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track1I6.CC, slice1I3,   param_I[0],     $dshift;\n",
+            j6c52 => "--:-:-:-:1      LEA.HI.X track1I7,    slice1I3,   param_I[1], RZ, $dshift;\n",
+
+            j6c55 => "20:-:-:-:1 \@!P0 I2I.U32.U32 I10, RZ;\n",
+            j6c57 => "--:-:-:-:1 \@!P1 I2I.U32.U32 I11, RZ;\n",
+            j6c59 => "--:-:-:-:1 \@!P2 I2I.U32.U32 I12, RZ;\n",
+            j6c61 => "--:-:-:-:1 \@!P3 I2I.U32.U32 I13, RZ;\n",
+
+            j6c56 => "--:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I10, [track1I0];\n",
+            j6c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$vsizeI I11, [track1I2];\n",
+            j6c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$vsizeI I12, [track1I4];\n",
+            j6c62 => "--:3:2:-:1  \@P3 LDG.E.CI.$vsizeI I13, [track1I6];\n",
+
+        ) : $N2 ? (
+
+            j5c31 => "--:-:-:-:1      ISETP.GE.AND P0, PT, slice0I0, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I0, slice0I0, offsetIc0;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P1, PT, slice0I1, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice0I1, slice0I1, offsetIc0;\n",
+
+            j5c35 => "--:-:-:-:1      LEA      track0I0.CC, slice0I0,   param_I[0],     $dshift;\n",
+            j5c40 => "--:-:-:-:1      LEA.HI.X track0I1,    slice0I0,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track0I2.CC, slice0I1,   param_I[0],     $dshift;\n",
+            j5c45 => "--:-:-:-:1      LEA.HI.X track0I3,    slice0I1,   param_I[1], RZ, $dshift;\n",
+
+            j5c46 => "--:-:-:-:1 \@!P0 LDS.U.$vsizeI I00, [addr_zero];\n",
+            j5c47 => "--:-:-:-:1 \@!P1 LDS.U.$vsizeI I02, [addr_zero];\n",
+
+            j5c60 => "20:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I00, [track0I0];\n",
+            j5c62 => "--:-:2:-:1  \@P1 LDG.E.CI.$vsizeI I02, [track0I2];\n",
+
+            j6c31 => "--:-:-:-:1      ISETP.GE.AND P0, PT, slice1I0, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I0, slice1I0, offsetIc1;\n" .
+                     "--:-:-:-:1      ISETP.GE.AND P1, PT, slice1I1, RZ, P5;\n" .
+                     "--:-:-:-:1      IADD slice1I1, slice1I1, offsetIc1;\n",
+
+            j6c35 => "--:-:-:-:1      LEA      track1I0.CC, slice1I0,   param_I[0],     $dshift;\n",
+            j6c40 => "--:-:-:-:1      LEA.HI.X track1I1,    slice1I0,   param_I[1], RZ, $dshift;\n" .
+                     "--:-:-:-:1      LEA      track1I2.CC, slice1I1,   param_I[0],     $dshift;\n",
+            j6c45 => "--:-:-:-:1      LEA.HI.X track1I3,    slice1I1,   param_I[1], RZ, $dshift;\n",
+
+            j6c46 => "--:-:-:-:1 \@!P0 LDS.U.$vsizeI I10, [addr_zero];\n",
+            j6c47 => "--:-:-:-:1 \@!P1 LDS.U.$vsizeI I12, [addr_zero];\n",
+
+            j6c60 => "20:-:-:-:1  \@P0 LDG.E.CI.$vsizeI I10, [track1I0];\n",
+            j6c62 => "--:3:2:-:1  \@P1 LDG.E.CI.$vsizeI I12, [track1I2];\n",
+
+        ) : $SN ? (
+            j5c31 => "--:-:-:-:1      ISETP.GE.AND P2, PT, slice0I, RZ, P5;\n",
+            j5c45 => "--:-:-:-:1 \@!P2 LDS.U.$vsize I0, [addr_zero];\n",
+
+            j5c44 => "--:-:-:-:1  \@P5 IADD3 offsetIc0, offsetIc0, slice0I, n;\n",
+            j5c49 => "--:-:-:-:1  \@P5 LEA      track0I0.CC, offsetIc0, param_I[0],     $dshift;\n",
+            j5c54 => "--:-:-:-:1  \@P5 LEA.HI.X track0I1,    offsetIc0, param_I[1], RZ, $dshift;\n",
+            j5c60 => "20:-:2:-:1  \@P2 LDG.E.CI.$vsize I0, [track0I];\n",
+
+            j6c31 => "--:-:-:-:1      ISETP.GE.AND P2, PT, slice1I, RZ, P5;\n",
+            j6c45 => "--:-:-:-:1 \@!P2 LDS.U.$vsize I1, [addr_zero];\n",
+
+            j6c44 => "--:-:-:-:1  \@P5 IADD3 offsetIc1, offsetIc1, slice1I, n;\n",
+            j6c49 => "--:-:-:-:1  \@P5 LEA      track1I0.CC, offsetIc1, param_I[0],     $dshift;\n",
+            j6c54 => "--:-:-:-:1  \@P5 LEA.HI.X track1I1,    offsetIc1, param_I[1], RZ, $dshift;\n",
+            j6c60 => "20:3:2:-:1  \@P2 LDG.E.CI.$vsize I1, [track1I];\n",
+        ) : (),
+
+        j1c30 => "20:6:-:-:1  \@P6 STS.128 [writeFs + 4x<0*32>], F0;\n",
+        j2c30 => "20:6:-:-:1  \@P6 STS.128 [writeFs + 4x<1*32>], F1;\n",
+        j3c30 => "20:6:-:-:1  \@P6 STS.128 [writeFs + 4x<2*32>], F2;\n",
+        j4c30 => "20:6:-:-:1  \@P6 STS.128 [writeFs + 4x<3*32>], F3;\n",
+        j5c30 => "20:6:-:-:1  \@P6 STS.128 [writeIs + 4x<0*32>], I0;\n",
+        j6c30 => "20:6:-:-:1  \@P6 STS.128 [writeIs + 4x<1*32>], I1;\n",
+
+        $convert_in ? (
+            j1c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j2c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j3c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j4c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j5c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j6c5  => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            $K1 ? (
+                j1c8  => "--:-:-:-:1  \@P6 $convert_in F00, F00;\n",
+                j1c10 => "--:-:-:-:1  \@P6 $convert_in F01, F01;\n",
+                j1c12 => "--:-:-:-:1  \@P6 $convert_in F02, F02;\n",
+                j1c14 => "--:-:6:-:1  \@P6 $convert_in F03, F03;\n",
+
+                j2c8  => "--:-:-:-:1  \@P6 $convert_in F10, F10;\n",
+                j2c10 => "--:-:-:-:1  \@P6 $convert_in F11, F11;\n",
+                j2c12 => "--:-:-:-:1  \@P6 $convert_in F12, F12;\n",
+                j2c14 => "--:-:6:-:1  \@P6 $convert_in F13, F13;\n",
+
+                j3c8  => "--:-:-:-:1  \@P6 $convert_in F20, F20;\n",
+                j3c10 => "--:-:-:-:1  \@P6 $convert_in F21, F21;\n",
+                j3c12 => "--:-:-:-:1  \@P6 $convert_in F22, F22;\n",
+                j3c14 => "--:-:6:-:1  \@P6 $convert_in F23, F23;\n",
+
+                j4c8  => "--:-:-:-:1  \@P6 $convert_in F30, F30;\n",
+                j4c10 => "--:-:-:-:1  \@P6 $convert_in F31, F31;\n",
+                j4c12 => "--:-:-:-:1  \@P6 $convert_in F32, F32;\n",
+                j4c14 => "--:-:6:-:1  \@P6 $convert_in F33, F33;\n",
+            ) : (
+                j1c8  => "--:-:-:-:1  \@P6 $convert_in F03, F01.H1;\n",
+                j1c10 => "--:-:-:-:1  \@P6 $convert_in F02, F01.H0;\n",
+                j1c12 => "--:-:-:-:1  \@P6 $convert_in F01, F00.H1;\n",
+                j1c14 => "--:-:6:-:1  \@P6 $convert_in F00, F00.H0;\n",
+
+                j2c8  => "--:-:-:-:1  \@P6 $convert_in F13, F11.H1;\n",
+                j2c10 => "--:-:-:-:1  \@P6 $convert_in F12, F11.H0;\n",
+                j2c12 => "--:-:-:-:1  \@P6 $convert_in F11, F10.H1;\n",
+                j2c14 => "--:-:6:-:1  \@P6 $convert_in F10, F10.H0;\n",
+
+                j3c8  => "--:-:-:-:1  \@P6 $convert_in F23, F21.H1;\n",
+                j3c10 => "--:-:-:-:1  \@P6 $convert_in F22, F21.H0;\n",
+                j3c12 => "--:-:-:-:1  \@P6 $convert_in F21, F20.H1;\n",
+                j3c14 => "--:-:6:-:1  \@P6 $convert_in F20, F20.H0;\n",
+
+                j4c8  => "--:-:-:-:1  \@P6 $convert_in F33, F31.H1;\n",
+                j4c10 => "--:-:-:-:1  \@P6 $convert_in F32, F31.H0;\n",
+                j4c12 => "--:-:-:-:1  \@P6 $convert_in F31, F30.H1;\n",
+                j4c14 => "--:-:6:-:1  \@P6 $convert_in F30, F30.H0;\n",
+            ),
+            $N1 ? (
+                j5c8  => "--:-:-:-:1  \@P6 $convert_in I03, I03;\n",
+                j5c10 => "--:-:-:-:1  \@P6 $convert_in I02, I02;\n",
+                j5c12 => "--:-:-:-:1  \@P6 $convert_in I01, I01;\n",
+                j5c14 => "--:-:6:-:1  \@P6 $convert_in I00, I00;\n",
+
+                j6c8  => "--:-:-:-:1  \@P6 $convert_in I13, I13;\n",
+                j6c10 => "--:-:-:-:1  \@P6 $convert_in I12, I12;\n",
+                j6c12 => "--:-:-:-:1  \@P6 $convert_in I11, I11;\n",
+                j6c14 => "--:-:6:-:1  \@P6 $convert_in I10, I10;\n",
+            ) : $N2 ? (
+                j5c8  => "--:-:-:-:1  \@P6 $convert_in I03, I02.H1;\n",
+                j5c10 => "--:-:-:-:1  \@P6 $convert_in I02, I02.H0;\n",
+                j5c12 => "--:-:-:-:1  \@P6 $convert_in I01, I00.H1;\n",
+                j5c14 => "--:-:6:-:1  \@P6 $convert_in I00, I00.H0;\n",
+
+                j6c8  => "--:-:-:-:1  \@P6 $convert_in I13, I12.H1;\n",
+                j6c10 => "--:-:-:-:1  \@P6 $convert_in I12, I12.H0;\n",
+                j6c12 => "--:-:-:-:1  \@P6 $convert_in I11, I10.H1;\n",
+                j6c14 => "--:-:6:-:1  \@P6 $convert_in I10, I10.H0;\n",
+            ) : (
+                j5c8  => "--:-:-:-:1  \@P6 $convert_in I03, I01.H1;\n",
+                j5c10 => "--:-:-:-:1  \@P6 $convert_in I02, I01.H0;\n",
+                j5c12 => "--:-:-:-:1  \@P6 $convert_in I01, I00.H1;\n",
+                j5c14 => "--:-:6:-:1  \@P6 $convert_in I00, I00.H0;\n",
+
+                j6c8  => "--:-:-:-:1  \@P6 $convert_in I13, I11.H1;\n",
+                j6c10 => "--:-:-:-:1  \@P6 $convert_in I12, I11.H0;\n",
+                j6c12 => "--:-:-:-:1  \@P6 $convert_in I11, I10.H1;\n",
+                j6c14 => "--:-:6:-:1  \@P6 $convert_in I10, I10.H0;\n",
+            ),
+        ) : (
+            j1c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j2c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j3c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j4c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j5c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+            j6c27 => "--:-:-:-:1      DEPBAR.LE SB1, 5;\n",
+        ),
+
+        $K1 ? (
+            j1c31 => "--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;\n",
+            j1c32 => "--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;\n",
+            j1c33 => "--:-:-:-:1  \@P5 SHF.R.U64 preds, preds, 4, preds;\n",
+            j1c56 => "20:-:-:-:1  \@P0 LDG.E.CI.$dtype F00, [track0F + ${dsize}x<0>];\n",
+            j1c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype F01, [track0F + ${dsize}x<1>];\n",
+            j1c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype F02, [track0F + ${dsize}x<2>];\n",
+            j1c62 => "--:-:2:-:1  \@P3 LDG.E.CI.$dtype F03, [track0F + ${dsize}x<3>];\n",
+
+            j2c31 => "--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;\n",
+            j2c32 => "--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;\n",
+            j2c33 => "--:-:-:-:1  \@P5 SHF.L.U64 preds, preds, 4, preds;\n",
+            j2c56 => "20:-:-:-:1  \@P0 LDG.E.CI.$dtype F10, [track0F + ${dsize}x<32>];\n",
+            j2c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype F11, [track0F + ${dsize}x<33>];\n",
+            j2c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype F12, [track0F + ${dsize}x<34>];\n",
+            j2c62 => "--:-:2:-:1  \@P3 LDG.E.CI.$dtype F13, [track0F + ${dsize}x<35>];\n",
+
+            j3c31 => "--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;\n",
+            j3c32 => "--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;\n",
+            j3c33 => "--:-:-:-:1  \@P5 SHF.R.U64 preds, preds, 4, preds;\n",
+            j3c56 => "20:-:-:-:1  \@P0 LDG.E.CI.$dtype F20, [track1F + ${dsize}x<0>];\n",
+            j3c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype F21, [track1F + ${dsize}x<1>];\n",
+            j3c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype F22, [track1F + ${dsize}x<2>];\n",
+            j3c62 => "--:-:2:-:1  \@P3 LDG.E.CI.$dtype F23, [track1F + ${dsize}x<3>];\n",
+
+            j4c31 => "--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;\n",
+            j4c32 => "--:-:-:-:1 \@!P5 R2P PR,    RZ, 0x0f;\n",
+            j4c33 => "--:-:-:-:1  \@P5 SHF.L.U64 preds, preds, 4, preds;\n",
+            j4c56 => "20:-:-:-:1  \@P0 LDG.E.CI.$dtype F30, [track1F + ${dsize}x<32>];\n",
+            j4c58 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype F31, [track1F + ${dsize}x<33>];\n",
+            j4c60 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype F32, [track1F + ${dsize}x<34>];\n",
+            j4c62 => "--:-:2:-:1  \@P3 LDG.E.CI.$dtype F33, [track1F + ${dsize}x<35>];\n",
+
+        ) : (
+            j0c52 => "--:-:-:-:1      ISETP.LT.AND P0, PT, k, param_K,    P5;\n",
+            j0c53 => "--:-:-:-:1      ISETP.LT.AND P1, PT, k, param_Km32, P5;\n",
+
+            j1c60 => "20:-:2:-:1  \@P0 LDG.E.CI.$vsize F0, [track0F + ${dsize}x<00>];\n",
+            j2c60 => "20:-:2:-:1  \@P1 LDG.E.CI.$vsize F1, [track0F + ${dsize}x<32>];\n",
+            j3c60 => "20:-:2:-:1  \@P0 LDG.E.CI.$vsize F2, [track1F + ${dsize}x<00>];\n",
+            j4c60 => "20:-:2:-:1  \@P1 LDG.E.CI.$vsize F3, [track1F + ${dsize}x<32>];\n",
+        ),
+        # j6c63: barrier, then flip the buffers (read* -= swapBuf, write* += swapBuf, swapBuf = -swapBuf).
+        j6c63 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readFs,  readFs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD readIs,  readIs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD writeFs, writeFs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD writeIs, writeIs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD swapBuf, RZ,     -swapBuf;\n",
+        # j7c63 is the last slot of the last step: branch back to LOOP while P6 holds.
+        j7c63 => "--:-:-:Y:5  \@P6 BRA.U LOOP;\n",
+    );
+    my @cOrder;  # order in which the 64 accumulators cx<x>y<y> are updated within one unroll step
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);  # (dx,dy) offsets visited around each base (x,y)
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;  # walk the y bases in the opposite direction for the next x
+    }
+    my $out;
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;  # register set (j0 or j1) read by the FFMAs of this step
+        my $nOdd     = !$odd + 0;  # the other set, filled by the LDS below; "+ 0" makes it numeric for %d
+        my $rsOffset = ($j + 1) % 8;  # row index used in the LDS addresses; wraps to 0 on the last step
+        my $rsPred   = $j == 7 ? '@P6' : '   ';  # only the last step's LDS are predicated on P6
+        # Slots c0/c2/c4/c6 of every step carry the four LDS.U.128 loads into the j<nOdd> registers.
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';  # text to append after this FFMA, if any
+
+            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;  # 0 when the appended text contains one of these opcodes, else 1
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';  # yield flag only on FFMA 32, and only when its stall is 1
+
+            my $wait   = $c == 0 ? '01' : '--';  # first FFMA of a step carries wait mask 01 (presumably the barrier set by the c6 LDS - confirm)
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha, param_alpha;
+
+--:-:-:-:1      ISETP.EQ.AND P4, PT, RZ, param_flags, PT; // P4 = (param_flags == 0)
+
+--:-:-:-:1      LOP.AND tid_31, tid, 31;
+--:-:-:-:1      SHR.U32 tid_32, tid, 5;
+
+// readOs = ((tid_32 << 7) + tid_31) << 2
+--:-:-:-:1      ISCADD readOs, tid_32, tid_31, 7;
+--:-:-:-:1      SHL    readOs, readOs, 2;
+
+--:-:-:-:1      ISETP.EQ.AND P6, PT, tid_31, RZ, PT; // P6 = (tid_31 == 0)
+
+// k = idx_K*64 + tid_32
+--:-:-:-:1      ISCADD  k00, idx_K, tid_32, 6;
+--:-:-:-:1      IADD    k04, k00, 4;
+--:-:-:-:1      IADD    k08, k00, 8;
+--:-:-:-:1      IADD    k12, k00, 12;
+// The bsum_offset computation below is emitted only when $bsum is set.
+[+
+    our $bsum; return $bsum ? q{
+--:-:-:-:1      XMAD      bsum_offset, idx_Q, param_gridN,   idx_N;
+--:-:-:-:1      XMAD.LO2C bsum_offset, idx_P, param_gridQN,  bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum_offset, idx_M, param_gridPQN, bsum_offset;
+    } : '';
++]
+// Bounds predicate P4 and the output offset: one variant when $LN is set, another otherwise.
+[+
+    our $LN; return $LN ? q{
+// n = idx_N*32 + tid31;
+--:-:-:-:1      ISCADD N, idx_N, tid_31, 5;
+// n < N
+--:-:-:-:1      ISETP.LT.AND P4, PT, N, param_N, P4;
+
+// o = k*MPQN + m*PQN + p*QN + q*N + n
+--:-:-:-:1      XMAD      offset, idx_Q, param_N,   N;
+--:-:-:-:1      XMAD.LO2C offset, idx_P, param_QN,  offset;
+--:-:-:-:1      XMAD.LO2C offset, idx_M, param_PQN, offset;
+
+    } : q{
+
+--:-:-:-:1      SHL M, idx_M, param_shiftM;
+--:-:-:-:1      SHL P, idx_P, param_shiftP;
+--:-:-:-:1      SHL Q, idx_Q, param_shiftQ;
+--:-:-:-:1      SHL N, idx_N, param_shiftN;
+
+--:-:-:-:1      BFE.U32 super_M, tid_31, param_SuperM;
+--:-:-:-:1      BFE.U32 super_P, tid_31, param_SuperP;
+--:-:-:-:1      BFE.U32 super_Q, tid_31, param_SuperQ;
+--:-:-:-:1      LOP.AND super_N, tid_31, param_SuperN;
+
+--:-:-:-:1      IADD M, M, super_M;
+--:-:-:-:1      IADD P, P, super_P;
+--:-:-:-:1      IADD Q, Q, super_Q;
+--:-:-:-:1      IADD N, N, super_N;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, M, param_M, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, P, param_P, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, Q, param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P0, PT, N, param_N, P0;
+--:-:-:-:1      PSETP.AND.AND P4, PT, P0, P1, P2;
+
+// o = k*MPQN + m*PQN + p*QN + q*N + N
+--:-:-:-:1      XMAD      offset, Q, param_N,   N;
+--:-:-:-:1      XMAD.LO2C offset, P, param_QN,  offset;
+--:-:-:-:1      XMAD.LO2C offset, M, param_PQN, offset;
+    };
++]
+--:-:-:-:1      XMAD.LO2C offset, k00, param_MPQN, offset;
+
+--:-:-:-:1      MOV MPQN16, param_MPQN;
+--:-:-:-:1      SHL MPQN4,  MPQN16, [+ dshift()+2 +];
+--:-:-:-:1      SHL MPQN16, MPQN16, 4;
+
+--:-:-:-:1      MOV32I one, 1.0;
+// Scale the y0..y3 accumulators by alpha into shuffle_x*y*, then STS.128 them at writeOs.
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:1      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:1      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:1      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:1      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 00>], shuffle_x0y0;
+--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 00>], shuffle_x0y1;
+--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 16>], shuffle_x4y1;
+--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 00>], shuffle_x0y2;
+--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 16>], shuffle_x4y2;
+--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 16>], shuffle_x4y3;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+// Two STORE_O calls for the values stored above, advancing readOs between them.
+--:-:-:-:5      CAL STORE_O;
+--:-:-:-:0      IADD readOs, readOs, 4x<16*128 + 4*16>;
+--:-:-:-:5      CAL STORE_O;
+// The first four y4 products are issued before the barrier; the remaining y4..y7 ones follow it.
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha;
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:0      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+<SCHEDULE_BLOCK>
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:1      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:1      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:1      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:1      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:1      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:1      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:1      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:1      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 00>], shuffle_x0y4;
+--:-:-:-:1      STS.128 [writeOs+4x<0*128 + 16>], shuffle_x4y4;
+--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 00>], shuffle_x0y5;
+--:-:-:-:1      STS.128 [writeOs+4x<1*128 + 16>], shuffle_x4y5;
+--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 00>], shuffle_x0y6;
+--:-:-:-:1      STS.128 [writeOs+4x<2*128 + 16>], shuffle_x4y6;
+--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeOs+4x<3*128 + 16>], shuffle_x4y7;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+// Rewind readOs by the amount added earlier, then repeat the two STORE_O calls.
+--:-:-:-:0      IADD readOs, readOs, -4x<16*128 + 4*16>;
+--:-:-:-:5      CAL STORE_O;
+--:-:-:-:0      IADD readOs, readOs,  4x<16*128 + 4*16>;
+--:-:-:-:5      CAL STORE_O;
+
+--:-:-:-:5      EXIT;
+
+STORE_O:
+
+--:-:-:-:2      ISETP.LT.AND P0, PT, k00, param_K, P4; // k00 < K && n < N
+--:-:-:-:2      ISETP.LT.AND P1, PT, k04, param_K, P4; // k04 < K && n < N
+--:-:-:-:2      ISETP.LT.AND P2, PT, k08, param_K, P4; // k08 < K && n < N
+--:-:-:-:1      ISETP.LT.AND P3, PT, k12, param_K, P4; // k12 < K && n < N
+[+
+    our ($beta, $brelu, $bprelu, $dshift, $dtype);  # emitted only if one of the first three is set: Out* addresses from offset/param_X, then loads into b00..b12 (RZ where the predicate is false)
+    return $beta || $brelu || $bprelu ? qq{
+<SCHEDULE_BLOCK>
+01:-:-:-:1      LEA      Out00_0.CC, offset, param_X[0],     $dshift;
+--:-:-:-:1      LEA.HI.X Out00_1,    offset, param_X[1], RZ, $dshift;
+--:-:-:-:1      IADD     Out04_0.CC, Out00_0, MPQN4;
+--:-:-:-:1      IADD.X   Out04_1,    Out00_1, RZ;
+--:-:-:-:1      IADD     Out08_0.CC, Out04_0, MPQN4;
+--:-:-:-:1      IADD.X   Out08_1,    Out04_1, RZ;
+--:-:-:-:1      IADD     Out12_0.CC, Out08_0, MPQN4;
+--:-:-:-:1      IADD.X   Out12_1,    Out08_1, RZ;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype b00, [Out00_0];
+--:-:-:-:1 \@!P0 MOV b00, RZ;
+--:-:5:-:1  \@P1 LDG.E.CI.$dtype b04, [Out04_0];
+--:-:-:-:1 \@!P1 MOV b04, RZ;
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype b08, [Out08_0];
+--:-:-:-:1 \@!P2 MOV b08, RZ;
+--:-:6:-:1  \@P3 LDG.E.CI.$dtype b12, [Out12_0];
+--:-:-:-:1 \@!P3 MOV b12, RZ;
+</ORDERED>
+</SCHEDULE_BLOCK>
+    } : '';
++]
+[+
+    our $bias;  # when set: load b00..b12 from param_Sum at k00..k12 (RZ where the predicate is false). NOTE(review): same b00..b12 as the preceding beta/brelu/bprelu snippet - presumably never enabled together; confirm.
+    return $bias ? q{
+<SCHEDULE_BLOCK>
+20:-:-:-:1      LEA      Sum00_0.CC, k00, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum00_1,    k00, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum04_0.CC, k04, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum04_1,    k04, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum08_0.CC, k08, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum08_1,    k08, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum12_0.CC, k12, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum12_1,    k12, param_Sum[1], RZ, 2;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI b00, [Sum00_0];
+--:-:-:-:1 @!P0 MOV b00, RZ;
+--:-:5:-:1  @P1 LDG.E.CI b04, [Sum04_0];
+--:-:-:-:1 @!P1 MOV b04, RZ;
+--:-:-:-:1  @P2 LDG.E.CI b08, [Sum08_0];
+--:-:-:-:1 @!P2 MOV b08, RZ;
+--:-:6:-:1  @P3 LDG.E.CI b12, [Sum12_0];
+--:-:-:-:1 @!P3 MOV b12, RZ;
+</ORDERED>
+</SCHEDULE_BLOCK>
+    } : '';
++]
+--:-:-:-:1      LDS o00_0, [readOs + 4x< 0*128 + 0*32 + 0*16>]; // o00_0..o00_3: four words, 32 words apart, summed by the FADDs below
+--:-:-:-:1      LDS o00_1, [readOs + 4x< 0*128 + 1*32 + 0*16>];
+--:-:-:-:1      LDS o00_2, [readOs + 4x< 0*128 + 2*32 + 0*16>];
+--:-:1:Y:1      LDS o00_3, [readOs + 4x< 0*128 + 3*32 + 0*16>]; // signals barrier 1 for the o00 group
+--:-:-:-:1      LDS o04_0, [readOs + 4x< 4*128 + 0*32 + 1*16>];
+--:-:-:-:1      LDS o04_1, [readOs + 4x< 4*128 + 1*32 + 1*16>];
+--:-:-:-:1      LDS o04_2, [readOs + 4x< 4*128 + 2*32 + 1*16>];
+--:-:2:Y:1      LDS o04_3, [readOs + 4x< 4*128 + 3*32 + 1*16>]; // signals barrier 2 for the o04 group
+--:-:-:-:1      LDS o08_0, [readOs + 4x< 8*128 + 0*32 + 2*16>];
+--:-:-:-:1      LDS o08_1, [readOs + 4x< 8*128 + 1*32 + 2*16>];
+--:-:-:-:1      LDS o08_2, [readOs + 4x< 8*128 + 2*32 + 2*16>];
+--:-:3:Y:1      LDS o08_3, [readOs + 4x< 8*128 + 3*32 + 2*16>]; // signals barrier 3 for the o08 group
+--:-:-:-:1      LDS o12_0, [readOs + 4x<12*128 + 0*32 + 3*16>];
+--:-:-:-:1      LDS o12_1, [readOs + 4x<12*128 + 1*32 + 3*16>];
+--:-:-:-:1      LDS o12_2, [readOs + 4x<12*128 + 2*32 + 3*16>];
+--:-:4:Y:1      LDS o12_3, [readOs + 4x<12*128 + 3*32 + 3*16>]; // signals barrier 4 for the o12 group
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      FADD o00_0, o00_0, o00_1; // wait mask 01: barrier 1, set by the o00 LDS group
+--:-:-:-:1      FADD o00_2, o00_2, o00_3;
+02:-:-:-:1      FADD o04_0, o04_0, o04_1; // wait mask 02: barrier 2 (o04 loads)
+--:-:-:-:1      FADD o04_2, o04_2, o04_3;
+04:-:-:-:1      FADD o08_0, o08_0, o08_1; // wait mask 04: barrier 3 (o08 loads)
+--:-:-:-:1      FADD o08_2, o08_2, o08_3;
+08:-:-:-:1      FADD o12_0, o12_0, o12_1; // wait mask 08: barrier 4 (o12 loads)
+--:-:-:-:1      FADD o12_2, o12_2, o12_3;
+// Second level of the reduction: outNN = oNN_0 + oNN_1 + oNN_2 + oNN_3.
+--:-:-:-:1      FADD out00, o00_0, o00_2;
+--:-:-:-:1      FADD out04, o04_0, o04_2;
+--:-:-:-:1      FADD out08, o08_0, o08_2;
+--:-:-:-:3      FADD out12, o12_0, o12_2;
+[+
+    our $bias; return $bias ? q{
+10:-:-:-:1      FADD out00, out00, b00; // wait mask 10: barrier 5 (bias loads b00, b04)
+--:-:-:-:1      FADD out04, out04, b04;
+20:-:-:-:1      FADD out08, out08, b08; // wait mask 20: barrier 6 (bias loads b08, b12)
+--:-:-:-:1      FADD out12, out12, b12;
+    } : '';
++]
+[+
+    our $relu; return $relu ? q{
+// relu: out = maximum(out, 0), applied in place to out00..out12
+--:-:-:-:1      FMNMX out00, out00, RZ, !PT;
+--:-:-:-:1      FMNMX out04, out04, RZ, !PT;
+--:-:-:-:1      FMNMX out08, out08, RZ, !PT;
+--:-:-:-:1      FMNMX out12, out12, RZ, !PT;
+    } : '';
++]
+[+
+    our $prelu; return $prelu ? q{
+// prelu: out = maximum(x, 0) + slope * minimum(0, x), with slope = param_beta. First b = maximum(out, 0):
+--:-:-:-:1      FMNMX b00, out00, RZ, !PT;
+--:-:-:-:1      FMNMX b04, out04, RZ, !PT;
+--:-:-:-:1      FMNMX b08, out08, RZ, !PT;
+--:-:-:-:1      FMNMX b12, out12, RZ, !PT;
+// x = minimum(out, 0)
+--:-:-:-:1      FMNMX x00, out00, RZ, PT;
+--:-:-:-:1      FMNMX x04, out04, RZ, PT;
+--:-:-:-:1      FMNMX x08, out08, RZ, PT;
+--:-:-:-:1      FMNMX x12, out12, RZ, PT;
+// out = x * param_beta + b
+--:-:-:-:1      FFMA out00, x00, param_beta, b00;
+--:-:-:-:1      FFMA out04, x04, param_beta, b04;
+--:-:-:-:1      FFMA out08, x08, param_beta, b08;
+--:-:-:-:1      FFMA out12, x12, param_beta, b12;
+    } : '';
++]
+</SCHEDULE_BLOCK>
+<SCHEDULE_BLOCK>
+[+
+    our ($beta, $brelu, $bprelu, $convert_in);   # flags and conversion opcode declared outside this block
+    return $convert_in && ($beta || $brelu || $bprelu) ? qq{
+10:-:1:-:1  \@P0 $convert_in b00, b00; // convert loaded b00 in place: waits barrier 5, signals barrier 1
+--:-:2:-:1  \@P1 $convert_in b04, b04; // signals barrier 2
+20:-:3:-:1  \@P2 $convert_in b08, b08; // waits barrier 6, signals barrier 3
+--:-:4:-:1  \@P3 $convert_in b12, b12; // signals barrier 4
+    } : '';
++]
+[+
+    our $beta; return $beta ? q{
+11:-:-:-:1      FFMA out00, b00, param_beta, out00; // out += b * beta, wait mask 11 = barriers 1 and 5
+02:-:-:-:1      FFMA out04, b04, param_beta, out04; // wait mask 02 = barrier 2
+24:-:-:-:1      FFMA out08, b08, param_beta, out08; // wait mask 24 = barriers 3 and 6
+08:-:-:-:1      FFMA out12, b12, param_beta, out12; // wait mask 08 = barrier 4
+    } : '';
++]
+[+
+    our $brelu; return $brelu ? q{
+// brelu: delta *= (x > 0), where delta is outNN and x is the loaded bNN. P0-P3 are saved in preds and restored at the end.
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+11:-:-:-:1      FSETP.GT.AND P0, PT, b00, RZ, PT;
+02:-:-:-:1      FSETP.GT.AND P1, PT, b04, RZ, PT;
+24:-:-:-:1      FSETP.GT.AND P2, PT, b08, RZ, PT;
+08:-:-:-:1      FSETP.GT.AND P3, PT, b12, RZ, PT;
+--:-:-:-:1 @!P0 MOV out00, RZ;
+--:-:-:-:1 @!P1 MOV out04, RZ;
+--:-:-:-:1 @!P2 MOV out08, RZ;
+--:-:-:-:1 @!P3 MOV out12, RZ;
+--:-:-:Y:d      R2P PR, preds, 0x0f;
+
+    } : '';
++]
+[+
+    our $bprelu; return $bprelu ? q{
+// bprelu: delta *= ((x > 0) + slope * (x < 0)), with slope = param_beta, delta = outNN and x = the loaded bNN
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+11:-:-:-:1      FSETP.GT.AND P0, PT, b00, RZ, PT;
+02:-:-:-:1      FSETP.GT.AND P1, PT, b04, RZ, PT;
+24:-:-:-:1      FSETP.GT.AND P2, PT, b08, RZ, PT;
+08:-:-:-:1      FSETP.GT.AND P3, PT, b12, RZ, PT;
+--:-:-:-:1      SEL x00, one, RZ, P0; // xNN = (bNN > 0) ? one : 0
+--:-:-:-:1      SEL x04, one, RZ, P1;
+--:-:-:-:1      SEL x08, one, RZ, P2;
+--:-:-:-:1      SEL x12, one, RZ, P3;
+--:-:-:-:1      FSETP.LT.AND P0, PT, b00, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P1, PT, b04, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, b08, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P3, PT, b12, RZ, PT;
+--:-:-:-:1      SEL b00, one, RZ, P0; // bNN = (bNN < 0) ? one : 0
+--:-:-:-:1      SEL b04, one, RZ, P1;
+--:-:-:-:1      SEL b08, one, RZ, P2;
+--:-:-:-:1      SEL b12, one, RZ, P3;
+--:-:-:-:1      R2P PR, preds, 0x0f; // restore the P0-P3 values saved above
+--:-:-:-:1      FFMA b00, b00, param_beta, x00; // bNN = bNN * param_beta + xNN
+--:-:-:-:1      FFMA b04, b04, param_beta, x04;
+--:-:-:-:1      FFMA b08, b08, param_beta, x08;
+--:-:-:-:1      FFMA b12, b12, param_beta, x12;
+--:-:-:-:1      FMUL out00, out00, b00; // scale outNN by the factor built in bNN
+--:-:-:-:1      FMUL out04, out04, b04;
+--:-:-:-:1      FMUL out08, out08, b08;
+--:-:-:-:2      FMUL out12, out12, b12;
+    } : '';
++]
+[+
+    our $bsum; return $bsum ? q{
+20:-:-:-:1      SEL sum00, out00, RZ, P0; // sumNN = outNN when the matching predicate is set, else 0
+--:-:-:-:1      SEL sum04, out04, RZ, P1;
+--:-:-:-:1      SEL sum08, out08, RZ, P2;
+--:-:-:-:1      SEL sum12, out12, RZ, P3;
+    } : '';
++]
+</SCHEDULE_BLOCK>
+[+
+    our $convert_out; return $convert_out ? qq{
+--:-:1:-:1  \@P0 $convert_out out00, out00; // in-place conversion of out00 when P0 is set, signals barrier 1
+--:-:2:-:1  \@P1 $convert_out out04, out04; // signals barrier 2
+--:-:3:-:1  \@P2 $convert_out out08, out08; // signals barrier 3
+--:-:4:-:1  \@P3 $convert_out out12, out12; // signals barrier 4
+    } : '';
++]
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LEA      Out00_0.CC, offset, param_O[0],     [+ dshift() +]; // Out00 = param_O + (offset << dshift), low word
+--:-:-:-:1      LEA.HI.X Out00_1,    offset, param_O[1], RZ, [+ dshift() +];
+--:-:-:-:1      IADD     Out04_0.CC, Out00_0, MPQN4; // each later address is the previous one plus MPQN4
+--:-:-:-:1      IADD.X   Out04_1,    Out00_1, RZ;
+--:-:-:-:1      IADD     Out08_0.CC, Out04_0, MPQN4;
+--:-:-:-:1      IADD.X   Out08_1,    Out04_1, RZ;
+--:-:-:-:1      IADD     Out12_0.CC, Out08_0, MPQN4;
+--:-:-:-:1      IADD.X   Out12_1,    Out08_1, RZ;
+// Predicated stores. Wait masks 01/02/04/08 name barriers 1-4, which the convert_out template sets when it is emitted.
+01:-:-:-:1  @P0 STG.E.CG.[+ dtype() +] [Out00_0], out00;
+02:-:-:-:1  @P1 STG.E.CG.[+ dtype() +] [Out04_0], out04;
+04:-:-:-:1  @P2 STG.E.CG.[+ dtype() +] [Out08_0], out08;
+08:1:-:-:1  @P3 STG.E.CG.[+ dtype() +] [Out12_0], out12;
+</SCHEDULE_BLOCK>
+
+[+
+    our $bsum; return $bsum ? q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      XMAD.LO2C bsum00, k00, param_gridMPQN, bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum04, k04, param_gridMPQN, bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum08, k08, param_gridMPQN, bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum12, k12, param_gridMPQN, bsum_offset;
+--:-:-:-:1      LEA      Sum00_0.CC, bsum00, param_Sum[0],     2; // SumNN = param_Sum + bsumNN*4, low word then high word
+--:-:-:-:1      LEA.HI.X Sum00_1,    bsum00, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum04_0.CC, bsum04, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum04_1,    bsum04, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum08_0.CC, bsum08, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum08_1,    bsum08, param_Sum[1], RZ, 2;
+--:-:-:-:1      LEA      Sum12_0.CC, bsum12, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum12_1,    bsum12, param_Sum[1], RZ, 2;
+--:-:-:-:1      ISETP.LT.AND P0, PT, k00, param_K, P6; // k00 < K && tid31 == 0
+--:-:-:-:1      ISETP.LT.AND P1, PT, k04, param_K, P6; // k04 < K && tid31 == 0
+--:-:-:-:1      ISETP.LT.AND P2, PT, k08, param_K, P6; // k08 < K && tid31 == 0
+--:-:-:-:1      ISETP.LT.AND P3, PT, k12, param_K, P6; // k12 < K && tid31 == 0
+<ORDERED>
+--:-:-:-:1      SHFL.BFLY PT, x00, sum00,  1, 0x1f; // butterfly step 1 of 5: exchange with the lane at distance 1
+--:-:5:-:1      SHFL.BFLY PT, x04, sum04,  1, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, x08, sum08,  1, 0x1f;
+--:-:6:-:1      SHFL.BFLY PT, x12, sum12,  1, 0x1f;
+10:-:-:-:1      FADD   sum00, x00, sum00; // add the exchanged value, wait mask 10 = barrier 5
+--:-:-:-:1      FADD   sum04, x04, sum04;
+20:-:-:-:1      FADD   sum08, x08, sum08; // wait mask 20 = barrier 6
+--:-:-:-:1      FADD   sum12, x12, sum12;
+--:-:-:-:1      SHFL.BFLY PT, x00, sum00,  2, 0x1f; // step 2: lane distance 2
+--:-:5:-:1      SHFL.BFLY PT, x04, sum04,  2, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, x08, sum08,  2, 0x1f;
+--:-:6:-:1      SHFL.BFLY PT, x12, sum12,  2, 0x1f;
+10:-:-:-:1      FADD   sum00, x00, sum00;
+--:-:-:-:1      FADD   sum04, x04, sum04;
+20:-:-:-:1      FADD   sum08, x08, sum08;
+--:-:-:-:1      FADD   sum12, x12, sum12;
+--:-:-:-:1      SHFL.BFLY PT, x00, sum00,  4, 0x1f; // step 3: lane distance 4
+--:-:5:-:1      SHFL.BFLY PT, x04, sum04,  4, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, x08, sum08,  4, 0x1f;
+--:-:6:-:1      SHFL.BFLY PT, x12, sum12,  4, 0x1f;
+10:-:-:-:1      FADD   sum00, x00, sum00;
+--:-:-:-:1      FADD   sum04, x04, sum04;
+20:-:-:-:1      FADD   sum08, x08, sum08;
+--:-:-:-:1      FADD   sum12, x12, sum12;
+--:-:-:-:1      SHFL.BFLY PT, x00, sum00,  8, 0x1f; // step 4: lane distance 8
+--:-:5:-:1      SHFL.BFLY PT, x04, sum04,  8, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, x08, sum08,  8, 0x1f;
+--:-:6:-:1      SHFL.BFLY PT, x12, sum12,  8, 0x1f;
+10:-:-:-:1      FADD   sum00, x00, sum00;
+--:-:-:-:1      FADD   sum04, x04, sum04;
+20:-:-:-:1      FADD   sum08, x08, sum08;
+--:-:-:-:1      FADD   sum12, x12, sum12;
+--:-:-:-:1      SHFL.BFLY PT, x00, sum00, 16, 0x1f; // step 5: lane distance 16, after which every lane holds the total
+--:-:5:-:1      SHFL.BFLY PT, x04, sum04, 16, 0x1f;
+--:-:-:-:1      SHFL.BFLY PT, x08, sum08, 16, 0x1f;
+--:-:6:-:1      SHFL.BFLY PT, x12, sum12, 16, 0x1f;
+10:-:-:-:1      FADD   sum00, x00, sum00;
+--:-:-:-:1      FADD   sum04, x04, sum04;
+20:-:-:-:1      FADD   sum08, x08, sum08;
+--:-:-:-:0      FADD   sum12, x12, sum12;
+</ORDERED>
+</SCHEDULE_BLOCK>
+--:-:-:-:1  @P0 STG.E.CG [Sum00_0], sum00; // only lanes passing the k < K && tid31 == 0 test store the sum
+--:-:-:-:1  @P1 STG.E.CG [Sum04_0], sum04;
+--:-:-:-:1  @P2 STG.E.CG [Sum08_0], sum08;
+--:6:-:-:1  @P3 STG.E.CG [Sum12_0], sum12;
+    } : '';
++]
+
+--:-:-:-:1      IADD k00, k00, 16; // advance all four k indices by 16
+--:-:-:-:1      IADD k04, k04, 16;
+--:-:-:-:1      IADD k08, k08, 16;
+--:-:-:-:1      IADD k12, k12, 16;
+--:-:-:-:0      IADD offset, offset, MPQN16; // advance the output offset by MPQN16
+
+--:-:-:-:5      RET; // return from this output routine
\ No newline at end of file
diff --git a/Kernel/Convolution/Pascal/xconv_winograd_2x2_3x3_32x32.sass b/Kernel/Convolution/Pascal/xconv_winograd_2x2_3x3_32x32.sass
new file mode 100644
index 0000000..a8a1ef4
--- /dev/null
+++ b/Kernel/Convolution/Pascal/xconv_winograd_2x2_3x3_32x32.sass
@@ -0,0 +1,1568 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+our $type;   # element type selector declared outside this block; 'h' selects the 16-bit variants below
+our $dtype  = $type eq 'h' ?  'U16' :  '32';   # width suffix appended to LDG/STG opcodes
+our $dshift = $type eq 'h' ?    '1' :   '2';   # log2 of the element size, used as the LEA shift
+our $dsize  = $type eq 'h' ?    '2' :   '4';   # element size in bytes, used to scale address offsets
+our $vsize  = $type eq 'h' ?   '64' : '128';   # width suffix for the four-register F0..F3 loads
+# Accessor subs so plain instruction lines can splice these values in through inline template calls.
+sub dtype  { return $dtype;  }
+sub dsize  { return $dsize;  }
+sub dshift { return $dshift; }
+# Conversion opcodes; the empty string is false, which disables the templates that test these.
+our $convert_in  = $type eq 'h' ? 'F2F.F32.F16' : '';
+our $convert_out = $type eq 'h' ? 'F2F.F16.F32' : '';
+-]
+# Kernel parameters as 4-byte slots in constant bank 0, starting at 0x140; the S, X, O, I and F entries take two slots each ([0] and [1]).
+<CONSTANT_MAPPING>
+    param_S[0]         : c[0x0][0x140]
+    param_S[1]         : c[0x0][0x144]
+    param_X[0]         : c[0x0][0x148]
+    param_X[1]         : c[0x0][0x14c]
+    param_O[0]         : c[0x0][0x150]
+    param_O[1]         : c[0x0][0x154]
+    param_I[0]         : c[0x0][0x158]
+    param_I[1]         : c[0x0][0x15c]
+    param_F[0]         : c[0x0][0x160]
+    param_F[1]         : c[0x0][0x164]
+    param_alpha        : c[0x0][0x168]
+    param_beta         : c[0x0][0x16c]
+    param_flags        : c[0x0][0x170]
+    param_C            : c[0x0][0x174]
+    param_H            : c[0x0][0x178]
+    param_P            : c[0x0][0x17c]
+    param_pad_h        : c[0x0][0x180]
+    param_pad_w        : c[0x0][0x184]
+    param_HWN          : c[0x0][0x188]
+    param_WN           : c[0x0][0x18c]
+    param_PQN          : c[0x0][0x190]
+    param_QN           : c[0x0][0x194]
+    param_Qnk          : c[0x0][0x198]
+    param_nk           : c[0x0][0x19c]
+    param_n            : c[0x0][0x1a0]
+    param_k            : c[0x0][0x1a4]
+    param_magic_Qnk    : c[0x0][0x1a8]
+    param_shift_Qnk    : c[0x0][0x1ac]
+    param_magic_nk     : c[0x0][0x1b0]
+    param_shift_nk     : c[0x0][0x1b4]
+    param_magic_k      : c[0x0][0x1b8]
+    param_shift_k      : c[0x0][0x1bc]
+    param_RSK          : c[0x0][0x1c0]
+    param_4RSKp        : c[0x0][0x1c4]
+    param_4HWNp        : c[0x0][0x1c8]
+    param_gridK        : c[0x0][0x1cc]
+    param_gridP2       : c[0x0][0x1d0]
+    param_gridQ        : c[0x0][0x1d4]
+    param_gridN        : c[0x0][0x1d8]
+    param_gridQN       : c[0x0][0x1dc]
+    param_gridPQN      : c[0x0][0x1e0]
+    param_superP       : c[0x0][0x1e4]
+    param_superQ       : c[0x0][0x1e8]
+    param_superN       : c[0x0][0x1ec]
+    param_shiftP       : c[0x0][0x1f0]
+    param_shiftQ       : c[0x0][0x1f4]
+    param_shiftN       : c[0x0][0x1f8]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // Symbolic register names. The same numeric ranges recur on several lines, so many of the names below share registers.
+       0-63 : czero<00-63>
+    // Accumulators cx<0-7>y<0-7>: a permuted naming of the same registers 0-63 that czero00-63 covers.
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+      64-79 : j0Ix<0-7>, j0Fy<0-7>
+      80-95 : j1Ix<0-7>, j1Fy<0-7>
+    // NOTE(review): ':' appears to bind names to registers in listed order and '~' to let the assembler pick within the range - confirm in the assembler docs.
+      64-79 ~ tid, idx_P, idx_Q, idx_N, idx_K, idx_n, idx_k, tid16, tid31, c, addr_zero, partialC
+     80-119 ~ tid1, idx_PQnk, idx_Qnk, idx_nk, magic_Qnk, neg_Qnk, neg_nk, neg_k, div<1-3>, idx_P2, idx_Q2, z<1-2>, negOne, super_P, super_Q
+      80-95 ~ super_N, y, x, ti, ti_sign, x<1-3>, mask_x, preds1, offsetIC
+      80-95 ~ tf, tid31_4, offsetFC
+
+    120-121 : track<0-1>
+    122-127 ~ writeS, readFs, readIs, C, preds, idx_nkpq
+
+      80-95 ~ p, q, n, tid32, tid64, tid_16, tid_1, q2, p2, to, superP, superQ, superN
+      96-99 : Out<0-1>, Sum<0-1>
+    100-121 ~ alpha, one, writeCs, readCs, k, PQN15, tid_31, out_offset, bsum_offset
+
+      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1
+
+      64-79 : m0<0-3>, m1<0-3>, m2<0-3>, m3<0-3>
+      80-95 : t0<0-1>, t1<0-1>, t2<0-1>, t3<0-1>
+
+    3,2,11,10,19,18,27,26,1,0,9,8,17,16,25,24 ~ b<00|01|10|11>, x<00|01|10|11>, sum<0|1>, s0<0-1>, s1<0-1>
+
+        // Image registers (registers assigned to avoid bank conflicts); 'N = a, b, c' gives one register several names
+         96 = i00
+         97 = i01
+         98 = i02
+         99 = i03
+        100 = i30
+        101 = i31
+        102 = i32
+        103 = i33
+        105 = i13
+        104 = i12
+        107 = i11
+        106 = i10
+        108 = i23, TI23, I23
+        109 = i22, TI22
+        110 = i21, TI21
+        111 = i20, TI20, I20
+        113 = TI00, I00, TI10, I10, I21, I01
+        112 = TI01, I11
+        115 = TI02, I12
+        114 = TI03, I03, TI11, I31
+        116 = TI30, I30, TI12, I32
+        117 = TI31
+        118 = TI32
+        119 = TI33, I33, TI13, I13, I22, I02
+    // Filter registers: one F0..F3 vector range when the FX flag is set, otherwise individually aliased scalars
+[+
+    our $FX;
+    return $FX ? q{
+    104-119 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3>
+    } : q{
+         96 = f00, TF00, F00
+         97 = f01, TF01
+         98 = f02, TF02, F03
+         99 = f10
+        100 = f11
+        101 = f12
+        102 = f20, TF30, F30
+        103 = f21, TF31
+        104 = f22, TF32, F33
+        105 = tb3, F32
+        106 = tb0, F02
+        107 = ta2, TF22, F23
+        108 = ta0, TF20, F20
+        109 = ta1, TF21
+        110 = F01
+        111 = F31
+        112 = TF10, F10
+        113 = TF11
+        114 = TF12, F13
+        115 = tb1, F12
+        116 = tb2, F22
+        117 = F11
+        118 = F21
+    };
++]
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,      SR_TID.X; // thread index x, signals barrier 1
+--:-:2:-:1      S2R idx_PQnk, SR_CTAID.X; // block index x, signals barrier 2
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 128, PT; // P0 = tid >= 128, selects the FILTER_SETUP branch at the end of this block
+
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+
+// c = (tid & 127) / 32
+--:-:-:-:1      BFE.U32 c, tid, 0x205; // 2 bits at position 5
+// addr_zero = c*2048 + tid31*16, plus a further constant offset when P0 is set
+--:-:-:-:1      SHL addr_zero, tid31, 4;
+--:-:-:-:1      ISCADD addr_zero, c, addr_zero, 11;
+--:-:-:-:1  @P0 IADD addr_zero, addr_zero, 4x<512*4>;
+// Store RZ to four 16-byte slots based at addr_zero, then load from addr_zero into czero00..czero63
+--:-:-:-:1      STS.128 [addr_zero + 4x<00*4>], RZ;
+--:-:-:-:1      STS.128 [addr_zero + 4x<32*4>], RZ;
+--:-:-:-:1      STS.128 [addr_zero + 4x<64*4>], RZ;
+--:-:-:-:1      STS.128 [addr_zero + 4x<96*4>], RZ;
+
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+
+// idx_P2 = idx_PQnk / Qnk, by magic-number multiply then shift (shift only when magic_Qnk == 1)
+--:-:-:-:1      MOV  magic_Qnk, param_magic_Qnk;
+--:-:-:-:1      ISETP.NE.AND P1, PT,  magic_Qnk, 1, PT;
+02:-:-:-:1  @P1 XMAD     div1, idx_PQnk,    magic_Qnk,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, idx_PQnk,    magic_Qnk.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, idx_PQnk.H1, magic_Qnk.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, idx_PQnk.H1, magic_Qnk,    div1;
+--:-:-:-:1  @P1 IADD3.RS idx_P2, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  idx_P2, idx_P2,   param_shift_Qnk;
+--:-:-:-:1 @!P1 SHR.U32  idx_P2, idx_PQnk, param_shift_Qnk;
+
+// idx_Qnk = idx_PQnk % Qnk, computed as idx_PQnk - idx_P2 * Qnk
+--:-:-:-:1      IADD neg_Qnk, RZ, -param_Qnk;
+--:-:-:-:1      XMAD.LO2 idx_Qnk, neg_Qnk, idx_P2, idx_PQnk;
+
+// idx_Q2  = idx_Qnk / nk
+--:-:-:-:1      XMAD.LO2C idx_Q2, idx_Qnk, param_magic_nk, RZ;
+--:-:-:-:1      SHR.U32   idx_Q2, idx_Q2,  param_shift_nk;
+// idx_nk = idx_Qnk % nk
+--:-:-:-:1      IADD neg_nk, RZ, -param_nk;
+--:-:-:-:1      XMAD.S16.U16  idx_nk, neg_nk, idx_Q2, idx_Qnk;
+
+// idx_n = idx_nk / k
+--:-:-:-:1      XMAD    idx_n,  idx_nk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32 idx_n,  idx_n,  param_shift_k;
+// idx_k = idx_nk % k
+--:-:-:-:1      IADD neg_k, RZ, -param_k;
+--:-:-:-:1      XMAD.S16.U16 idx_k, neg_k, idx_n, idx_nk;
+
+// Implement a square wave block id remapping (for all but last row (if odd number of rows))
+// idx_P = idx_P2 * 2
+// idx_Q = idx_Q2
+// if idx_P2 != gridP2:
+//     idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1)
+//     idx_Q  = idx_Q2 >> 1
+--:-:-:-:1      ISETP.NE.AND P1, PT, idx_P2, param_gridP2, PT;
+--:-:-:-:1      SHL idx_P, idx_P2, 1;
+--:-:-:-:1  @P1 LOP.AND z1, idx_Q2, 1;
+--:-:-:-:1  @P1 BFE.U32 z2, idx_Q2, 0x101; // 1 bit at position 1
+--:-:-:-:1  @P1 LOP.XOR z1, z1, z2;
+--:-:-:-:1  @P1 IADD idx_P, idx_P, z1;
+--:-:-:-:1  @P1 SHR.U32 idx_Q, idx_Q2, 1;
+--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2;
+
+// Scan backwards on odd rows
+// if idx_P2 & 1:
+//     idx_Q = gridQ - idx_Q - 1
+--:-:-:-:1      LOP.AND.NZ P2, RZ, idx_P2, 1;
+--:-:-:-:1      MOV negOne, -1;
+--:-:-:-:1  @P2 IADD3 idx_Q, -idx_Q, param_gridQ, negOne;
+// Pack the block coordinates into idx_nkpq: idx_Q as the base, then bit fields for idx_P, idx_k and idx_n
+--:-:-:-:1      BFI idx_nkpq, idx_P, 0x0c0c, idx_Q;
+--:-:-:-:1      BFI idx_nkpq, idx_k, 0x0418, idx_nkpq;
+--:-:-:-:1      BFI idx_nkpq, idx_n, 0x041c, idx_nkpq;
+
+// idx_P = idx_P << shiftP
+// idx_Q = idx_Q << shiftQ
+--:-:-:-:1      SHL idx_P, idx_P, param_shiftP;
+--:-:-:-:1      SHL idx_Q, idx_Q, param_shiftQ;
+
+// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp
+--:-:-:-:1      BFE.U32 super_P, tid, param_superP;
+--:-:-:-:1      BFE.U32 super_Q, tid, param_superQ;
+--:-:-:-:1      ISCADD idx_P, super_P,  idx_P, 1; // idx_P += super_P * 2
+--:-:-:-:1      ISCADD idx_Q, super_Q,  idx_Q, 1; // idx_Q += super_Q * 2
+
+// If this value is not a multiple of 4 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 4 then make a full 4 line fetch.
+--:-:-:-:1      MOV C, param_C;
+--:-:-:-:1      LOP.AND.Z P6, partialC, C, 3; // partialC = C & 3, P6 = (partialC == 0)
+--:-:-:-:1 @!P6 IADD3 C, C, 4, -partialC; // round C up to the next multiple of 4
+--:-:-:-:1  @P6 MOV partialC, 4;
+// P6 = c < partialC
+--:-:-:-:1      ISETP.LT.AND P6, PT, c, partialC, PT;
+
+[+
+    our $FX; return $FX ? '' : q{
+// writeS = (c*512 + (tid & 31))*4 plus a constant base offset
+--:-:-:-:1      ISCADD writeS, c, tid31, 9;
+--:-:-:-:1      ISCADD writeS, writeS, 4x<512*4*2>, 2;
+    }
++]
+
+// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readFs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid16,  tid,   -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,  1;
+// readIs is then shifted left by 4, giving a multiple of 16
+--:-:-:-:1      BFE.U32  readIs, tid,    0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs, readIs, tid16;
+--:-:-:-:1      SHL      readIs, readIs, 4;
+// readFs: LUT 0xfe is a three-way OR; the result is then scaled by 16 and a constant offset is added
+--:-:-:-:1      LOP.AND  tid1,   tid,    1;
+--:-:-:-:1      LOP.AND  readFs, tid,    8;
+--:-:-:-:1      SHR.U32  readFs, readFs, 2;
+--:-:-:-:1      LOP3.LUT readFs, readFs, tid16, tid1, 0xfe;
+--:-:-:-:1      ISCADD   readFs, readFs, 4x<512*4>, 4;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P0 BRA.U FILTER_SETUP; // threads with tid >= 128 take the filter path, the rest fall through
+
+--:-:1:-:2      S2R idx_N, SR_CTAID.Z;
+// Image-side setup: reached by threads with tid < 128, which did not branch to FILTER_SETUP.
+
+<SCHEDULE_BLOCK>
+
+// writeS = (c*512 + (tid & 31))*4 plus a constant base offset (emitted here only when the FX flag is set)
+[+
+    our $FX;
+    return $FX ? q{
+--:-:-:-:1      ISCADD writeS, c, tid31, 9;
+--:-:-:-:1      ISCADD writeS, writeS, 4x<512*4*2>, 2;
+    } : '';
++]
+
+--:-:-:-:1      LOP.AND super_N, tid, param_superN;
+// idx_N = ((blockIdx.z * param_n + idx_n) << shiftN) + super_N
+01:-:-:-:1      XMAD idx_N, idx_N, param_n, idx_n;
+--:-:-:-:1      SHL  idx_N, idx_N, param_shiftN;
+--:-:-:-:1      IADD idx_N, idx_N, super_N;
+
+// P5 = n < N
+--:-:-:-:1      ISETP.LT.AND P5, PT, idx_N, 1x<$N>, PT;
+
+// Subtract off the padding
+--:-:-:-:1      IADD y, idx_P, -param_pad_h;
+--:-:-:-:1      IADD x, idx_Q, -param_pad_w;
+
+// a0 = n + x*N + y*XN + c*YXN
+--:-:-:-:1      XMAD.S16.U16      ti, x,  1x<$N>,    idx_N;
+--:-:-:-:1      XMAD.S16.U16.LO2C ti, y,  param_WN,  ti;
+--:-:-:-:1      XMAD.S16.U16.LO2C ti, c,  param_HWN, ti;
+--:-:-:-:1      ISET.LT.AND ti_sign, ti, RZ, PT; // ti_sign flags a negative ti, presumably as an all-ones word used for sign extension
+--:-:-:-:1      LEA    track0.CC, ti,      param_I[0], [+ dshift() +]; // track = param_I + (ti << dshift), low word
+--:-:-:-:1      IADD.X track1,    ti_sign, param_I[1]; // high word, adds the carry
+
+--:-:-:-:1      IADD x1, x, 1;
+--:-:-:-:1      IADD x2, x, 2;
+--:-:-:-:1      IADD x3, x, 3;
+// mask_x bits 0-3: whether x, x+1, x+2, x+3 each lie in the range 0 <= value < W
+--:-:-:-:1      ISETP.LT.AND P0, PT, x,  1x<$W>, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, 1x<$W>, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, 1x<$W>, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, 1x<$W>, PT;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x0f;
+// x1..x3 are reused for y+1..y+3; P0-P3 = row in range 0 <= value < H, and n < N (P5)
+--:-:-:-:1      IADD x1, y, 1;
+--:-:-:-:1      IADD x2, y, 2;
+--:-:-:-:1      IADD x3, y, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, y,  param_H, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_H, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_H, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_H, P5;
+--:-:-:-:1      ISETP.GE.AND P0, PT, y,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+// preds packs four 4-bit copies of mask_x, one per row at bits 0, 4, 8 and 12; a row out of range contributes zeros
+--:-:-:-:1      SEL preds, mask_x, RZ, P0;
+--:-:-:-:1  @P1 BFI preds, mask_x, 0x404, preds;
+--:-:-:-:1  @P2 BFI preds, mask_x, 0x408, preds;
+--:-:-:-:1  @P3 BFI preds, mask_x, 0x40c, preds;
+
+// For partial C on first load
+--:-:-:-:1      SEL preds1, preds, RZ, P6;
+
+// offsetIC = partialC*YXN
+--:-:-:-:1      XMAD.LO2C offsetIC, partialC, param_HWN, RZ;
+
+--:-:-:-:1      R2P PR, preds1, 0x0f; // P0-P3 = row 0 mask
+--:-:-:-:1      SHF.R.U64 preds1, preds1, 12, preds1; // both halves are preds1, so this rotates right by 12: row 3 mask moves to bits 0-3
+
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>];
+--:-:-:-:1 @!P0 MOV i00, RZ;
+--:-:-:-:1 @!P1 MOV i01, RZ;
+--:-:-:-:1 @!P2 MOV i02, RZ;
+--:-:-:-:1 @!P3 MOV i03, RZ;
+
+--:-:-:-:1      R2P PR, preds1, 0x0f; // P0-P3 = row 3 mask
+--:-:-:-:1      SHF.L.U64 preds1, preds1, 8, preds1; // rotate left by 8: row 1 mask moves to bits 0-3
+
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>];
+--:-:-:-:1 @!P0 MOV i30, RZ;
+--:-:-:-:1 @!P1 MOV i31, RZ;
+--:-:-:-:1 @!P2 MOV i32, RZ;
+--:-:-:-:1 @!P3 MOV i33, RZ;
+
+--:-:-:-:1      R2P PR, preds1, 0x0f; // P0-P3 = row 1 mask
+--:-:-:-:1      SHF.R.U64 preds1, preds1, 4, preds1; // rotate right by 4: row 2 mask moves to bits 0-3
+
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>];
+--:-:-:-:1 @!P0 MOV i10, RZ;
+--:-:-:-:1 @!P1 MOV i11, RZ;
+--:-:-:-:1 @!P2 MOV i12, RZ;
+--:-:-:-:1 @!P3 MOV i13, RZ;
+
+--:-:-:-:1      R2P PR, preds1, 0x0f; // P0-P3 = row 2 mask
+
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() +]x<2*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>];
+--:6:2:-:1  @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>];
+--:-:-:-:1 @!P0 MOV i20, RZ;
+--:-:-:-:1 @!P1 MOV i21, RZ;
+--:-:-:-:1 @!P2 MOV i22, RZ;
+--:-:-:-:1 @!P3 MOV i23, RZ;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+// Load the j0 image and filter operands for the first loop iteration
+--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*512 + 16>];
+--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*512 + 16>];
+// Advance track by offsetIC elements; wait mask 20 = barrier 6, set on the last LDG above, before track is modified
+20:-:-:-:6      LEA      track0.CC, offsetIC, track0,     [+ dshift() +];
+--:-:-:-:0      LEA.HI.X track1,    offsetIC, track1, RZ, [+ dshift() +];
+
+--:-:-:-:5      BRA.U IMAGE_LOOP;
+
+
+
+FILTER_SETUP:
+// Filter-side setup: entered by threads with tid >= 128 through the predicated branch above.
+--:-:1:-:2      S2R idx_K, SR_CTAID.Y;
+
+<SCHEDULE_BLOCK>
+01:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+[+
+    our ($dtype, $dshift, $FX, $K, $vsize, $dsize);   # values declared outside this block
+    return $FX ? qq{
+
+// writeS = (c*512 + (tid & 31)*4)*4
+--:-:-:-:1      SHL writeS, tid31, 4;
+--:-:-:-:1      ISCADD writeS, c, writeS, 11;
+--:-:-:-:1      LOP.XOR writeS, writeS, 4x<512*4*2>;
+
+// (kBlks,C,4,4,32)
+// offset = idx_K*C*512 + c*512 + tid31*4;
+--:-:-:-:1      SHL    tid31_4,  tid31, 2;
+--:-:-:-:1      XMAD   tf, idx_K, param_C, c;
+--:-:-:-:1      ISCADD tf, tf, tid31_4, 9;
+--:-:-:-:1      LEA      track0.CC, tf, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1,    tf, param_F[1], RZ, $dshift;
+
+// offsetFC = partialC*512
+--:-:-:-:1      SHL  offsetFC, partialC, 9;
+// When P6 is clear, F0..F3 are filled from addr_zero instead of from global memory
+--:-:-:-:1 \@!P6 LDS.U.$vsize F0, [addr_zero];
+--:-:-:-:1 \@!P6 LDS.U.$vsize F1, [addr_zero];
+--:-:-:-:1 \@!P6 LDS.U.$vsize F2, [addr_zero];
+--:-:-:-:1 \@!P6 LDS.U.$vsize F3, [addr_zero];
+<ORDERED>
+--:-:2:-:1  \@P6 LDG.E.CG.$vsize F0, [track + 4x<00 * $dsize>];
+--:-:3:-:1  \@P6 LDG.E.CG.$vsize F1, [track + 4x<32 * $dsize>];
+--:-:4:-:1  \@P6 LDG.E.CG.$vsize F2, [track + 4x<64 * $dsize>];
+--:6:5:-:1  \@P6 LDG.E.CG.$vsize F3, [track + 4x<96 * $dsize>];
+</ORDERED>
+
+    } : qq{
+// k = idx_K*32 + tid & 31
+--:-:-:-:1      ISCADD  idx_K, idx_K, tid31,  5;
+--:-:-:-:1      ISETP.LT.AND P0, PT, idx_K, 1x<$K>, P6; // P0 = k < K and P6
+--:-:-:-:1      ISETP.LT.AND P1, PT, idx_K, 1x<$K>, PT; // P1 = k < K
+
+// offsetFC = partialC * RSK
+--:-:-:-:1      XMAD.LO2C offsetFC, partialC, param_RSK, RZ;
+
+// a0 = k + c*RSK
+--:-:-:-:1      XMAD.LO2C tf, c, param_RSK, idx_K;
+
+--:-:-:-:1      LEA      track0.CC, tf, param_F[0],     $dshift;
+--:-:-:-:1      LEA.HI.X track1,    tf, param_F[1], RZ, $dshift;
+// Load the nine filter values when P0 is set, otherwise zero them. Issued in row order 0, 2, 1.
+--:-:-:-:1 \@!P0 MOV f00, RZ;
+--:-:-:-:1 \@!P0 MOV f01, RZ;
+--:-:-:-:1 \@!P0 MOV f02, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];
+--:-:-:-:1 \@!P0 MOV f20, RZ;
+--:-:-:-:1 \@!P0 MOV f21, RZ;
+--:-:-:-:1 \@!P0 MOV f22, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>];
+--:-:-:-:1 \@!P0 MOV f10, RZ;
+--:-:-:-:1 \@!P0 MOV f11, RZ;
+--:-:-:-:1 \@!P0 MOV f12, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];
+--:6:2:-:1  \@P0 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];
+    };
++]
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+// Load the j0 image and filter operands for the first loop iteration
+--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*512 + 16>];
+--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*512 + 16>];
+// Advance track by offsetFC elements; wait mask 20 = barrier 6, set on the last LDG above, before track is modified
+20:-:-:-:6      LEA      track0.CC, offsetFC, track0,     [+ dshift() +];
+--:-:-:-:0      LEA.HI.X track1,    offsetFC, track1, RZ, [+ dshift() +];
+
+--:-:-:-:5      BRA.U FILTER_LOOP;
+
+
+IMAGE_LOOP:
+--:-:-:-:1      ISETP.GT.AND P6, PT, C, 4, PT;
+[+
+    our ($dtype, $dsize, $convert_in, $W, $N);
+    my %insert = (
+
+        j0c1  => "--:-:-:-:1      ISETP.GT.AND P5, PT, C, RZ, PT;\n" .
+                 "--:-:-:-:1      IADD C, C, -4;\n",
+
+
+        j0c14 => "--:-:-:-:1      R2P PR, preds, 0x0f;\n",
+        j0c16 => "--:-:-:-:1  \@P6 SHF.R.U64 preds, preds, 12, preds;\n",
+
+        $convert_in ? (
+            j0c3  => "02:-:-:-:1      $convert_in i00, i00;\n",
+            j0c5  => "--:-:-:-:1      $convert_in i01, i01;\n",
+            j0c7  => "--:-:-:-:1      $convert_in i02, i02;\n",
+            j0c9  => "--:-:-:-:0 \@!P6 MOV preds, RZ;\n" .
+                     "--:-:-:-:1      $convert_in i03, i03;\n",
+
+            j0c11 => "--:-:-:-:1      $convert_in i20, i20;\n",
+            j0c13 => "--:-:-:-:1      $convert_in i21, i21;\n",
+            j0c15 => "--:-:-:-:1      $convert_in i22, i22;\n",
+            j0c17 => "--:-:2:-:1      $convert_in i23, i23;\n",
+
+            j0c19 => "--:-:-:-:1      $convert_in i10, i10;\n",
+            j0c21 => "--:-:-:-:1      $convert_in i11, i11;\n",
+            j0c23 => "--:-:-:-:1      $convert_in i12, i12;\n",
+            j0c25 => "--:-:-:-:1      $convert_in i13, i13;\n",
+
+            j0c27 => "--:-:-:-:1      $convert_in i30, i30;\n",
+            j0c29 => "--:-:-:-:1      $convert_in i31, i31;\n",
+            j0c31 => "--:-:-:-:1      $convert_in i32, i32;\n",
+            j0c33 => "--:-:3:-:1      $convert_in i33, i33;\n",
+        ) : (
+            j0c9  => "--:-:-:-:1 \@!P6 MOV preds, RZ;\n",
+        ),
+
+        j0c32 => "02:-:-:-:1  \@P5 FADD TI00, i00, -i20;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI01, i01, -i21;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI02, i02, -i22;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI03, i03, -i23;\n",
+
+        j0c35 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i00, [track + ${dsize}x<0*$W*$N + 0*$N>];\n",
+        j0c37 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i01, [track + ${dsize}x<0*$W*$N + 1*$N>];\n",
+        j0c39 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i02, [track + ${dsize}x<0*$W*$N + 2*$N>];\n",
+        j0c41 => "--:-:-:-:1  \@P3 LDG.E.CI.$dtype i03, [track + ${dsize}x<0*$W*$N + 3*$N>];\n",
+        j0c43 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i00, RZ;\n",
+        j0c45 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i01, RZ;\n",
+        j0c47 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i02, RZ;\n",
+        j0c49 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i03, RZ;\n" .
+                 "--:-:-:-:1      R2P PR, preds, 0x0f;\n",
+
+        j0c50 => "--:-:-:-:1  \@P6 SHF.L.U64 preds, preds, 8, preds;\n",
+
+        j0c55 => "04:-:-:-:1  \@P5 FADD TI30, i10, -i30;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI31, i11, -i31;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI32, i12, -i32;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI33, i13, -i33;\n",
+
+        j0c57 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i30, [track + ${dsize}x<3*$W*$N + 0*$N>];\n",
+        j0c59 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i31, [track + ${dsize}x<3*$W*$N + 1*$N>];\n",
+        j0c61 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i32, [track + ${dsize}x<3*$W*$N + 2*$N>];\n",
+        j0c63 => "--:-:-:-:1  \@P3 LDG.E.CI.$dtype i33, [track + ${dsize}x<3*$W*$N + 3*$N>];\n",
+        j1c1  => "--:-:-:-:1 \@!P0 I2F.F32.U32 i30, RZ;\n",
+        j1c3  => "--:-:-:-:1 \@!P1 I2F.F32.U32 i31, RZ;\n",
+        j1c5  => "--:-:-:-:1 \@!P2 I2F.F32.U32 i32, RZ;\n",
+        j1c7  => "--:-:-:-:1 \@!P3 I2F.F32.U32 i33, RZ;\n" .
+                 "--:-:-:-:1      R2P PR, preds, 0x0f;\n" .
+                 "--:-:-:-:1  \@P5 FADD I00, TI00, -TI02;\n" .
+                 "--:-:-:-:1  \@P5 FADD I03, TI01, -TI03;\n" .
+                 "--:-:-:-:1  \@P5 FADD I30, TI30, -TI32;\n" .
+                 "--:-:-:-:1  \@P5 FADD I33, TI31, -TI33;\n" .
+                 "--:-:-:-:1  \@P6 SHF.R.U64 preds, preds, 4, preds;\n",
+
+        j1c9  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 0)>], I00;\n",
+        j1c11 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 3)>], I03;\n",
+        j1c13 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 0)>], I30;\n",
+        j1c15 => "--:3:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 3)>], I33;\n",
+
+
+        j1c29 => "04:-:-:-:1  \@P5 FADD TI10,  i10, i20;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI20, -i10, i20;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI11,  i11, i21;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI21, -i11, i21;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI12,  i12, i22;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI22, -i12, i22;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI13,  i13, i23;\n" .
+                 "--:-:-:-:1  \@P5 FADD TI23, -i13, i23;\n",
+
+        j1c30 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i10, [track + ${dsize}x<1*$W*$N + 0*$N>];\n",
+        j1c32 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i11, [track + ${dsize}x<1*$W*$N + 1*$N>];\n",
+        j1c34 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i12, [track + ${dsize}x<1*$W*$N + 2*$N>];\n",
+        j1c36 => "--:-:-:-:1  \@P3 LDG.E.CI.$dtype i13, [track + ${dsize}x<1*$W*$N + 3*$N>];\n",
+        j1c38 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i10, RZ;\n",
+        j1c40 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i11, RZ;\n",
+        j1c42 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i12, RZ;\n",
+        j1c44 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i13, RZ;\n" .
+                 "--:-:-:-:1      R2P PR, preds, 0x0f;\n" .
+                 "--:-:-:-:1  \@P5 FADD I10, TI10, -TI12;\n" .
+                 "--:-:-:-:1  \@P5 FADD I20, TI20, -TI22;\n" .
+                 "--:-:-:-:1  \@P5 FADD I13, TI11, -TI13;\n" .
+                 "--:-:-:-:1  \@P5 FADD I23, TI21, -TI23;\n" .
+                 "--:-:-:-:1  \@P6 SHF.L.U64 preds, preds, 8, preds;\n",
+
+        j1c46 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 0)>], I10;\n",
+        j1c48 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 0)>], I20;\n",
+        j1c50 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 3)>], I13;\n",
+        j1c52 => "--:3:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 3)>], I23;\n",
+
+
+        j2c8  => "04:-:-:-:1  \@P5 FADD I21,  TI21, TI22;\n" .
+                 "--:-:-:-:1  \@P5 FADD I22, -TI21, TI22;\n",
+
+        j2c11 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 1)>], I21;\n",
+        j2c13 => "--:3:-:-:1  \@P5 STS [writeS + 4x<32*(2*4 + 2)>], I22;\n",
+
+        j2c15 => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype i20, [track + ${dsize}x<2*$W*$N + 0*$N>];\n",
+        j2c17 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype i21, [track + ${dsize}x<2*$W*$N + 1*$N>];\n",
+        j2c19 => "--:-:-:-:1  \@P2 LDG.E.CI.$dtype i22, [track + ${dsize}x<2*$W*$N + 2*$N>];\n",
+        j2c21 => "--:6:2:-:1  \@P3 LDG.E.CI.$dtype i23, [track + ${dsize}x<2*$W*$N + 3*$N>];\n",
+        j2c23 => "--:-:-:-:1 \@!P0 I2F.F32.U32 i20, RZ;\n",
+        j2c25 => "--:-:-:-:1 \@!P1 I2F.F32.U32 i21, RZ;\n",
+        j2c27 => "--:-:-:-:1 \@!P2 I2F.F32.U32 i22, RZ;\n",
+        j2c29 => "--:-:-:-:1 \@!P3 I2F.F32.U32 i23, RZ;\n",
+
+        j2c30 => "04:-:-:-:1  \@P5 FADD I01,  TI01, TI02;\n" .
+                 "--:-:-:-:1  \@P5 FADD I02, -TI01, TI02;\n" .
+                 "--:-:-:-:1  \@P5 FADD I11,  TI11, TI12;\n" .
+                 "--:-:-:-:1  \@P5 FADD I12, -TI11, TI12;\n" .
+                 "--:-:-:-:1  \@P5 FADD I31,  TI31, TI32;\n" .
+                 "--:-:-:-:1  \@P5 FADD I32, -TI31, TI32;\n",
+
+        j2c31 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 1)>], I01;\n",
+        j2c33 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*4 + 2)>], I02;\n",
+        j2c35 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 1)>], I11;\n",
+        j2c37 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*4 + 2)>], I12;\n",
+        j2c39 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 1)>], I31;\n",
+        j2c41 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*4 + 2)>], I32;\n",
+
+        j2c62 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P5 LOP.XOR readIs, readIs, 4x<512*4*2>;\n" .
+                 "--:-:-:-:1  \@P5 LOP.XOR readFs, readFs, 4x<512*4*2>;\n" .
+                 "--:-:-:-:1  \@P5 LOP.XOR writeS, writeS, 4x<512*4*2>;\n",
+
+        j3c57 => "20:-:-:-:1  \@P6 IADD   track0.CC, track0, param_4HWNp;\n",
+        j3c62 => "--:-:-:-:1  \@P6 IADD.X track1,    track1, RZ;\n",
+
+        j3c63 => "--:-:-:Y:5  \@P5 BRA.U IMAGE_LOOP;\n" .
+                 "--:-:-:Y:5      BRA.U END_LOOP;\n",
+    );
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out;
+    foreach my $j (0 .. 3)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 4;
+        my $rsPred   = $j == 3 ? '@P5' : '   ';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+FILTER_LOOP:
+--:-:-:-:1      ISETP.GT.AND P0, PT, C, RZ, PT;
+[+
+    our ($dtype, $convert_in, $FX, $vsize, $dsize, $K);
+    my %insert = (
+        # Side instructions keyed "jJcN": the text is emitted right after FFMA number N (0..63) of pass J (0..3).
+        j0c1  => "--:-:-:-:1      ISETP.GT.AND P1, PT, C, 4, PT;\n" .
+                 "--:-:-:-:1      IADD C, C, -4;\n",
+        # $FX picks the STS.128 + LDG.E.CG.$vsize schedule; otherwise scalar STS plus FADD/FMUL/FFMA math is interleaved.
+        $FX ? (
+            $convert_in ? (
+                j1c8  => "02:-:-:-:1  \@P0 $convert_in F03, F01.H1;\n",
+                j1c12 => "--:-:-:-:1  \@P0 $convert_in F02, F01.H0;\n",
+                j1c16 => "--:-:-:-:1  \@P0 $convert_in F01, F00.H1;\n",
+                j1c20 => "--:-:2:-:1  \@P0 $convert_in F00, F00.H0;\n",
+
+                j1c26 => "04:-:-:-:1  \@P0 $convert_in F13, F11.H1;\n",
+                j1c30 => "--:-:-:-:1  \@P0 $convert_in F12, F11.H0;\n",
+                j1c34 => "--:-:-:-:1  \@P0 $convert_in F11, F10.H1;\n",
+                j1c38 => "--:-:3:-:1  \@P0 $convert_in F10, F10.H0;\n",
+
+                j2c8  => "08:-:-:-:1  \@P0 $convert_in F23, F21.H1;\n",
+                j2c12 => "--:-:-:-:1  \@P0 $convert_in F22, F21.H0;\n",
+                j2c16 => "--:-:-:-:1  \@P0 $convert_in F21, F20.H1;\n",
+                j2c20 => "--:-:4:-:1  \@P0 $convert_in F20, F20.H0;\n",
+
+                j2c26 => "10:-:-:-:1  \@P0 $convert_in F33, F31.H1;\n",
+                j2c30 => "--:-:-:-:1  \@P0 $convert_in F32, F31.H0;\n",
+                j2c34 => "--:-:-:-:1  \@P0 $convert_in F31, F30.H1;\n",
+                j2c38 => "--:6:5:-:1  \@P0 $convert_in F30, F30.H0;\n",
+            ) : (),
+
+            j1c22 => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 00*4>], F0;\n",
+            j1c24 => "02:-:2:-:1  \@P1 LDG.E.CG.$vsize F0, [track0 + 4x<00 * $dsize>];\n",
+
+            j1c40 => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 32*4>], F1;\n",
+            j1c42 => "04:-:3:-:1  \@P1 LDG.E.CG.$vsize F1, [track0 + 4x<32 * $dsize>];\n",
+
+            j2c22 => "08:4:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 64*4>], F2;\n",
+            j2c24 => "08:-:4:-:1  \@P1 LDG.E.CG.$vsize F2, [track0 + 4x<64 * $dsize>];\n",
+
+            j2c40 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<512*4 + 96*4>], F3;\n",
+            j2c42 => "10:6:5:-:1  \@P1 LDG.E.CG.$vsize F3, [track0 + 4x<96 * $dsize>];\n",
+
+            j3c57 => "20:-:-:-:1  \@P1 IADD   track0.CC, track0, 4x<32*16 * $dsize>;\n",
+            j3c62 => "--:-:-:-:1  \@P1 IADD.X track1,    track1, RZ;\n",
+
+        ) : (
+            $convert_in ? (
+                j0c5  => "02:-:-:-:1      $convert_in f00, f00;\n",
+                j0c7  => "--:-:-:-:1      $convert_in f01, f01;\n",
+                j0c9  => "--:-:-:-:1      $convert_in f02, f02;\n",
+
+                j0c11 => "--:-:-:-:1      $convert_in f20, f20;\n",
+                j0c13 => "--:-:-:-:1      $convert_in f21, f21;\n",
+                j0c15 => "--:-:2:-:1      $convert_in f22, f22;\n",
+
+                j0c17 => "--:-:-:-:1      $convert_in f10, f10;\n",
+                j0c19 => "--:-:-:-:1      $convert_in f11, f11;\n",
+                j0c21 => "--:-:4:-:1      $convert_in f12, f12;\n",
+            ) : (),
+
+            j0c33 => "02:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 0)>], F00;\n",
+            j0c35 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 3)>], F03;\n",
+            j0c37 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 0)>], F30;\n",
+            j0c39 => "--:3:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 3)>], F33;\n",
+
+            j0c40 => "--:-:-:-:1  \@P0 FADD tb0, TF00, TF02;\n" .
+                     "--:-:-:-:1  \@P0 FADD tb3, TF30, TF32;\n" .
+                     "--:-:-:-:1  \@P0 FADD ta0, f00,  f20;\n" .
+                     "--:-:-:-:1  \@P0 FADD ta1, f01,  f21;\n" .
+                     "--:-:-:-:1  \@P0 FADD ta2, f02,  f22;\n",
+
+            j0c41 => "--:-:-:-:1  \@P0 FMUL tb0, tb0, 0.5;\n" .
+                     "--:-:-:-:1  \@P0 FMUL tb3, tb3, 0.5;\n" .
+                     "--:-:-:-:1  \@P0 FMUL ta0, ta0, 0.5;\n" .
+                     "--:-:-:-:1  \@P0 FMUL ta1, ta1, 0.5;\n" .
+                     "--:-:-:-:1  \@P0 FMUL ta2, ta2, 0.5;\n",
+
+            j0c42 => "--:-:-:-:1  \@P0 FFMA F01, TF01,  0.5, tb0;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F02, TF01, -0.5, tb0;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F31, TF31,  0.5, tb3;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F32, TF31, -0.5, tb3;\n",
+
+            j0c45 => "04:-:-:-:1  \@P1 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];\n",
+            j0c47 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];\n",
+            j0c49 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];\n",
+
+            j0c51 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];\n",
+            j0c53 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];\n",
+            j0c55 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>];\n",
+
+            j1c8  => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 1)>], F01;\n",
+            j1c10 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(0*4 + 2)>], F02;\n",
+            j1c12 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 1)>], F31;\n",
+            j1c14 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(3*4 + 2)>], F32;\n",
+
+            j1c15 => "08:-:-:-:1  \@P0 FFMA TF10, f10,  0.5, ta0;\n" .
+                     "--:-:-:-:1  \@P0 FFMA TF20, f10, -0.5, ta0;\n" .
+                     "--:-:-:-:1  \@P0 FFMA TF11, f11,  0.5, ta1;\n" .
+                     "--:-:-:-:1  \@P0 FFMA TF21, f11, -0.5, ta1;\n" .
+                     "--:-:-:-:1  \@P0 FFMA TF12, f12,  0.5, ta2;\n" .
+                     "--:-:-:-:1  \@P0 FFMA TF22, f12, -0.5, ta2;\n",
+
+            j1c16 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];\n",
+            j1c18 => "--:-:-:-:1  \@P1 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];\n",
+            j1c20 => "--:6:2:-:1  \@P1 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];\n",
+
+            j1c22 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 0)>], F10;\n",
+            j1c24 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 0)>], F20;\n",
+            j1c26 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 3)>], F13;\n",
+            j1c28 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 3)>], F23;\n",
+
+            j1c29 => "--:-:-:-:1  \@P0 FADD tb1, TF10, TF12;\n" .
+                     "--:-:-:-:1  \@P0 FADD tb2, TF20, TF22;\n",
+
+            j1c34 => "--:-:-:-:1  \@P0 FMUL tb1, tb1, 0.5;\n" .
+                     "--:-:-:-:1  \@P0 FMUL tb2, tb2, 0.5;\n",
+
+            j1c39 => "--:-:-:-:1  \@P0 FFMA F11, TF11,  0.5, tb1;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F12, TF11, -0.5, tb1;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F21, TF21,  0.5, tb2;\n" .
+                     "--:-:-:-:1  \@P0 FFMA F22, TF21, -0.5, tb2;\n",
+
+            j2c8  => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 1)>], F11;\n",
+            j2c10 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(1*4 + 2)>], F12;\n",
+            j2c12 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 1)>], F21;\n",
+            j2c14 => "--:-:-:-:1  \@P0 STS [writeS + 4x<512*4 + 32*(2*4 + 2)>], F22;\n",
+
+
+            j3c57 => "20:-:-:-:1  \@P1 IADD   track0.CC, track0, param_4RSKp;\n",
+            j3c62 => "--:-:-:-:1  \@P1 IADD.X track1,    track1, RZ;\n",
+        ),
+
+        j2c62 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readIs, readIs, 4x<512*4*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readFs, readFs, 4x<512*4*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<512*4*2>;\n",
+
+        j3c63 => "--:-:-:Y:5  \@P0 BRA.U FILTER_LOOP;\n",
+    );
+    my @cOrder;                                # FFMA issue order: 64 [x, y] accumulator coordinates
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);     # [dx, dy] offsets added to each base (x, y)
+    my @y = (0,1,4,5);                         # base y values; the order flips after every x step
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out;
+    foreach my $j (0 .. 3)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 4;
+        my $rsPred   = $j == 3 ? '@P0' : '   ';
+        # LDS inserts at c0/c2/c4/c6 fill the j$nOdd register set (this pass's FFMAs read j$odd) from row ($j + 1) % 4; only pass 3 is P0-predicated.
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*512 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*512 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Stall 0 only when the FIRST inserted instruction is one of these ops (same rule as the IMAGE_LOOP generator).
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+            # Yield flag only at c == 32, and only when the stall count is non-zero.
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+            # The first FFMA of each pass carries wait mask 01.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+END_LOOP:
+--:-:1:-:1      S2R tid,   SR_TID.X;
+--:-:2:-:1      S2R idx_N, SR_CTAID.Z;
+--:-:3:-:1      S2R idx_K, SR_CTAID.Y;
+// Epilogue setup: derive tile indices, staging addresses, out_offset and the bounds predicates.
+<SCHEDULE_BLOCK>
+01:-:-:-:1      LOP.AND tid_31, tid, 31;
+
+--:-:-:-:1      BFE idx_n, idx_nkpq, 0x041c;
+--:-:-:-:1      BFE idx_k, idx_nkpq, 0x0418;
+--:-:-:-:1      BFE idx_P, idx_nkpq, 0x0c0c;
+--:-:-:-:1      BFE idx_Q, idx_nkpq, 0x0c00;
+
+02:-:-:-:1      XMAD idx_N, idx_N, param_n, idx_n;
+04:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+[+
+    our $bsum; return $bsum ? q{
+--:-:-:-:1      XMAD      bsum_offset, idx_Q, param_gridN,   idx_N;
+--:-:-:-:1      XMAD.LO2C bsum_offset, idx_P, param_gridQN,  bsum_offset;
+    } : '';
++]
+
+// x = grid_x << shiftX
+// y = grid_y << shiftY
+--:-:-:-:1      SHL p, idx_P, param_shiftP;
+--:-:-:-:1      SHL q, idx_Q, param_shiftQ;
+
+// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp
+--:-:-:-:1      BFE.U32 superP, tid, param_superP;
+--:-:-:-:1      BFE.U32 superQ, tid, param_superQ;
+--:-:-:-:1      ISCADD p, superP,  p, 1;
+--:-:-:-:1      ISCADD q, superQ,  q, 1;
+
+// n = (idx_N << param_shiftN) + (tid & param_superN)
+--:-:-:-:1      LOP.AND superN, tid, param_superN;
+--:-:-:-:1      SHL  n, idx_N, param_shiftN;
+--:-:-:-:1      IADD n, n, superN;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV32I one, 1.0;
+
+// readFs = ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid_1,  tid,    1;
+--:-:-:-:1      LOP.AND  readFs, tid,    8;
+--:-:-:-:1      SHR.U32  readFs, readFs, 2;
+--:-:-:-:1      LOP.OR   readFs, readFs,  tid_1;
+//--:-:-:-:1      SHL      readFs, readFs, 3;
+
+// readIs = ((((tid & -16) >> 1) | ((tid >> 1) & 3)) + (readFs << 2)) << 4
+--:-:-:-:1      LOP.AND  tid_16, tid,   -16;
+--:-:-:-:1      SHR.U32  tid_16, tid_16, 1;
+--:-:-:-:1      BFE.U32  readIs, tid,    0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs, readIs, tid_16;
+--:-:-:-:1      ISCADD   readIs, readFs, readIs, 2;
+--:-:-:-:1      SHL      readIs, readIs, 4;
+
+// writeCs = (readFs << 12) + readIs
+--:-:-:-:1      ISCADD  writeCs, readFs, readIs, 12;
+
+// readCs = ((tid >> 5) * 512 + tid_31 + (tid >> 6) * 16) << 2
+--:-:-:-:1      SHR.U32 tid32, tid,  5;
+--:-:-:-:1      SHR.U32 tid64, tid,  6;
+--:-:-:-:1      ISCADD  readCs, tid32, tid_31, 9;
+--:-:-:-:1      ISCADD  readCs, tid64, readCs, 4;
+--:-:-:-:1      SHL     readCs, readCs, 2;
+
+// k = idx_K*32 + (tid32 << 1)
+--:-:-:-:1      SHL tid32, tid32, 1;
+--:-:-:-:1      ISCADD  k, idx_K, tid32, 5;
+
+// Out00 = k*PQN + p*QN + q*N + n
+// Out01 = Out00 + N
+// Out10 = Out00 + QN
+// Out11 = Out01 + QN
+--:-:-:-:1      XMAD      out_offset, q, 1x<$N>,    n;
+--:-:-:-:1      XMAD.LO2C out_offset, p, param_QN,  out_offset;
+--:-:-:-:1      XMAD.LO2C out_offset, k, param_PQN, out_offset;
+
+// PQN15 = (PQN << 4) - PQN = 15 * PQN
+--:-:-:-:1      MOV  PQN15, param_PQN;
+--:-:-:-:1      SHL  PQN15, PQN15, 4;
+--:-:-:-:1      IADD PQN15, PQN15, -param_PQN;
+
+--:-:-:-:1      IADD q2, q, 1;
+--:-:-:-:1      IADD p2, p, 1;
+
+// P6 = (param_flags == 0) && (n < N); it is ANDed into every p/q combination below.
+--:-:-:-:1      ISETP.EQ.AND P6, PT, RZ, param_flags, PT; // ! no-op
+--:-:-:-:1      ISETP.LT.AND P6, PT, n,  1x<$N>,  P6; // n < N
+--:-:-:-:1      ISETP.LT.AND P2, PT, p,  param_P, PT; // p0 < P
+--:-:-:-:1      ISETP.LT.AND P3, PT, q,  1x<$Q>,  PT; // q0 < Q
+--:-:-:-:1      ISETP.LT.AND P4, PT, p2, param_P, PT; // p1 < P
+--:-:-:-:1      ISETP.LT.AND P5, PT, q2, 1x<$Q>,  PT; // q1 < Q
+
+--:-:-:-:1      PSETP.AND.AND P0, PT, P2, P3, P6; // p0 && q0 && P6
+--:-:-:-:1      PSETP.AND.AND P1, PT, P2, P5, P6; // p0 && q1 && P6
+--:-:-:-:1      PSETP.AND.AND P2, PT, P4, P3, P6; // p1 && q0 && P6
+--:-:-:-:1      PSETP.AND.AND P3, PT, P4, P5, P6; // p1 && q1 && P6
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+
+--:-:-:-:1      ISETP.EQ.AND P6, PT, tid_31, RZ, PT; // tid31 == 0
+</SCHEDULE_BLOCK>
+// Pass 1 of 4: scale the cxNy0 and cxNy2 accumulators by alpha and stage them at writeCs.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y2, alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, cx3y2, alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y2, alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, cx7y2, alpha;
+
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+// OUTPUT_TRANSFORM stores the staged tile; k and out_offset then advance before the next tile is staged.
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 1;
+--:-:-:-:0      IADD out_offset, out_offset, param_PQN;
+--:-:-:-:5      BAR.SYNC 0;
+// Pass 2 of 4: cxNy1 and cxNy3 (k + 1).
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y1, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y1, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 15;
+--:-:-:-:0      IADD out_offset, out_offset, PQN15;
+--:-:-:-:5      BAR.SYNC 0;
+// Pass 3 of 4: cxNy4 and cxNy6 (k + 16 in total).
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y4, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y4, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y4, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y6, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 1;
+--:-:-:-:0      IADD out_offset, out_offset, param_PQN;
+--:-:-:-:5      BAR.SYNC 0;
+// Pass 4 of 4: cxNy5 and cxNy7 (k + 17 in total).
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y5, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y5, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+
+--:-:-:-:5      EXIT;
+
+OUTPUT_TRANSFORM:
+// Subroutine (entered via CAL, left via RET): reads the staged 4x4 tile m00..m33 through readCs,
+// reduces it to the 2x2 results s00..s11, applies the optional epilogue stages and stores them with STG.
+<SCHEDULE_BLOCK>
+11:-:-:-:1      ISETP.LT.AND P4, PT, k, 1x<$K>, PT; // k < K
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+[+
+    our ($beta, $brelu, $bprelu, $dsize, $dshift, $dtype, $Q, $N);  # b00..b11 are fetched via param_X only when beta, brelu or bprelu is set
+    return $beta || $brelu || $bprelu ? qq{
+--:-:-:-:1      LEA      Out0.CC, out_offset, param_X[0],     $dshift;
+--:-:-:-:1      LEA.HI.X Out1,    out_offset, param_X[1], RZ, $dshift;
+
+--:-:-:-:1 \@!P0 MOV b00, RZ;
+--:-:-:-:1 \@!P1 MOV b01, RZ;
+--:-:-:-:1 \@!P2 MOV b10, RZ;
+--:-:-:-:1 \@!P3 MOV b11, RZ;
+<ORDERED>
+--:-:-:-:1  \@P0 LDG.E.CI.$dtype b00, [Out + ${dsize}x<0*$Q*$N + 0*$N>];
+--:-:5:-:1  \@P1 LDG.E.CI.$dtype b01, [Out + ${dsize}x<0*$Q*$N + 1*$N>];
+--:-:-:-:1  \@P2 LDG.E.CI.$dtype b10, [Out + ${dsize}x<1*$Q*$N + 0*$N>];
+--:-:6:-:1  \@P3 LDG.E.CI.$dtype b11, [Out + ${dsize}x<1*$Q*$N + 1*$N>];
+</ORDERED>
+    } : '';
++]
+[+
+    our $bias; return $bias ? q{
+// sum = S + k
+20:-:-:-:1      LEA      Sum0.CC, k, param_S[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    k, param_S[1], RZ, 2;
+
+--:-:-:-:1 @!P4 MOV b00, RZ;
+--:-:5:-:1  @P4 LDG.E.CI b00, [Sum];
+    } : '';
++]
+</SCHEDULE_BLOCK>
+// Read the 16 staged values m00..m33 back through readCs.
+--:-:-:-:1      LDS m00, [readCs + 4x< 0*32>];
+--:-:-:-:1      LDS m01, [readCs + 4x< 1*32>];
+--:-:-:-:1      LDS m02, [readCs + 4x< 2*32>];
+--:-:1:Y:1      LDS m03, [readCs + 4x< 3*32>];
+
+--:-:-:-:1      LDS m10, [readCs + 4x< 4*32>];
+--:-:-:-:1      LDS m11, [readCs + 4x< 5*32>];
+--:-:-:-:1      LDS m12, [readCs + 4x< 6*32>];
+--:-:2:Y:1      LDS m13, [readCs + 4x< 7*32>];
+
+--:-:-:-:1      LDS m20, [readCs + 4x< 8*32>];
+--:-:-:-:1      LDS m21, [readCs + 4x< 9*32>];
+--:-:-:-:1      LDS m22, [readCs + 4x<10*32>];
+--:-:3:Y:1      LDS m23, [readCs + 4x<11*32>];
+
+--:-:-:-:1      LDS m30, [readCs + 4x<12*32>];
+--:-:-:-:1      LDS m31, [readCs + 4x<13*32>];
+--:-:-:-:1      LDS m32, [readCs + 4x<14*32>];
+--:-:4:Y:1      LDS m33, [readCs + 4x<15*32>];
+// Reduce the 4x4 tile m to the 2x2 result s: first along the second index (t), then along the first index (s).
+<SCHEDULE_BLOCK>
+// t00 = m00+m01+m02;
+// t01 = m01-m02-m03;
+01:-:-:-:1      FADD t00, m00,  m01;
+--:-:-:-:1      FADD t00, t00,  m02;
+--:-:-:-:1      FADD t01, m01, -m02;
+--:-:-:-:1      FADD t01, t01, -m03;
+// t10 = m10+m11+m12;
+// t11 = m11-m12-m13;
+02:-:-:-:1      FADD t10, m10,  m11;
+--:-:-:-:1      FADD t10, t10,  m12;
+--:-:-:-:1      FADD t11, m11, -m12;
+--:-:-:-:1      FADD t11, t11, -m13;
+// t20 = m20+m21+m22;
+// t21 = m21-m22-m23;
+04:-:-:-:1      FADD t20, m20,  m21;
+--:-:-:-:1      FADD t20, t20,  m22;
+--:-:-:-:1      FADD t21, m21, -m22;
+--:-:-:-:1      FADD t21, t21, -m23;
+// t30 = m30+m31+m32;
+// t31 = m31-m32-m33;
+08:-:-:-:1      FADD t30, m30,  m31;
+--:-:-:-:1      FADD t30, t30,  m32;
+--:-:-:-:1      FADD t31, m31, -m32;
+--:-:-:-:1      FADD t31, t31, -m33;
+// s00 = t00+t10+t20;
+// s01 = t01+t11+t21;
+--:-:-:-:1      FADD s00, t00,  t10;
+--:-:-:-:1      FADD s00, s00,  t20;
+--:-:-:-:1      FADD s01, t01,  t11;
+--:-:-:-:1      FADD s01, s01,  t21;
+// s10 = t10-t20-t30;
+// s11 = t11-t21-t31;
+--:-:-:-:1      FADD s10, t10, -t20;
+--:-:-:-:1      FADD s10, s10, -t30;
+--:-:-:-:1      FADD s11, t11, -t21;
+--:-:-:-:3      FADD s11, s11, -t31;
+
+[+
+    our $bias; return $bias ? q{
+10:-:-:-:1  @P0 FADD s00, s00, b00;
+--:-:-:-:1  @P1 FADD s01, s01, b00;
+--:-:-:-:1  @P2 FADD s10, s10, b00;
+--:-:-:-:1  @P3 FADD s11, s11, b00;
+    } : '';
++]
+[+
+    our $relu; return $relu ? q{
+// maximum(x, 0) + slope * minimum(0, x)
+--:-:-:-:1  @P0 FMNMX s00, s00, RZ, !PT;
+--:-:-:-:1  @P1 FMNMX s01, s01, RZ, !PT;
+--:-:-:-:1  @P2 FMNMX s10, s10, RZ, !PT;
+--:-:-:-:1  @P3 FMNMX s11, s11, RZ, !PT;
+    } : '';
++]
+[+
+    our $prelu; return $prelu ? q{
+// maximum(x, 0) + slope * minimum(0, x)
+--:-:-:-:1  @P0 FMNMX b00, s00, RZ, !PT;
+--:-:-:-:1  @P1 FMNMX b01, s01, RZ, !PT;
+--:-:-:-:1  @P2 FMNMX b10, s10, RZ, !PT;
+--:-:-:-:1  @P3 FMNMX b11, s11, RZ, !PT;
+
+--:-:-:-:1  @P0 FMNMX x00, s00, RZ, PT;
+--:-:-:-:1  @P1 FMNMX x01, s01, RZ, PT;
+--:-:-:-:1  @P2 FMNMX x10, s10, RZ, PT;
+--:-:-:-:1  @P3 FMNMX x11, s11, RZ, PT;
+
+--:-:-:-:1  @P0 FFMA s00, x00, param_beta, b00;
+--:-:-:-:1  @P1 FFMA s01, x01, param_beta, b01;
+--:-:-:-:1  @P2 FFMA s10, x10, param_beta, b10;
+--:-:-:-:1  @P3 FFMA s11, x11, param_beta, b11;
+    } : '';
++]
+</SCHEDULE_BLOCK>
+// Optional stages that consume the loaded b00..b11 values, followed by the optional bsum partial sum.
+<SCHEDULE_BLOCK>
+[+
+    our ($beta, $brelu, $bprelu, $convert_in);  # convert the loaded b00..b11 in place when $convert_in is set
+    return $convert_in && ($beta || $brelu || $bprelu) ? qq{
+10:-:1:-:1  \@P0 $convert_in b00, b00;
+--:-:2:-:1  \@P1 $convert_in b01, b01;
+20:-:3:-:1  \@P2 $convert_in b10, b10;
+--:-:4:-:1  \@P3 $convert_in b11, b11;
+    } : '';
++]
+[+
+    our $beta; return $beta ? q{
+11:-:-:-:1  @P0 FFMA s00, b00, param_beta, s00;
+02:-:-:-:1  @P1 FFMA s01, b01, param_beta, s01;
+24:-:-:-:1  @P2 FFMA s10, b10, param_beta, s10;
+08:-:-:-:1  @P3 FFMA s11, b11, param_beta, s11;
+    } : '';
++]
+[+
+    our $brelu; return $brelu ? q{
+//delta *= x > 0
+11:-:-:-:1      FSETP.GT.AND P0, PT, b00, RZ, PT;
+02:-:-:-:1      FSETP.GT.AND P1, PT, b01, RZ, PT;
+24:-:-:-:1      FSETP.GT.AND P2, PT, b10, RZ, PT;
+08:-:-:-:1      FSETP.GT.AND P3, PT, b11, RZ, PT;
+--:-:-:-:1 @!P0 MOV s00, RZ;
+--:-:-:-:1 @!P1 MOV s01, RZ;
+--:-:-:-:1 @!P2 MOV s10, RZ;
+--:-:-:-:1 @!P3 MOV s11, RZ;
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:5 @!P4 R2P PR, RZ, 0x0f;
+    } : '';
++]
+[+
+    our $bprelu; return $bprelu ? q{
+//delta *= ((x > 0) + slope * (x < 0))
+11:-:-:-:1      FSETP.GT.AND P0, PT, b00, RZ, PT;
+02:-:-:-:1      FSETP.GT.AND P1, PT, b01, RZ, PT;
+24:-:-:-:1      FSETP.GT.AND P2, PT, b10, RZ, PT;
+08:-:-:-:1      FSETP.GT.AND P3, PT, b11, RZ, PT;
+--:-:-:-:1      SEL x00, one, RZ, P0;
+--:-:-:-:1      SEL x01, one, RZ, P1;
+--:-:-:-:1      SEL x10, one, RZ, P2;
+--:-:-:-:1      SEL x11, one, RZ, P3;
+--:-:-:-:1      FSETP.LT.AND P0, PT, b00, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P1, PT, b01, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, b10, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P3, PT, b11, RZ, PT;
+--:-:-:-:1      SEL b00, one, RZ, P0;
+--:-:-:-:1      SEL b01, one, RZ, P1;
+--:-:-:-:1      SEL b10, one, RZ, P2;
+--:-:-:-:1      SEL b11, one, RZ, P3;
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+--:-:-:-:1      FFMA b00, b00, param_beta, x00;
+--:-:-:-:1      FFMA b01, b01, param_beta, x01;
+--:-:-:-:1      FFMA b10, b10, param_beta, x10;
+--:-:-:-:1      FFMA b11, b11, param_beta, x11;
+--:-:-:-:1      FMUL s00, s00, b00;
+--:-:-:-:1      FMUL s01, s01, b01;
+--:-:-:-:1      FMUL s10, s10, b10;
+--:-:-:-:1      FMUL s11, s11, b11;
+    } : '';
++]
+[+
+    our $bsum; return $bsum ? q{
+--:-:-:-:1      MOV sum0, RZ;
+--:-:-:-:1  @P0 FADD sum0, s00, sum0;
+--:-:-:-:1  @P1 FADD sum0, s01, sum0;
+--:-:-:-:1  @P2 FADD sum0, s10, sum0;
+--:-:-:-:1  @P3 FADD sum0, s11, sum0;
+    } : '';
++]
+</SCHEDULE_BLOCK>
+
+[+
+    our $convert_out;  # optional in-place conversion of s00..s11 before the store
+    return $convert_out ? qq{
+--:-:1:-:1      $convert_out s00, s00;
+--:-:2:-:1      $convert_out s01, s01;
+--:-:3:-:1      $convert_out s10, s10;
+--:-:4:-:1      $convert_out s11, s11;
+    } : '';
++]
+
+// Store the four results through param_O, each under its own P0..P3 predicate.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LEA      Out0.CC, out_offset, param_O[0],     [+ dshift() +];
+--:-:-:-:1      LEA.HI.X Out1,    out_offset, param_O[1], RZ, [+ dshift() +];
+
+// k < K && R2P && output
+01:-:-:-:1  @P0 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<0*$Q*$N + 0*$N>], s00;
+02:-:-:-:1  @P1 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<0*$Q*$N + 1*$N>], s01;
+04:-:-:-:1  @P2 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<1*$Q*$N + 0*$N>], s10;
+08:1:-:-:1  @P3 STG.E.CG.[+ dtype() +] [Out + [+ dsize() +]x<1*$Q*$N + 1*$N>], s11;
+</SCHEDULE_BLOCK>
+
+[+
+    our $bsum;  # optional: fold sum0 with SHFL.BFLY (1, 2, 4, 8, 16) and store it where k < K && tid31 == 0
+    return $bsum ? q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      XMAD.LO2C b00, k, param_gridPQN, bsum_offset;
+
+--:-:-:-:1      LEA      Sum0.CC, b00, param_S[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    b00, param_S[1], RZ, 2;
+
+--:-:-:-:1      PSETP.AND.AND P5, PT, P4, P6, PT; // k < K && tid31 == 0
+
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  1, 0x1f;
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  2, 0x1f;
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  4, 0x1f;
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  8, 0x1f;
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 16, 0x1f;
+10:-:-:-:2      FADD sum0, sum1, sum0;
+
+--:5:-:-:1  @P5 STG.E.CG [Sum], sum0;
+</SCHEDULE_BLOCK>
+    } : '';
++]
+
+--:-:-:-:5      RET;
+
+
+
+//     T0 = np.empty((4,4))
+//     T1 = np.empty((4,4))
+//
+//     for O, I in ((T0, I), (T1, T0.T)):
+//
+//         O[0,:] = I[0,:] - I[2,:]
+//         O[1,:] = I[1,:] + I[2,:]
+//         O[2,:] = I[2,:] - I[1,:]
+//         O[3,:] = I[1,:] - I[3,:]
+//
+//     Iw[:] = T1.T
+//
+// 0  = i00
+// 1  = i01
+// 2  = i02
+// 3  = i03
+// 4  = i30
+// 5  = i31
+// 6  = i32
+// 7  = i33
+// 8  = i13
+// 9  = i12
+// 10 = i11
+// 11 = i10
+// 12 = i23, TI23, I23
+// 13 = i22, TI22
+// 14 = i21, TI21
+// 15 = i20, TI20, I20
+// 16 = TI00, I00, TI10, I10, I21, I01
+// 17 = TI01, I11
+// 18 = TI02, I12
+// 19 = TI03, I03, TI11, I31
+// 20 = TI30, I30, TI12, I32
+// 21 = TI31
+// 22 = TI32
+// 23 = TI33, I33, TI13, I13, I22, I02
+//
+//
+// TI00 = i00 - i20
+// TI01 = i01 - i21
+// TI02 = i02 - i22
+// TI03 = i03 - i23
+// # load 0
+//
+// TI30 = i10 - i30
+// TI31 = i11 - i31
+// TI32 = i12 - i32
+// TI33 = i13 - i33
+// # load 3
+//
+// I00 = TI00 - TI02
+// I03 = TI01 - TI03
+// I30 = TI30 - TI32
+// I33 = TI31 - TI33
+// # store 0
+//
+// # wait 0
+// TI10 = i10 + i20
+// TI11 = i11 + i21
+// TI12 = i12 + i22
+// TI13 = i13 + i23
+//
+// TI20 = i20 - i10
+// TI21 = i21 - i11
+// TI22 = i22 - i12
+// TI23 = i23 - i13
+//
+// #load 1
+//
+// I10 = TI10 - TI12
+// I20 = TI20 - TI22
+// I13 = TI11 - TI13
+// I23 = TI21 - TI23
+// # store 1
+//
+// # wait 1
+// I21 = TI21 + TI22
+// I22 = TI22 - TI21
+// # store 2
+//
+// # load 2
+//
+// # wait 2
+// I01 = TI01 + TI02
+// I02 = TI02 - TI01
+// I11 = TI11 + TI12
+// I12 = TI12 - TI11
+// I31 = TI31 + TI32
+// I32 = TI32 - TI31
+// #store 3
+
+
+
+//     T0 = np.empty((4,3))
+//     T1 = np.empty((4,4))
+//
+//     for O, I in ((T0, F), (T1, T0.T)):
+//
+//         t0 = (I[0,:] + I[2,:])*0.5
+//
+//         O[0,:] = I[0,:]
+//         O[1,:] = t0 + I[1,:]*0.5
+//         O[2,:] = t0 - I[1,:]*0.5
+//         O[3,:] = I[2,:]
+//
+//     Fw[:] = T1.T
+//
+// 0  = f00, TF00, F00
+// 1  = f01, TF01
+// 2  = f02, TF02, F03
+// 3  = f10
+// 4  = f11
+// 5  = f12
+// 6  = f20, TF30, F30
+// 7  = f21, TF31
+// 8  = f22, TF32, F33
+// 9  = tb3, F32
+// 10 = tb0, F02
+// 11 = ta2, TF22, F23
+// 12 = ta0, TF20, F20
+// 13 = ta1, TF21
+// 14 = F01
+// 15 = F31
+// 16 = TF10, F10
+// 17 = TF11
+// 18 = TF12, F13
+// 19 = tb1, F12
+// 20 = tb2, F22
+// 21 = F11
+// 22 = F21
+// 23 =
+//
+//
+// TF00 = f00
+// TF01 = f01
+// TF02 = f02
+// TF30 = f20
+// TF31 = f21
+// TF32 = f22
+//
+// F00 = TF00
+// F03 = TF02
+// F30 = TF30
+// F33 = TF32
+//
+// # store 0
+//
+// tb0 = TF00 + TF02
+// tb3 = TF30 + TF32
+// ta0 = f00 + f20
+// ta1 = f01 + f21
+// ta2 = f02 + f22
+//
+// tb0 = tb0 * 0.5
+// tb3 = tb3 * 0.5
+// ta0 = ta0 * 0.5
+// ta1 = ta1 * 0.5
+// ta2 = ta2 * 0.5
+//
+// F01 = tb0 + TF01*0.5
+// F02 = tb0 - TF01*0.5
+// F31 = tb3 + TF31*0.5
+// F32 = tb3 - TF31*0.5
+//
+// # wait 0
+// # load 0, 2
+// # store 1
+//
+// TF10 = ta0 + f10*0.5
+// TF20 = ta0 - f10*0.5
+// TF11 = ta1 + f11*0.5
+// TF21 = ta1 - f11*0.5
+// TF12 = ta2 + f12*0.5
+// TF22 = ta2 - f12*0.5
+//
+// # load 1
+//
+// F10 = TF10
+// F20 = TF20
+// F13 = TF12
+// F23 = TF22
+//
+// # store 2
+//
+// tb1 = TF10 + TF12
+// tb2 = TF20 + TF22
+// tb1 = tb1 * 0.5
+// tb2 = tb2 * 0.5
+//
+// F11 = tb1 + TF11*0.5
+// F12 = tb1 - TF11*0.5
+// F21 = tb2 + TF21*0.5
+// F22 = tb2 - TF21*0.5
+//
+// # store 3
\ No newline at end of file
diff --git a/Kernel/Convolution/Pascal/xconv_winograd_2x2_5x5_32x32.sass b/Kernel/Convolution/Pascal/xconv_winograd_2x2_5x5_32x32.sass
new file mode 100644
index 0000000..0fcb767
--- /dev/null
+++ b/Kernel/Convolution/Pascal/xconv_winograd_2x2_5x5_32x32.sass
@@ -0,0 +1,1589 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+our $type;    # type selector; not assigned in this section (presumably supplied by the build). 'h' picks the left column.
+our $dtype       = $type eq 'h' ?         'U16' :  '32';   # suffix appended to LDG.E.CI. for the image loads
+our $convert_in  = $type eq 'h' ? 'F2F.F32.F16' :    '';   # non-empty enables the F2F inserts in the load loops
+our $convert_out = $type eq 'h' ? 'F2F.F16.F32' :    '';   # output conversion mnemonic; empty when type is not 'h'
+our $dshift      = $type eq 'h' ?           '1' :   '2';   # shift operand given to LEA when forming global addresses
+our $dsize       = $type eq 'h' ?           '2' :   '4';   # scale factor prefixed to global address offset expressions
+our $vsize       = $type eq 'h' ?          '64' : '128';   # width suffix for the vector LDG/LDS of filter data
+sub dtype  { return $dtype;  }   # accessors let the templates splice these values inline
+sub dsize  { return $dsize;  }
+sub dshift { return $dshift; }
+sub vsize  { return $vsize;  }
+-]
+
+<CONSTANT_MAPPING>
+
+    addr_zero   : 4x<32*36*2*4 + 64 + 0>
+    addr_idx_Y  : 4x<32*36*2*4 + 64 + 4>
+    addr_idx_X  : 4x<32*36*2*4 + 64 + 5>
+    addr_idx_K  : 4x<32*36*2*4 + 64 + 6>
+
+    param_O[0]           : c[0x0][0x140]
+    param_O[1]           : c[0x0][0x144]
+    param_I[0]           : c[0x0][0x148]
+    param_I[1]           : c[0x0][0x14c]
+    param_F[0]           : c[0x0][0x150]
+    param_F[1]           : c[0x0][0x154]
+    param_alpha          : c[0x0][0x158]
+    param_flags          : c[0x0][0x15c]
+    param_C              : c[0x0][0x160]
+    param_K              : c[0x0][0x164]
+    param_N              : c[0x0][0x168]
+    param_H              : c[0x0][0x16c]
+    param_W              : c[0x0][0x170]
+    param_HWN            : c[0x0][0x174]
+    param_WN             : c[0x0][0x178]
+    param_Y2             : c[0x0][0x17c]
+    param_GX             : c[0x0][0x180]
+    param_Xk             : c[0x0][0x184]
+    param_k              : c[0x0][0x188]
+    param_magic_Xk       : c[0x0][0x18c]
+    param_shift_Xk       : c[0x0][0x190]
+    param_magic_k        : c[0x0][0x194]
+    param_shift_k        : c[0x0][0x198]
+    param_P              : c[0x0][0x19c]
+    param_Q              : c[0x0][0x1a0]
+    param_QN             : c[0x0][0x1a4]
+    param_PQN            : c[0x0][0x1a8]
+    param_PQNp           : c[0x0][0x1ac]
+    param_PQN15p         : c[0x0][0x1b0]
+    param_shiftY         : c[0x0][0x1b4]
+    param_shiftX         : c[0x0][0x1b8]
+    param_shiftN         : c[0x0][0x1bc]
+    param_superY         : c[0x0][0x1c0]
+    param_superX         : c[0x0][0x1c4]
+    param_superN         : c[0x0][0x1c8]
+    param_SuperY         : c[0x0][0x1cc]
+    param_SuperX         : c[0x0][0x1d0]
+    param_SuperN         : c[0x0][0x1d4]
+    param_pad_x          : c[0x0][0x1d8]
+    param_pad_y          : c[0x0][0x1dc]
+    param_HWN2p          : c[0x0][0x1e0]
+    param_C_1152         : c[0x0][0x1e4]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+       0-63 : czero<00-63>
+
+      // Image Transform
+      52 = i00, TI00, I00
+      53 = i10, TI50, I50
+      54 = i01, TI01, I05
+      55 = i11, TI51, I55
+      56 = TI10, I10
+      57 = TI20, I20
+      58 = TI30, I30
+      59 = TI40, I40
+      60 = TI41, I45
+      61 = TI31, I35
+      62 = TI21, I25
+      63 = TI11, I15
+      64-67 : I0<1-4>
+      68-71 : I5<1-4>
+      72-75 : I1<1-4>
+      76-79 : I2<1-4>
+      80-83 : I3<1-4>
+      84-87 : I4<1-4>
+
+      // Filter Transform
+      52-87 : F0<0-3>, F1<0-3>, F2<0-3>, F3<0-3>, F4<0-3>, F5<0-3>, F6<0-3>, F7<0-3>, F8<0-3>
+
+      // Load Loop Registers
+     3, 2,11,10 : clx<0-3>y0
+     7, 6,15,14 : clx<0-3>y1
+     1, 0, 9, 8 : clx<0-3>y2
+     5, 4,13,12 : clx<0-3>y3
+    19,18,27,26 : clx<0-3>y4
+    23,22,31,30 : clx<0-3>y5
+    17,16,25,24 : clx<0-3>y6
+    21,20,29,28 : clx<0-3>y7
+
+      32-43 : jl0Ix<0-3>, jl0Fy<0-7>
+      44-51 : jl1Ix<0-3>, jl1Fy<4-7>
+      36-39 : jl1Fy<0-3>
+
+      32-51 ~ partialC, c, idx_K, idx_Y, idx_X, idx_N, tid31, gx, gy, offset, nn, x1, x2, y1, mask_x
+      52-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, idx_Y2, idx_X2, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, super_x, super_y
+         87 = tid
+
+     // Compute Loop Registers
+     3, 2,11,10,19,18,27,26 : ccx<0-7>y0
+     7, 6,15,14,23,22,31,30 : ccx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2
+     5, 4,13,12,21,20,29,28 : ccx<0-7>y3
+    35,34,43,42,51,50,59,58 : ccx<0-7>y4
+    39,38,47,46,55,54,63,62 : ccx<0-7>y5
+    33,32,41,40,49,48,57,56 : ccx<0-7>y6
+    37,36,45,44,53,52,61,60 : ccx<0-7>y7
+
+      64-79 : jc0Ix<0-7>, jc0Fy<0-7>
+      80-91 : jc1Ix<4-7>, jc1Fy<0-7>
+      64-67 : jc1Ix<0-3>
+
+      64-86 ~ tid16, tid_1, tid128
+
+      // Shared Registers
+      88-89 : track<0-1>
+      92-95 ~ C, swapBuf, readFs, readIs
+      90-91 ~ writeS, preds
+
+      // Load Loop Finish
+      32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1
+      48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16
+
+      // Compute Loop Finish
+      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1
+      64-87 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, four, z<1-5>, mask_q, offsetO, sign
+      90-95 ~ writeCs, readCs, k, pred30, pred36, tid31_4
+      88-89 : Out<0-1>
+
+      3, 2,11,10,19,18 : m<0-5>0
+     27, 1,26, 0, 9, 8 : m<0-5>1
+     16,17,24,25,64,65 : m<0-5>2
+     66,67,68,69,70,71 : m<0-5>3
+     72,73,74,75,76,77 : m<0-5>4
+     78,79,80,81,82,83 : m<0-5>5
+
+      3, 2,11,10,19,18 : w<0-5>0
+     27, 1,26, 0, 9, 8 : w<0-5>1
+     16,17,24,25,64,65 : w<0-5>2
+     66,67,68,69,70,71 : w<0-5>3
+     72,73,74,75,76,77 : w<0-5>4
+     78,79,80,81,82,83 : w<0-5>5
+
+      3, 2,11,10,19,18 : s<0-5>0
+     27, 1,26, 0, 9, 8 : s<0-5>1
+     16,17,24,25,64,65 : s<0-5>2
+     66,67,68,69,70,71 : s<0-5>3
+     72,73,74,75,76,77 : s<0-5>4
+     78,79,80,81,82,83 : s<0-5>5
+
+           85,84,86,87 : t<0-3>0
+           85,87,84,86 : t<0-3>1
+           85,84,87,86 : t<0-3>2
+           85,84,87,86 : t<0-3>3
+           85,84,87,86 : t<0-3>4
+           85,84,87,86 : t<0-3>5
+           85,84,87,86 : r0<0-3>
+           85,84,87,86 : r1<0-3>
+           85,87,86,84 : r2<0-3>
+           84,85,86,87 : r3<0-3>
+           85,84,87,86 : r4<0-3>
+           84,85,87,86 : r5<0-3>
+
+</REGISTER_MAPPING>
+
+--:-:-:-:0      MOV C,   param_C;
+--:-:1:-:1      S2R tid, SR_TID.X;
+--:-:-:-:1      MOV swapBuf, 4x<32*36*2*2>;
+01:-:-:-:0      ISETP.GE.AND P0, PT, tid, 128, PT;
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+--:-:-:Y:c      LOP.AND partialC, C, 1;
+--:-:-:-:0      IADD C, C, partialC;
+--:-:-:-:5  @P0 BRA.U COMPUTE_SETUP;
+
+##############################################################
+LOAD_SETUP:
+
+--:-:1:-:1      S2R idx_YXk, SR_CTAID.X;
+--:-:2:-:1      S2R idx_K,   SR_CTAID.Y;
+--:-:3:-:1      S2R idx_N,   SR_CTAID.Z;
+
+<SCHEDULE_BLOCK>
+
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +]
+
+--:-:-:-:1      ISETP.EQ.AND P0, PT, tid, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND P1, PT, tid, 64, PT;
+
+// idx_Y2   = idx_YXk / blk_Xk
+--:-:-:-:1      MOV  magic_Xk, param_magic_Xk;
+--:-:-:-:1      IADD negXk, RZ, -param_Xk;
+--:-:-:-:1      ISETP.NE.AND P3, PT, magic_Xk, 1, PT;
+01:-:-:-:1  @P3 XMAD     div1, idx_YXk,    magic_Xk,    RZ;
+--:-:-:-:1  @P3 XMAD     div2, idx_YXk,    magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD     div3, idx_YXk.H1, magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk,    div1;
+--:-:-:-:1  @P3 IADD3.RS idx_Y2, div1, div2, div3;
+--:-:-:-:1  @P3 SHR.U32  idx_Y2, idx_Y2,  param_shift_Xk;
+--:-:-:-:1 @!P3 SHR.U32  idx_Y2, idx_YXk, param_shift_Xk;
+
+// idx_Xk  = idx_YXk % blk_Xk
+--:-:-:-:1      XMAD.LO2 idx_Xk, negXk, idx_Y2, idx_YXk;
+
+// idx_X2   = idx_Xk / blk_k
+// idx_k   = idx_Xk % blk_k
+--:-:-:-:1      XMAD    idx_X2,  idx_Xk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32 idx_X2,  idx_X2, param_shift_k;
+--:-:-:-:1      XMAD    idx_k,   idx_X2, param_k, RZ;
+--:-:-:-:1      IADD    idx_k,  -idx_k,  idx_Xk;
+
+// idx_K = idx_K * blk_k + idx_k
+02:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+//--:-:-:-:1      MOV idx_X, idx_X2;
+//--:-:-:-:1      MOV idx_Y, idx_Y2;
+
+// gx = x2
+// gy = y2 * 2
+--:-:-:-:1      MOV idx_X, idx_X2;
+--:-:-:-:1      SHL idx_Y, idx_Y2, 1;
+
+// Implement a square wave block id remapping (for all but last row (if odd number of rows))
+// if y2 != Y2:
+//     gy += (gx&1) ^ ((gx&2)>>1)
+//     gx /= 2
+--:-:-:-:1      ISETP.NE.AND P4, PT, idx_Y2, param_Y2, PT;
+--:-:-:-:1  @P4 LOP.AND x1, idx_X, 1;
+--:-:-:-:1  @P4 BFE.U32 x2, idx_X, 0x101; // 1 bit at position 1
+--:-:-:-:1  @P4 LOP.XOR x1, x1, x2;
+--:-:-:-:1  @P4 IADD idx_Y, idx_Y, x1;
+--:-:-:-:1  @P4 SHR.U32 idx_X, idx_X, 1;
+
+// Scan backwards on odd rows
+// if y2 & 1:
+//     gx = gridX - gx - 1
+--:-:-:-:1      LOP.AND.NZ P5, RZ, idx_Y2, 1;
+--:-:-:-:1  @P5 IADD idx_X, -idx_X,  param_GX;
+--:-:-:-:1  @P5 IADD idx_X,  idx_X, -1;
+
+--:-:-:-:1  @P0 STS [addr_idx_Y], idx_Y;
+--:-:-:-:1  @P0 STS [addr_idx_X], idx_X;
+--:-:-:-:1  @P0 STS [addr_idx_K], idx_K;
+
+// x = gx << shiftX
+// y = gy << shiftY
+--:-:-:-:1      SHL gx, idx_X, param_shiftX;
+--:-:-:-:1      SHL gy, idx_Y, param_shiftY;
+
+// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp
+--:-:-:-:1      BFE.U32 super_x, tid, param_superX;
+--:-:-:-:1      BFE.U32 super_y, tid, param_superY;
+--:-:-:-:1      ISCADD gx, super_x,  gx, 1;
+--:-:-:-:1      ISCADD gy, super_y,  gy, 1;
+
+--:-:-:-:1      LOP.AND  tid32_2,  tid,   -32;
+--:-:-:-:1      SHR.U32  tid32_2,  tid32_2, 2;
+
+// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7)
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid32_2;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      IADD3   readFs, readFs, tid1, tid32_2;
+--:-:-:-:1      ISCADD  readFs, readFs, 4x<32*36*2>, 4;
+
+// c = (tid & 32) >> 5
+--:-:-:-:1      BFE.U32 c, tid, 0x105; // 1 bit at position 5
+
+// P6 = (c == 1) && (c == partialC), i.e. c and partialC are both 1
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, 1, PT;
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, partialC, P6;
+
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+</SCHEDULE_BLOCK>
+
+04:-:-:-:5  @P1 BRA.U FILTER_SETUP;
+
+##############################################################
+IMAGE_SETUP:
+
+<SCHEDULE_BLOCK>
+
+// writeS = c*32*36 + tid31
+--:-:-:-:1      XMAD writeS, c, 1152, tid31;
+--:-:-:-:1      SHL  writeS, writeS, 2;
+
+--:-:-:-:1      STS [writeS + 4x<32*0>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*1>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*2>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*3>], RZ;
+
+// nn = (idx_N << shiftN) + (tid & superN)
+--:-:-:-:1      SHL idx_N, idx_N, param_shiftN;
+--:-:-:-:1      LOP.AND nn, tid,  param_superN;
+--:-:-:-:1      IADD    nn, nn, idx_N;
+
+// n < N
+--:-:-:-:1      ISETP.LT.AND P4, PT, nn, param_N, PT;
+
+// offset = c*HWN + gy*WN + gx*N + nn;
+--:-:-:-:1      XMAD.S16.U16      offset, gx, param_N,   nn;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, gy, param_WN,  offset;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, c,  param_HWN, offset;
+
+--:-:-:-:1      LEA      track0.CC, offset, param_I[0],     [+ dshift() +];
+--:-:-:-:1      LEA.HI.X track1,    offset, param_I[1], RZ, [+ dshift() +];
+
+--:-:-:-:1      IADD x1, gx, 1;
+--:-:-:-:1      ISETP.LT.AND P0, PT, gx, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_W, PT;
+--:-:-:-:1      ISETP.GE.AND P0, PT, gx, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x3;
+
+--:-:-:-:1      IADD y1, gy, 1;
+--:-:-:-:1      ISETP.LT.AND P2, PT, gy, param_H, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y1, param_H, P4;
+--:-:-:-:1      ISETP.GE.AND P2, PT, gy, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, y1, RZ, P3;
+
+--:-:-:-:1      SEL preds, mask_x, RZ, P2;
+--:-:-:-:1  @P3 BFI preds, mask_x, 0x202, preds;
+
+
+--:-:-:-:1      XMAD partialC, partialC, param_HWN, RZ;
+--:-:-:-:1      SHL  partialC, partialC, [+ dshift() +];
+
+--:-:-:-:1 @!P6 R2P PR, preds, 0xf;
+--:-:-:-:1  @P6 R2P PR,    RZ, 0xf;
+
+<ORDERED>
+--:-:-:-:1 @!P0 MOV i00, RZ;
+--:-:2:-:1  @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>];
+--:-:-:-:1 @!P2 MOV i10, RZ;
+--:-:3:-:1  @P2 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>];
+--:-:-:-:1 @!P1 MOV i01, RZ;
+--:-:4:-:1  @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>];
+--:-:-:-:1 @!P3 MOV i11, RZ;
+--:6:5:-:1  @P3 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>];
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+20:-:-:-:0      IADD   track0.CC, track0, -partialC;
+
+--:-:-:-:1      LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:1:-:1      LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];
+
+--:-:-:-:1      IADD   writeS,    writeS,  swapBuf;
+--:-:-:-:2      IADD   swapBuf,       RZ, -swapBuf;
+--:-:-:-:0      IADD.X track1,    track1, -RZ;
+
+--:-:-:-:5      BRA.U IMAGE_LOOP;
+
+##############################################################
+FILTER_SETUP:
+
+<SCHEDULE_BLOCK>
+
+// writeS = (c*32*36 + (tid & 31)*4 + 32*36*2)*4
+--:-:-:-:1      ISCADD writeS, tid31, 4x<32*36*2>, 4;
+--:-:-:-:1      XMAD   writeS, c, 4x<32*36>, writeS;
+
+--:-:-:-:1      STS.128 [writeS], RZ;
+
+// offset = c*32*36 + tid31*4
+--:-:-:-:1      SHL tid31, tid31, 2;
+--:-:-:-:1      XMAD offset, c, 1x<32*36>, tid31;
+
+// (kBlks,C,6,6,32)
+// offset += (idx_K*C*32*36) * itemsize;
+--:-:-:-:1      XMAD.LO2C offset, idx_K, param_C_1152, offset;
+--:-:-:-:1      LEA      track0.CC, offset, param_F[0],     [+ dshift() +];
+--:-:-:-:1      LEA.HI.X track1,    offset, param_F[1], RZ, [+ dshift() +];
+
+--:-:-:-:1      XMAD partialC,  partialC, 1x<32*36 * $dsize>, RZ;
+
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F0, [track + 4x<0*32 * $dsize>];
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F1, [track + 4x<1*32 * $dsize>];
+--:-:2:-:1 @!P6 LDG.E.[+ vsize() +] F2, [track + 4x<2*32 * $dsize>];
+
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F0, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F1, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F2, [addr_zero];
+
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F3, [track + 4x<3*32 * $dsize>];
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F4, [track + 4x<4*32 * $dsize>];
+--:-:3:-:1 @!P6 LDG.E.[+ vsize() +] F5, [track + 4x<5*32 * $dsize>];
+
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F3, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F4, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F5, [addr_zero];
+
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F6, [track + 4x<6*32 * $dsize>];
+--:-:-:-:1 @!P6 LDG.E.[+ vsize() +] F7, [track + 4x<7*32 * $dsize>];
+--:6:4:-:1 @!P6 LDG.E.[+ vsize() +] F8, [track + 4x<8*32 * $dsize>];
+
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F6, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F7, [addr_zero];
+--:-:-:-:1  @P6 LDS.U.[+ vsize() +] F8, [addr_zero];
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+20:-:-:-:0      IADD   track0.CC, track0, -partialC;
+
+--:-:-:-:1      LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:1:-:1      LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];
+
+--:-:-:-:1      IADD   writeS,    writeS,  swapBuf;
+--:-:-:-:2      IADD   swapBuf,       RZ, -swapBuf;
+--:-:-:-:0      IADD.X track1,    track1, -RZ;
+
+--:-:-:-:5      BRA.U FILTER_LOOP;
+
+##############################################################
+
+COMPUTE_SETUP:
+
+<SCHEDULE_BLOCK>
+
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+--:-:-:-:1      IADD tid128, tid, -128;
+
+// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readFs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid16,  tid128, -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,   1;
+
+--:-:-:-:1      BFE.U32  readIs, tid128, 0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs, readIs, tid16;
+--:-:-:-:1      ISCADD   readIs, readIs, 4x<32*4 + 32*36*2*2>, 4;
+
+--:-:-:-:1      LOP.AND  tid_1,  tid128, 1;
+--:-:-:-:1      LOP.AND  readFs, tid128, 8;
+--:-:-:-:1      SHR.U32  readFs, readFs, 2;
+--:-:-:-:1      IADD3    readFs, readFs, tid16, tid_1;
+--:-:-:-:0      ISCADD   readFs, readFs, 4x<32*4 + 32*36*2*3>, 4;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+// Let Load loop run once to transform initial load and store to shared.
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>];
+--:-:1:-:2      LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>];
+
+COMPUTE_LOOP:
+[+
+    my %insert = (
+        # Extra instructions keyed "j<pass>c<n>": each value is emitted right after FFMA n of that pass.
+        j0c33 => "--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;\n" .
+                 "--:-:-:-:1      IADD C, C, -2;\n",
+        # After the barrier, move both read pointers by -swapBuf and negate swapBuf for the next iteration.
+        j0c62 => "02:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD swapBuf, RZ,    -swapBuf;\n",
+        # Loop while P0 (set at j0c33 from C > 2, before C drops by 2); otherwise branch to COMPUTE_FINISH.
+        j1c63 => "--:-:-:Y:5  \@P0 BRA.U COMPUTE_LOOP;\n" .
+                 "--:-:-:Y:5      BRA.U COMPUTE_FINISH;\n",
+    );
+    my @cOrder;    # [x, y] accumulator coordinates in FFMA issue order (4 x bases * 4 y bases * 4 offsets = 64)
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);    # (dx, dy) offsets added to every base (x, y)
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;    # sweep y in the opposite direction for the next x base
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        my $odd    = $j;        # operand set (jc0* or jc1*) read by this pass
+        my $nOdd   = 1 - $j;    # operand set loaded during this pass for the other one
+        my $rsPred = $j == 1 ? '@P0' : '   ';    # pass 1 only reloads when P0 says another iteration follows
+        my $bar    = $j == 0 ? '2' : '-';        # second control field of the c31 load: '2' in pass 0, unset in pass 1
+        # Loads of the other pass's operands, placed after FFMA 0, 2, 4 and 31 of this pass.
+        $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd;
+
+        $insert{"j${j}c31"} = sprintf "--:%s:1:-:1  %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd;
+
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Stall 0 when the first inserted line is a memory, convert, barrier or branch op; otherwise 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            #$stall = '4' if $stall && $c % 2 == 0 && $j == 0 && $c > 16;
+            # Yield flag on every fifth FFMA, but only when that FFMA keeps a stall of 1.
+            my $yield  = ($c % 5 == 0) && $stall ? 'Y' : '-';
+            # The first FFMA of each pass carries wait mask 01; all others carry none.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+
+IMAGE_LOOP:
+--:-:-:-:1      ISETP.GT.AND P6, PT, C,  2, PT;
+[+
+    our ($dtype, $dsize, $convert_in, $W, $N);
+    my %insert = (
+        # Instructions interleaved with the FFMA stream; key "j<pass>c<n>" follows FFMA n of that pass.
+        j0c0  => "--:-:-:-:1      ISETP.GT.AND P5, PT, C, RZ, PT;\n" .
+                 "--:-:-:-:1      IADD C, C, -2;\n",
+        # With input conversion enabled, convert the four freshly loaded values to F32 in place.
+        $convert_in ? (
+            j0c1  => "02:-:2:-:1      F2F.F32.F16 i00, i00;\n",
+            j0c2  => "04:-:3:-:1      F2F.F32.F16 i10, i10;\n",
+            j0c3  => "08:-:4:-:1      F2F.F32.F16 i01, i01;\n",
+            j0c4  => "10:-:5:-:1      F2F.F32.F16 i11, i11;\n",
+        ) : (),
+        # I00/I50 share registers with i00/i10 in the register map, so they are stored without further arithmetic.
+        j0c5  => "02:-:-:-:1      STS [writeS + 4x<32*(0*6 + 0)>], I00;\n",
+        j0c6  => "04:-:-:-:1      STS [writeS + 4x<32*(5*6 + 0)>], I50;\n",
+        # TI10..TI40 = i10 * k + i00 for k = 0.75, -0.75, 1.50, -1.50; advance track0 and zero preds unless P6.
+        j0c7  => "--:-:-:-:1      FFMA TI10, i10,  0.75, i00;\n" .
+                 "--:-:-:-:1      FFMA TI20, i10, -0.75, i00;\n" .
+                 "--:-:-:-:1      FFMA TI30, i10,  1.50, i00;\n" .
+                 "--:-:-:-:1      FFMA TI40, i10, -1.50, i00;\n" .
+                 "--:-:-:-:1      IADD track0.CC, track0, param_HWN2p;\n" .
+                 "--:-:-:-:1 @!P6 MOV preds, RZ;\n",
+        # I05/I55 share registers with i01/i11; from here on the stores are predicated on P5.
+        j0c8  => "08:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*6 + 5)>], I05;\n",
+        j0c9  => "10:6:-:-:1  \@P5 STS [writeS + 4x<32*(5*6 + 5)>], I55;\n",
+        # Same four factors applied to i11/i01, interleaved with stores; then reload predicates from preds (mask 0xf).
+        j0c10 => "--:-:-:-:0      FFMA TI11, i11,  0.75, i01;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 0)>], I10;\n" .
+                 "--:-:-:-:0      FFMA TI21, i11, -0.75, i01;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 0)>], I20;\n" .
+                 "--:-:-:-:0      FFMA TI31, i11,  1.50, i01;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 0)>], I30;\n" .
+                 "--:-:-:-:0      FFMA TI41, i11, -1.50, i01;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 0)>], I40;\n" .
+                 "--:-:-:-:1      R2P PR, preds, 0xf;\n" .
+                 "--:-:-:-:1      IADD.X track1, track1, RZ;\n",
+        # LDS the pass-1 operands (jl1*) while pass 0 is still issuing FFMAs.
+        j0c11 => "--:-:-:-:1      LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n",
+        j0c13 => "--:-:-:-:1      LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n",
+        j0c19 => "--:-:1:-:1      LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n",
+
+        j0c14 => "--:-:-:-:0      FFMA I01, TI01,  0.75, TI00;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 5)>], I15;\n" .
+                 "--:-:-:-:0      FFMA I02, TI01, -0.75, TI00;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 5)>], I25;\n" .
+                 "--:-:-:-:0      FFMA I03, TI01,  1.50, TI00;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 5)>], I35;\n" .
+                 "--:-:-:-:0      FFMA I04, TI01, -1.50, TI00;\n" .
+                 "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 5)>], I45;\n",
+
+        j0c15 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*6 + 1)>], I01;\n",
+        j0c16 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*6 + 2)>], I02;\n",
+        j0c17 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*6 + 3)>], I03;\n",
+        j0c18 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(0*6 + 4)>], I04;\n",
+
+        j0c20 => "--:-:-:-:1      FFMA I51, TI51,  0.75, TI50;\n" .
+                 "--:-:-:-:1      FFMA I52, TI51, -0.75, TI50;\n" .
+                 "--:-:-:-:1      FFMA I53, TI51,  1.50, TI50;\n" .
+                 "--:-:-:-:1      FFMA I54, TI51, -1.50, TI50;\n",
+        # Reload i00/i10/i01/i11 through the advanced track pointer, each under its own predicate P0..P3.
+        j0c21 => "20:-:2:-:1  \@P0 LDG.E.CI.$dtype i00, [track + ${dsize}x<0*$W*$N + 0*$N>];\n",
+        j0c22 => "--:-:3:-:1  \@P2 LDG.E.CI.$dtype i10, [track + ${dsize}x<1*$W*$N + 0*$N>];\n",
+        j0c23 => "--:-:4:-:1  \@P1 LDG.E.CI.$dtype i01, [track + ${dsize}x<0*$W*$N + 1*$N>];\n",
+        j0c24 => "--:-:5:-:1  \@P3 LDG.E.CI.$dtype i11, [track + ${dsize}x<1*$W*$N + 1*$N>];\n",
+
+        j0c25 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(5*6 + 1)>], I51;\n",
+        j0c26 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(5*6 + 2)>], I52;\n",
+        j0c27 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(5*6 + 3)>], I53;\n",
+        j0c28 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(5*6 + 4)>], I54;\n",
+
+        j0c29 => "--:-:-:-:1      FFMA I11, TI11,  0.75, TI10;\n" .
+                 "--:-:-:-:1      FFMA I12, TI11, -0.75, TI10;\n" .
+                 "--:-:-:-:1      FFMA I13, TI11,  1.50, TI10;\n" .
+                 "--:-:-:-:1      FFMA I14, TI11, -1.50, TI10;\n",
+
+        j0c30 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 1)>], I11;\n",
+        j0c31 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 2)>], I12;\n",
+        j1c0  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 3)>], I13;\n",
+        j1c1  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(1*6 + 4)>], I14;\n",
+
+        j1c2  => "--:-:-:-:1      FFMA I21, TI21,  0.75, TI20;\n" .
+                 "--:-:-:-:1      FFMA I22, TI21, -0.75, TI20;\n" .
+                 "--:-:-:-:1      FFMA I23, TI21,  1.50, TI20;\n" .
+                 "--:-:-:-:1      FFMA I24, TI21, -1.50, TI20;\n",
+
+        j1c3  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 1)>], I21;\n",
+        j1c4  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 2)>], I22;\n",
+        j1c5  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 3)>], I23;\n",
+        j1c6  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(2*6 + 4)>], I24;\n",
+
+        j1c7  => "--:-:-:-:1      FFMA I31, TI31,  0.75, TI30;\n" .
+                 "--:-:-:-:1      FFMA I32, TI31, -0.75, TI30;\n" .
+                 "--:-:-:-:1      FFMA I33, TI31,  1.50, TI30;\n" .
+                 "--:-:-:-:1      FFMA I34, TI31, -1.50, TI30;\n",
+
+        j1c8  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 1)>], I31;\n",
+        j1c9  => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 2)>], I32;\n",
+        j1c10 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 3)>], I33;\n",
+        j1c11 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(3*6 + 4)>], I34;\n",
+
+        j1c12 => "--:-:-:-:1      FFMA I41, TI41,  0.75, TI40;\n" .
+                 "--:-:-:-:1      FFMA I42, TI41, -0.75, TI40;\n" .
+                 "--:-:-:-:1      FFMA I43, TI41,  1.50, TI40;\n" .
+                 "--:-:-:-:1      FFMA I44, TI41, -1.50, TI40;\n",
+
+        j1c13 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 1)>], I41;\n",
+        j1c14 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 2)>], I42;\n",
+        j1c15 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 3)>], I43;\n",
+        j1c16 => "--:-:-:-:1  \@P5 STS [writeS + 4x<32*(4*6 + 4)>], I44;\n",
+        # Barrier, then under P5: readFs/readIs move by -swapBuf, writeS by +swapBuf, and swapBuf is negated.
+        j1c17 => "--:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P5 IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P5 IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P5 IADD writeS, writeS,  swapBuf;\n" .
+                 "--:-:-:-:1  \@P5 IADD swapBuf, RZ,    -swapBuf;\n",
+        # Under P5, LDS the pass-0 operands (jl0*) through the updated read pointers.
+        j1c18 => "--:-:-:-:1  \@P5 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n",
+        j1c20 => "--:-:-:-:1  \@P5 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n",
+        j1c22 => "--:-:1:-:1  \@P5 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n",
+        # Loop again while P5; otherwise branch to LOAD_FINISH.
+        j1c31 => "--:-:-:Y:5  \@P5 BRA.U IMAGE_LOOP;\n" .
+                 "--:-:-:Y:5      BRA.U LOAD_FINISH;",
+
+    );
+
+    my @cOrder;    # [x, y] accumulator coordinates in FFMA issue order (8 bases * 4 offsets = 32)
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
+    {
+        my ($x, $y) = @$xy;
+        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $wait   = $c == 0 ? "01" : '--';
+            # Stall 0 when the first inserted line is a memory, convert, barrier or branch op; otherwise 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:-:$stall";
+
+            $out .= sprintf "%s      FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+
+FILTER_LOOP:
+--:-:-:-:1      ISETP.GT.AND P0, PT, C, RZ, PT; // P0 = (C > 0), sampled before C is decremented below
+20:-:-:-:1      IADD track0.CC, track0, 1x<32*36*2 * $dsize>; // sets the carry (.CC) consumed by the IADD.X on track1 at slot j0c3
+--:-:-:-:1      ISETP.GT.AND P1, PT, C, 2, PT; // P1 = (C > 2)
+--:-:-:-:1      IADD C, C, -2; // C -= 2
+[+
+    our ($vsize, $dsize, $convert_in); # package variables; this block only reads them
+    my %insert = (
+        # Key "jJcC" names the slot right after FFMA number C of pass J (see the loop at the bottom).
+        j0c3 => "--:-:-:-:1      IADD.X track1, track1, RZ;\n",
+        # Loads of the jl1 operands that pass 1 multiplies.
+        j0c0  => "--:-:-:-:1      LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n",
+        j0c2  => "--:-:-:-:1      LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n",
+        j0c18 => "--:-:1:-:1      LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n",
+        # Under P0, reload the jl0 operands while pass 1 is running.
+        j1c12 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n",
+        j1c14 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n",
+        j1c16 => "--:-:1:-:1  \@P0 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n",
+        # Two alternative schedules for the global loads (LDG) and shared stores (STS), chosen by $convert_in.
+        $convert_in ? (
+            # This variant also emits F2F.F32.F16 conversions ahead of the stores.
+            j0c1  => "02:-:-:-:1      F2F.F32.F16 F03, F01.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F02, F01.H0;\n",
+            j0c4  => "--:-:-:-:1      F2F.F32.F16 F01, F00.H1;\n" .
+                     "--:-:2:-:1      F2F.F32.F16 F00, F00.H0;\n",
+
+            j0c5  => "--:-:-:-:1      F2F.F32.F16 F13, F11.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F12, F11.H0;\n",
+            j0c6  => "--:-:-:-:1      F2F.F32.F16 F11, F10.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 F10, F10.H0;\n",
+
+            j0c7  => "--:-:-:-:1      F2F.F32.F16 F23, F21.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F22, F21.H0;\n",
+            j0c8  => "--:-:-:-:1      F2F.F32.F16 F21, F20.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 F20, F20.H0;\n",
+
+            j0c9  => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*32*4>], F0;\n",
+            j0c10 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], F1;\n",
+            j0c11 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], F2;\n",
+
+            j0c13 => "02:-:-:-:1  \@P1 LDG.E.$vsize F0, [track + 4x<0*32 * $dsize>];\n",
+            j0c14 => "10:-:-:-:1  \@P1 LDG.E.$vsize F1, [track + 4x<1*32 * $dsize>];\n",
+            j0c15 => "20:-:2:-:1  \@P1 LDG.E.$vsize F2, [track + 4x<2*32 * $dsize>];\n",
+
+            j0c16 => "04:-:-:-:1      F2F.F32.F16 F33, F31.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F32, F31.H0;\n",
+            j0c17 => "--:-:-:-:1      F2F.F32.F16 F31, F30.H1;\n" .
+                     "--:-:3:-:1      F2F.F32.F16 F30, F30.H0;\n",
+
+            j0c19 => "--:-:-:-:1      F2F.F32.F16 F43, F41.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F42, F41.H0;\n",
+            j0c20 => "--:-:-:-:1      F2F.F32.F16 F41, F40.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 F40, F40.H0;\n",
+
+            j0c21 => "--:-:-:-:1      F2F.F32.F16 F53, F51.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F52, F51.H0;\n",
+            j0c22 => "--:-:-:-:1      F2F.F32.F16 F51, F50.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 F50, F50.H0;\n",
+
+            j0c23 => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], F3;\n",
+            j0c24 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], F4;\n",
+            j0c25 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], F5;\n",
+
+            j0c27 => "04:-:-:-:1  \@P1 LDG.E.$vsize F3, [track + 4x<3*32 * $dsize>];\n",
+            j0c28 => "10:-:-:-:1  \@P1 LDG.E.$vsize F4, [track + 4x<4*32 * $dsize>];\n",
+            j0c29 => "20:-:3:-:1  \@P1 LDG.E.$vsize F5, [track + 4x<5*32 * $dsize>];\n",
+
+            j0c30 => "08:-:-:-:1      F2F.F32.F16 F63, F61.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F62, F61.H0;\n",
+            j0c31 => "--:-:-:-:1      F2F.F32.F16 F61, F60.H1;\n" .
+                     "--:-:4:-:1      F2F.F32.F16 F60, F60.H0;\n",
+
+            j1c0  => "--:-:-:-:1      F2F.F32.F16 F73, F71.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F72, F71.H0;\n",
+            j1c1  => "--:-:-:-:1      F2F.F32.F16 F71, F70.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 F70, F70.H0;\n",
+
+            j1c2  => "--:-:-:-:1      F2F.F32.F16 F83, F81.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 F82, F81.H0;\n",
+            j1c3  => "--:-:-:-:1      F2F.F32.F16 F81, F80.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 F80, F80.H0;\n",
+
+            j1c4  => "08:4:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], F6;\n",
+            j1c5  => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], F7;\n",
+            j1c6  => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], F8;\n",
+
+            j1c8  => "08:-:-:-:1  \@P1 LDG.E.$vsize F6, [track + 4x<6*32 * $dsize>];\n",
+            j1c9  => "10:-:-:-:1  \@P1 LDG.E.$vsize F7, [track + 4x<7*32 * $dsize>];\n",
+            j1c10 => "20:6:4:-:1  \@P1 LDG.E.$vsize F8, [track + 4x<8*32 * $dsize>];\n",
+
+        ) : (
+            # Variant without conversions. NOTE(review): j0c6 is the only STS.128 in this template without an @P0 guard - confirm that is intended.
+            j0c6  => "02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], F0;\n",
+            j0c8  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], F1;\n",
+            j0c10 => "--:2:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], F2;\n",
+
+            j0c12 => "02:-:-:-:1  \@P1 LDG.E.$vsize F0, [track + 4x<0*32 * $dsize>];\n",
+            j0c14 => "--:-:-:-:1  \@P1 LDG.E.$vsize F1, [track + 4x<1*32 * $dsize>];\n",
+            j0c16 => "--:-:2:-:1  \@P1 LDG.E.$vsize F2, [track + 4x<2*32 * $dsize>];\n",
+
+            j0c20 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], F3;\n",
+            j0c22 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], F4;\n",
+            j0c24 => "--:3:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], F5;\n",
+
+            j0c26 => "04:-:-:-:1  \@P1 LDG.E.$vsize F3, [track + 4x<3*32 * $dsize>];\n",
+            j0c28 => "--:-:-:-:1  \@P1 LDG.E.$vsize F4, [track + 4x<4*32 * $dsize>];\n",
+            j0c30 => "--:-:3:-:1  \@P1 LDG.E.$vsize F5, [track + 4x<5*32 * $dsize>];\n",
+
+            j1c0  => "08:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], F6;\n",
+            j1c2  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], F7;\n",
+            j1c4  => "--:4:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], F8;\n",
+
+            j1c6  => "08:-:-:-:1  \@P1 LDG.E.$vsize F6, [track + 4x<6*32 * $dsize>];\n",
+            j1c8  => "--:-:-:-:1  \@P1 LDG.E.$vsize F7, [track + 4x<7*32 * $dsize>];\n",
+            j1c10 => "--:6:4:-:1  \@P1 LDG.E.$vsize F8, [track + 4x<8*32 * $dsize>];\n",
+        ),
+        # After the barrier, threads with P0 set adjust readFs, readIs and writeS by swapBuf and then negate swapBuf.
+        j1c11 => "--:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeS, writeS,  swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,    -swapBuf;\n",
+        # Branch back to FILTER_LOOP while P0 is set.
+        j1c31 => "--:-:-:Y:5  \@P0 BRA.U FILTER_LOOP;\n",
+    );
+    # Visit order of the accumulator tiles: 8 base (x,y) offsets, each expanded by the 4 swirl deltas, giving 32 entries.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
+    {
+        my ($x, $y) = @$xy;
+        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            # Text to splice in after this FFMA, or the empty string when the slot has no entry.
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Only the first FFMA of each pass carries wait mask 01.
+            my $wait   = $c == 0 ? "01" : '--';
+            # Stall count 0 when the first line of the spliced text contains one of these opcodes, otherwise 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:-:$stall";
+            # Emit the FFMA that accumulates jlJIxX * jlJFyY into clxXyY, followed by the spliced text.
+            $out .= sprintf "%s      FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+LOAD_FINISH:
+// Reached by fall-through from FILTER_LOOP and by the BRA.U LOAD_FINISH that ends the image-load template.
+//--:-:-:-:5      EXIT;
+
+
+--:-:1:-:2      S2R Tid, SR_TID.X;
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha16, param_alpha;
+
+01:-:-:-:1      LOP.AND  Tid32_2,  Tid,    -32;
+--:-:-:-:1      SHR.U32  Tid32_2,  Tid32_2, 2;
+
+// readFs = ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND Tid1,   Tid,    1;
+01:-:-:-:1      LOP.AND readFs, Tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      IADD    readFs, readFs, Tid1;
+
+// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readFs << 2)
+--:-:-:-:1      BFE.U32 readIs, Tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, Tid32_2;
+--:-:-:-:1      ISCADD  readIs, readFs, readIs, 2;
+// Scale both indices: readIs by 16 and readFs by 8.
+--:-:-:-:1      SHL readIs, readIs, 4;
+--:-:-:-:1      SHL readFs, readFs, 3;
+
+// write16Cs = readFs * 32*36 + readIs;
+--:-:-:-:1      XMAD write16Cs, readFs, 1x<32*36>, readIs;
+</SCHEDULE_BLOCK>
+// Pass 1 of 4 (y = 0 and 2): scale eight clx accumulators by alpha16, then write them with two STS.128 at write16Cs.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y2, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y2, alpha16;
+--:-:-:-:4      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Pass 2 (y = 1 and 3). NOTE(review): passes 2-4 issue a BAR.SYNC before their stores, pass 1 does not - confirm.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y3, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y3, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Pass 3 (y = 4 and 6).
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y6, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y6, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Pass 4 (y = 5 and 7).
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y7, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y7, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      EXIT;
+
+
+COMPUTE_FINISH:
+
+//--:-:-:-:5      EXIT;
+
+
+--:-:1:-:2      S2R tid_128, SR_TID.X;
+<SCHEDULE_BLOCK>
+// tid_128 = tid - 128, and P6 marks threads whose rebased id is 256 or more.
+01:-:-:-:1      IADD tid_128, tid_128, -128;
+
+--:-:-:-:1      ISETP.GE.AND P6, PT, tid_128, 256, PT;
+
+// readFs = ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  Tid_1,   tid_128, 1;
+--:-:-:-:1      LOP.AND  readFs2, tid_128, 8;
+--:-:-:-:1      SHR.U32  readFs2, readFs2, 2;
+--:-:-:-:1      IADD     readFs2, readFs2, Tid_1;
+
+// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readFs << 2)
+--:-:-:-:1      LOP.AND  tid_16,   tid_128, -16;
+--:-:-:-:1      SHR.U32  tid_16,   tid_16,   1;
+--:-:-:-:1      BFE.U32  readIs2,  tid_128,  0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs2,  readIs2,  tid_16;
+--:-:-:-:1      ISCADD   readIs2,  readFs2, readIs2, 2;
+// readIs2 = readIs2 * 16 + 4*(32*4), readFs2 = readFs2 * 8.
+--:-:-:-:1      ISCADD   readIs2, readIs2, 4x<32*4>, 4;
+--:-:-:-:1      SHL      readFs2, readFs2, 3;
+
+// writeCs = readFs * 32*36 + readIs;
+--:-:-:-:0      XMAD writeCs, readFs2, 1x<32*36>, readIs2;
+</SCHEDULE_BLOCK>
+// Threads flagged by P6 skip the output addressing setup.
+--:-:-:-:5  @P6 BRA.U SKIP0;
+
+--:-:2:-:1      LDS idxX, [addr_idx_X];
+--:-:3:-:1      LDS idxY, [addr_idx_Y];
+--:-:1:-:1      S2R idxN,  SR_CTAID.Z;
+--:-:4:-:1      LDS idxK, [addr_idx_K];
+<SCHEDULE_BLOCK>
+// Split the rebased thread id: its low 5 bits, id / 32 and id / 64.
+--:-:-:-:1      LOP.AND tid_31, tid_128, 31;
+--:-:-:-:1      SHR.U32 tid_32, tid_128,  5;
+--:-:-:-:1      SHR.U32 tid_64, tid_128,  6;
+
+
+// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16
+--:-:-:-:1      XMAD   readCs, tid_32, 1x<32*36>, tid_31;
+--:-:-:-:1      ISCADD readCs, tid_64, readCs, 4;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// Superblock offset
+// idxX <<= shiftX
+// idxY <<= shiftY
+04:-:-:-:1      SHL idxY, idxY, param_shiftY;
+02:-:-:-:1      SHL idxX, idxX, param_shiftX;
+01:-:-:-:1      SHL idxN, idxN, param_shiftN;
+
+// Get this thread's offset within the superblock
+--:-:-:-:1      BFE.U32 p, tid_31, param_SuperY;
+--:-:-:-:1      BFE.U32 q, tid_31, param_SuperX;
+--:-:-:-:1      LOP.AND n, tid_31, param_SuperN;
+// q = q*2 + idxX, p = p*2 + idxY
+--:-:-:-:1      ISCADD q, q, idxX, 1;
+--:-:-:-:1      ISCADD p, p, idxY, 1;
+// then add the padding parameter and -4 to each
+--:-:-:-:1      MOV four, -4;
+--:-:-:-:1      IADD3 q, q, param_pad_x, four;
+--:-:-:-:1      IADD3 p, p, param_pad_y, four;
+
+[+
+    # Choose the text that folds idxN into n and sets P6, depending on $type.
+    # Both texts are single-quoted, so they are emitted verbatim.
+    # The closing-brace indentation is part of each text and is kept as is.
+    our ($type, $N);
+    my $text_h = q{
+--:-:-:-:1      SHL tid31_4, tid_31, 2;
+
+--:-:-:-:1      ISCADD n, n, idxN, 1;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tid_31, 16, PT;
+        };
+    my $text_other = q{
+--:-:-:-:1      IADD n, n, idxN;
+--:-:-:-:1      ISETP.LT.AND P6, PT, n, param_N, PT;
+        };
+    # Type 'h' gets the first text, every other type the second.
+    return $type eq 'h' ? $text_h : $text_other;
++]
+
+// k = idxK*32 + tid_32<<1
+--:-:-:-:1      SHL tid_32, tid_32,   1;
+08:-:-:-:1      ISCADD k, idxK, tid_32, 5;
+
+// Out = k*PQN + p*QN + q*N + n
+--:-:-:-:1      XMAD.S16.U16      offsetO, q, param_N,    n;
+--:-:-:-:1      XMAD.S16.U16.LO2C offsetO, p, param_QN,   offsetO;
+--:-:-:-:1      XMAD.U16.U16.LO2C offsetO, k, param_PQN,  offsetO;
+--:-:-:-:1      ISET.LT.AND sign, offsetO, RZ, PT;
+// 64-bit output address: LEA adds the shifted offsetO to the low word of param_O, IADD.X adds sign and the carry to the high word.
+--:-:-:-:1      LEA    Out0.CC, offsetO, param_O[0], [+ dshift() +];
+--:-:-:-:1      IADD.X Out1,    sign,    param_O[1];
+
+--:-:-:-:1      ISETP.EQ.AND P5, PT, RZ, param_flags, PT; // ! no-op
+// Column mask: predicate c (and bit c of mask_q) is set when q+c lies in 0..param_Q-1 and P5 held.
+--:-:-:-:1      IADD z1, q, 1;
+--:-:-:-:1      IADD z2, q, 2;
+--:-:-:-:1      IADD z3, q, 3;
+--:-:-:-:1      IADD z4, q, 4;
+--:-:-:-:1      IADD z5, q, 5;
+--:-:-:-:1      ISETP.LT.AND P0, PT, q,  param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, z4, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P5, PT, z5, param_Q, P5;
+--:-:-:-:1      ISETP.GE.AND P0, PT, q,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, z4, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, z5, RZ, P5;
+--:-:-:-:1      P2R mask_q, PR, RZ, 0x3f;
+// Row predicates: predicate r is set when p+r lies in 0..param_P-1 and P6 held.
+--:-:-:-:1      IADD z1, p, 1;
+--:-:-:-:1      IADD z2, p, 2;
+--:-:-:-:1      IADD z3, p, 3;
+--:-:-:-:1      IADD z4, p, 4;
+--:-:-:-:1      IADD z5, p, 5;
+--:-:-:-:1      ISETP.LT.AND P0, PT, p,  param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P4, PT, z4, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, z5, param_P, P6;
+--:-:-:-:1      ISETP.GE.AND P0, PT, p,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, z4, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, z5, RZ, P5;
+// pred30 packs mask_q once per valid row 0..4 at bit offset 6*row, pred36 carries mask_q for row 5.
+--:-:-:-:1      SEL pred30, mask_q, RZ, P0;
+--:-:-:-:1  @P1 BFI pred30, mask_q, 0x606, pred30;
+--:-:-:-:1  @P2 BFI pred30, mask_q, 0x60c, pred30;
+--:-:-:-:1  @P3 BFI pred30, mask_q, 0x612, pred30;
+--:-:-:-:1  @P4 BFI pred30, mask_q, 0x618, pred30;
+--:-:-:-:1      SEL pred36, mask_q, RZ, P5;
+// P6 was overwritten above, so it is recomputed with the same test as in the setup block.
+--:-:-:-:1      ISETP.GE.AND P6, PT, tid_128, 256, PT;
+
+</SCHEDULE_BLOCK>
+
+SKIP0:
+// Pass 1 of 4: scale sixteen ccx accumulators (y = 0 and 2) by param_alpha and stage them at writeCs.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, ccx7y0, param_alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, ccx3y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y2, param_alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, ccx7y2, param_alpha;
+
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+// Threads flagged by P6 skip the call.
+--:-:-:-:5  @P6 BRA.U SKIP1;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP1:
+
+--:-:-:-:0      IADD k, k, 1; // pass 2 of 4 (y = 1 and 3): k += 1
+--:-:-:-:5      BAR.SYNC 0;
+01:-:-:-:1      IADD Out0.CC, Out0, param_PQNp; // Out += param_PQNp, the carry is folded into Out1 by the IADD.X below
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y1, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y1, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y1, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y3, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y3, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y3, param_alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y3, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y3, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y3, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y3, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y3, param_alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:0      IADD.X Out1, Out1, RZ;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Threads flagged by P6 skip the call.
+--:-:-:-:5  @P6 BRA.U SKIP2;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP2:
+
+--:-:-:-:0      IADD k, k, 15; // pass 3 of 4 (y = 4 and 6): k += 15
+--:-:-:-:5      BAR.SYNC 0;
+01:-:-:-:1      IADD Out0.CC, Out0, param_PQN15p; // Out += param_PQN15p, the carry is folded into Out1 by the IADD.X below
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y4, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y4, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y4, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y6, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y6, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y6, param_alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y6, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y6, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y6, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y6, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y6, param_alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:0      IADD.X Out1, Out1, RZ;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Threads flagged by P6 skip the call.
+--:-:-:-:5  @P6 BRA.U SKIP3;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP3:
+
+--:-:-:-:0      IADD k, k, 1; // pass 4 of 4 (y = 5 and 7): k += 1
+--:-:-:-:5      BAR.SYNC 0;
+01:-:-:-:1      IADD Out0.CC, Out0, param_PQNp; // Out += param_PQNp, the carry is folded into Out1 by the IADD.X below
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y5, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y5, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y5, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y7, param_alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y7, param_alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y7, param_alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y7, param_alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y7, param_alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y7, param_alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y7, param_alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y7, param_alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:0      IADD.X Out1, Out1, RZ;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P6 BRA.U SKIP4; // threads flagged by P6 skip the last call
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP4:
+
+--:-:-:-:5      EXIT;
+
+OUTPUT_TRANSFORM:
+// Reached through the CAL OUTPUT_TRANSFORM instructions. When k is not below param_K, both packed masks are cleared.
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.LT.AND P0, PT, k, param_K, PT;
+--:-:-:-:1 @!P0 MOV pred30, RZ;
+--:-:-:-:1 @!P0 MOV pred36, RZ;
+[+
+    # Emit 18 LDS lines from readCs: registers mJC for J = 0..5 within each column C = 0..2.
+    # Loads of column C carry C+1 in the third control field.
+    my @loads;
+    for my $c (0 .. 2)
+    {
+        my $barrier = $c + 1;
+        push @loads, map "--:-:$barrier:-:1      LDS m$_$c, [readCs + 4x<($_*6+$c)*32>];\n", 0 .. 5;
+    }
+    # Concatenate in generation order.
+    return join '', @loads;
++]
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+// t0 = I[1,:] + I[2,:]
+// t1 = I[1,:] - I[2,:]
+// t2 = I[3,:] + I[4,:]
+// t3 = I[3,:] - I[4,:]
+// O[2,:] = t0 * -2.25   + t2 * -0.5625  + I[0,:] * -2.8125
+// O[1,:] = t1 * -1.6875 + t3 * -0.84375 + I[5,:] *  1.265625
+// O[3,:] = t1 *  0.75   + t3 *  1.5     + I[5,:] * -2.8125
+// O[4,:] = I[0,:] + t0  + t2
+// O[0,:] = I[0,:] * 1.265625
+// O[5,:] = I[5,:]
+[+
+    # First transform stage for columns 0-2 of the tile, followed by the loads for columns 3-5.
+    # Each column contributes one 16-instruction group whose first line carries the wait mask.
+    my @text;
+    for my $c (0 .. 2) {
+        my $wait = sprintf "%02x", 1 << $c;    # two-digit hex mask with only bit $c set
+        push @text, qq{
+$wait:-:-:-:1      FADD t0$c, m1$c,  m2$c;
+--:-:-:-:1      FADD t1$c, m1$c, -m2$c;
+--:-:-:-:1      FADD t2$c, m3$c,  m4$c;
+--:-:-:-:1      FADD t3$c, m3$c, -m4$c;
+--:-:-:-:1      FMUL w2$c, m0$c, -2.8125;
+--:-:-:-:1      FFMA w2$c, t0$c, -2.25,    w2$c;
+--:-:-:-:1      FFMA w2$c, t2$c, -0.5625,  w2$c;
+--:-:-:-:1      FMUL w1$c, m5$c,  1.265625;
+--:-:-:-:1      FFMA w1$c, t1$c, -1.6875,  w1$c;
+--:-:-:-:1      FFMA w1$c, t3$c, -0.84375, w1$c;
+--:-:-:-:1      FMUL w3$c, m5$c, -2.8125;
+--:-:-:-:1      FFMA w3$c, t1$c,  0.75,    w3$c;
+--:-:-:-:1      FFMA w3$c, t3$c,  1.5,     w3$c;
+--:-:-:-:1      FADD w4$c, m0$c,  t0$c;
+--:-:-:-:1      FADD w4$c, w4$c,  t2$c;
+--:-:-:-:1      FMUL w0$c, m0$c,  1.265625;
+        };
+    }
+    # Loads for columns 3-5: every load of column C carries C+1 in the third control field.
+    for my $c (3 .. 5)
+    {
+        my $barrier = $c + 1;
+        push @text, map "--:-:$barrier:-:1      LDS m$_$c, [readCs + 4x<($_*6+$c)*32>];\n", 0 .. 5;
+    }
+    # Concatenate the pieces in generation order.
+    return join '', @text;
++]
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    # First transform stage for columns 3-5, one 16-instruction group per column.
+    my @text = map {
+        my $c    = $_;
+        my $wait = sprintf "%02x", 1 << $c;
+        qq{
+$wait:-:-:-:1      FADD t0$c, m1$c,  m2$c;
+--:-:-:-:1      FADD t1$c, m1$c, -m2$c;
+--:-:-:-:1      FADD t2$c, m3$c,  m4$c;
+--:-:-:-:1      FADD t3$c, m3$c, -m4$c;
+--:-:-:-:1      FMUL w2$c, m0$c, -2.8125;
+--:-:-:-:1      FFMA w2$c, t0$c, -2.25,    w2$c;
+--:-:-:-:1      FFMA w2$c, t2$c, -0.5625,  w2$c;
+--:-:-:-:1      FMUL w1$c, m5$c,  1.265625;
+--:-:-:-:1      FFMA w1$c, t1$c, -1.6875,  w1$c;
+--:-:-:-:1      FFMA w1$c, t3$c, -0.84375, w1$c;
+--:-:-:-:1      FMUL w3$c, m5$c, -2.8125;
+--:-:-:-:1      FFMA w3$c, t1$c,  0.75,    w3$c;
+--:-:-:-:1      FFMA w3$c, t3$c,  1.5,     w3$c;
+--:-:-:-:1      FADD w4$c, m0$c,  t0$c;
+--:-:-:-:1      FADD w4$c, w4$c,  t2$c;
+--:-:-:-:1      FMUL w0$c, m0$c,  1.265625;
+        };
+    } 3 .. 5;
+    return join '', @text;
++]
+--:-:-:-:1      R2P PR, pred30, 0x3f; // presumably loads P0-P5 from the low six bits of pred30 - confirm
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30; // pred30 supplies both halves, so this presumably rotates it right by 6 - confirm
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+[+
+    # Second transform stage: for each row R = 0..5, combine wR0..wR5 into sR0..sR4
+    # using the same coefficients as the first stage.
+    return join '', map {
+        my $r = $_;
+        qq{
+--:-:-:-:1      FADD r${r}0, w${r}1,  w${r}2;
+--:-:-:-:1      FADD r${r}1, w${r}1, -w${r}2;
+--:-:-:-:1      FADD r${r}2, w${r}3,  w${r}4;
+--:-:-:-:1      FADD r${r}3, w${r}3, -w${r}4;
+--:-:-:-:1      FMUL s${r}2, w${r}0, -2.8125;
+--:-:-:-:1      FFMA s${r}2, r${r}0, -2.25,    s${r}2;
+--:-:-:-:1      FFMA s${r}2, r${r}2, -0.5625,  s${r}2;
+--:-:-:-:1      FMUL s${r}1, w${r}5,  1.265625;
+--:-:-:-:1      FFMA s${r}1, r${r}1, -1.6875,  s${r}1;
+--:-:-:-:1      FFMA s${r}1, r${r}3, -0.84375, s${r}1;
+--:-:-:-:1      FMUL s${r}3, w${r}5, -2.8125;
+--:-:-:-:1      FFMA s${r}3, r${r}1,  0.75,    s${r}3;
+--:-:-:-:1      FFMA s${r}3, r${r}3,  1.5,     s${r}3;
+--:-:-:-:1      FADD s${r}4, w${r}0,  r${r}0;
+--:-:-:-:1      FADD s${r}4, s${r}4,  r${r}2;
+--:-:-:-:1      FMUL s${r}0, w${r}0,  1.265625;
+        };
+    } 0 .. 5;
++]
+[+
+    our $type; # package variable; 'h' selects the first text below, anything else the second
+    return $type eq 'h' ? q{
+
+--:-:-:-:1      IADD readCs, readCs, -tid31_4;
+--:-:-:-:1      SHR.U32 tid31_4, tid31_4, 1;
+--:-:-:-:1      IADD readCs, readCs, tid31_4;
+
+<ORDERED>
+--:-:-:-:1      F2F.F16.F32 s05, s05;
+--:-:-:-:1      F2F.F16.F32 s00, s00;
+--:-:-:-:1      F2F.F16.F32 s02, s02;
+--:-:-:-:1      F2F.F16.F32 s01, s01;
+--:-:-:-:1      F2F.F16.F32 s03, s03;
+--:-:1:-:1      F2F.F16.F32 s04, s04;
+
+--:-:-:-:1      F2F.F16.F32 s15, s15;
+--:-:-:-:1      F2F.F16.F32 s10, s10;
+--:-:-:-:1      F2F.F16.F32 s12, s12;
+--:-:-:-:1      F2F.F16.F32 s11, s11;
+--:-:-:-:1      F2F.F16.F32 s13, s13;
+--:-:2:-:1      F2F.F16.F32 s14, s14;
+
+01:-:-:-:1      STS.U16 [readCs + 4x<(0*6+0)*32>], s00;
+--:-:-:-:1      STS.U16 [readCs + 4x<(0*6+1)*32>], s01;
+--:-:-:-:1      STS.U16 [readCs + 4x<(0*6+2)*32>], s02;
+--:-:-:-:1      STS.U16 [readCs + 4x<(0*6+3)*32>], s03;
+--:-:-:-:1      STS.U16 [readCs + 4x<(0*6+4)*32>], s04;
+--:-:-:-:1      STS.U16 [readCs + 4x<(0*6+5)*32>], s05;
+
+--:-:-:-:1      F2F.F16.F32 s25, s25;
+--:-:-:-:1      F2F.F16.F32 s20, s20;
+--:-:-:-:1      F2F.F16.F32 s22, s22;
+--:-:-:-:1      F2F.F16.F32 s21, s21;
+--:-:-:-:1      F2F.F16.F32 s23, s23;
+--:-:3:-:1      F2F.F16.F32 s24, s24;
+
+02:-:-:-:1      STS.U16 [readCs + 4x<(1*6+0)*32>], s10;
+--:-:-:-:1      STS.U16 [readCs + 4x<(1*6+1)*32>], s11;
+--:-:-:-:1      STS.U16 [readCs + 4x<(1*6+2)*32>], s12;
+--:-:-:-:1      STS.U16 [readCs + 4x<(1*6+3)*32>], s13;
+--:-:-:-:1      STS.U16 [readCs + 4x<(1*6+4)*32>], s14;
+--:-:-:-:1      STS.U16 [readCs + 4x<(1*6+5)*32>], s15;
+
+--:-:-:-:1      F2F.F16.F32 s35, s35;
+--:-:-:-:1      F2F.F16.F32 s30, s30;
+--:-:-:-:1      F2F.F16.F32 s32, s32;
+--:-:-:-:1      F2F.F16.F32 s31, s31;
+--:-:-:-:1      F2F.F16.F32 s33, s33;
+--:-:4:-:1      F2F.F16.F32 s34, s34;
+
+04:-:-:-:1      STS.U16 [readCs + 4x<(2*6+0)*32>], s20;
+--:-:-:-:1      STS.U16 [readCs + 4x<(2*6+1)*32>], s21;
+--:-:-:-:1      STS.U16 [readCs + 4x<(2*6+2)*32>], s22;
+--:-:-:-:1      STS.U16 [readCs + 4x<(2*6+3)*32>], s23;
+--:-:-:-:1      STS.U16 [readCs + 4x<(2*6+4)*32>], s24;
+--:-:-:-:1      STS.U16 [readCs + 4x<(2*6+5)*32>], s25;
+
+--:-:-:-:1      F2F.F16.F32 s45, s45;
+--:-:-:-:1      F2F.F16.F32 s40, s40;
+--:-:-:-:1      F2F.F16.F32 s42, s42;
+--:-:-:-:1      F2F.F16.F32 s41, s41;
+--:-:-:-:1      F2F.F16.F32 s43, s43;
+--:-:5:-:1      F2F.F16.F32 s44, s44;
+
+08:-:-:-:1      STS.U16 [readCs + 4x<(3*6+0)*32>], s30;
+--:-:-:-:1      STS.U16 [readCs + 4x<(3*6+1)*32>], s31;
+--:-:-:-:1      STS.U16 [readCs + 4x<(3*6+2)*32>], s32;
+--:-:-:-:1      STS.U16 [readCs + 4x<(3*6+3)*32>], s33;
+--:-:-:-:1      STS.U16 [readCs + 4x<(3*6+4)*32>], s34;
+--:-:-:-:1      STS.U16 [readCs + 4x<(3*6+5)*32>], s35;
+
+--:-:-:-:1      F2F.F16.F32 s55, s55;
+--:-:-:-:1      F2F.F16.F32 s50, s50;
+--:-:-:-:1      F2F.F16.F32 s52, s52;
+--:-:-:-:1      F2F.F16.F32 s51, s51;
+--:-:-:-:1      F2F.F16.F32 s53, s53;
+--:-:6:-:1      F2F.F16.F32 s54, s54;
+
+10:-:-:-:1      STS.U16 [readCs + 4x<(4*6+0)*32>], s40;
+--:-:-:-:1      STS.U16 [readCs + 4x<(4*6+1)*32>], s41;
+--:-:-:-:1      STS.U16 [readCs + 4x<(4*6+2)*32>], s42;
+--:-:-:-:1      STS.U16 [readCs + 4x<(4*6+3)*32>], s43;
+--:-:-:-:1      STS.U16 [readCs + 4x<(4*6+4)*32>], s44;
+--:-:-:-:1      STS.U16 [readCs + 4x<(4*6+5)*32>], s45;
+
+20:-:-:-:1      STS.U16 [readCs + 4x<(5*6+0)*32>], s50;
+--:-:-:-:1      STS.U16 [readCs + 4x<(5*6+1)*32>], s51;
+--:-:-:-:1      STS.U16 [readCs + 4x<(5*6+2)*32>], s52;
+--:-:-:-:1      STS.U16 [readCs + 4x<(5*6+3)*32>], s53;
+--:-:-:-:1      STS.U16 [readCs + 4x<(5*6+4)*32>], s54;
+--:1:-:-:2      STS.U16 [readCs + 4x<(5*6+5)*32>], s55; // FORCE
+</ORDERED>
+
+01:-:-:-:1      IADD readCs, readCs, -tid31_4;
+--:-:-:-:1      SHL tid31_4, tid31_4, 1;
+--:-:-:-:4      IADD readCs, readCs, tid31_4;
+
+    } : q{
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 0*$N>], s00;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 1*$N>], s01;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 2*$N>], s02;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 3*$N>], s03;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 4*$N>], s04;
+--:-:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<0*$Q*$N + 5*$N>], s05;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 0*$N>], s10;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 1*$N>], s11;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 2*$N>], s12;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 3*$N>], s13;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 4*$N>], s14;
+--:-:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<1*$Q*$N + 5*$N>], s15;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 0*$N>], s20;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 1*$N>], s21;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 2*$N>], s22;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 3*$N>], s23;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 4*$N>], s24;
+--:-:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<2*$Q*$N + 5*$N>], s25;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 0*$N>], s30;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 1*$N>], s31;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 2*$N>], s32;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 3*$N>], s33;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 4*$N>], s34;
+--:-:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<3*$Q*$N + 5*$N>], s35;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.L.U64 pred30, pred30, 24, pred30;
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 0*$N>], s40;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 1*$N>], s41;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 2*$N>], s42;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 3*$N>], s43;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 4*$N>], s44;
+--:-:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<4*$Q*$N + 5*$N>], s45;
+--:-:-:-:1      R2P PR, pred36, 0x3f;
+--:-:-:-:1  @P0 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 0*$N>], s50;
+--:-:-:-:1  @P1 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 1*$N>], s51;
+--:-:-:-:1  @P2 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 2*$N>], s52;
+--:-:-:-:1  @P3 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 3*$N>], s53;
+--:-:-:-:1  @P4 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 4*$N>], s54;
+--:1:-:-:1  @P5 RED.E.ADD.F32.FTZ.RN [Out + 4x<5*$Q*$N + 5*$N>], s55;
+    }; # both texts are single-quoted, so $Q and $N are returned unexpanded
++]
+</SCHEDULE_BLOCK>
+
+[+
+    our $type;
+    return $type eq 'h' ? q{
+--:-:-:-:1      LDS.U.32 s00, [readCs + 4x<(0*6+0)*32>];
+--:-:-:-:1      LDS.U.32 s01, [readCs + 4x<(0*6+1)*32>];
+--:-:-:-:1      LDS.U.32 s02, [readCs + 4x<(0*6+2)*32>];
+--:-:-:-:1      LDS.U.32 s03, [readCs + 4x<(0*6+3)*32>];
+--:-:-:-:1      LDS.U.32 s04, [readCs + 4x<(0*6+4)*32>];
+--:-:1:-:1      LDS.U.32 s05, [readCs + 4x<(0*6+5)*32>];
+
+--:-:-:-:1      LDS.U.32 s10, [readCs + 4x<(1*6+0)*32>];
+--:-:-:-:1      LDS.U.32 s11, [readCs + 4x<(1*6+1)*32>];
+--:-:-:-:1      LDS.U.32 s12, [readCs + 4x<(1*6+2)*32>];
+--:-:-:-:1      LDS.U.32 s13, [readCs + 4x<(1*6+3)*32>];
+--:-:-:-:1      LDS.U.32 s14, [readCs + 4x<(1*6+4)*32>];
+--:-:2:-:1      LDS.U.32 s15, [readCs + 4x<(1*6+5)*32>];
+
+--:-:-:-:1      LDS.U.32 s20, [readCs + 4x<(2*6+0)*32>];
+--:-:-:-:1      LDS.U.32 s21, [readCs + 4x<(2*6+1)*32>];
+--:-:-:-:1      LDS.U.32 s22, [readCs + 4x<(2*6+2)*32>];
+--:-:-:-:1      LDS.U.32 s23, [readCs + 4x<(2*6+3)*32>];
+--:-:-:-:1      LDS.U.32 s24, [readCs + 4x<(2*6+4)*32>];
+--:-:3:-:1      LDS.U.32 s25, [readCs + 4x<(2*6+5)*32>];
+
+<SCHEDULE_BLOCK>
+<ORDERED>
+01:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 0*$N>], s00;
+--:-:-:-:1      LDS.U.32 s30, [readCs + 4x<(3*6+0)*32>];
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 1*$N>], s01;
+--:-:-:-:1      LDS.U.32 s31, [readCs + 4x<(3*6+1)*32>];
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 2*$N>], s02;
+--:-:-:-:1      LDS.U.32 s32, [readCs + 4x<(3*6+2)*32>];
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 3*$N>], s03;
+--:-:-:-:1      LDS.U.32 s33, [readCs + 4x<(3*6+3)*32>];
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 4*$N>], s04;
+--:-:-:-:1      LDS.U.32 s34, [readCs + 4x<(3*6+4)*32>];
+--:-:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<0*$Q*$N + 5*$N>], s05;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:4:-:1      LDS.U.32 s35, [readCs + 4x<(3*6+5)*32>];
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1      LDS.U.32 s40, [readCs + 4x<(4*6+0)*32>];
+02:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 0*$N>], s10;
+--:-:-:-:1      LDS.U.32 s41, [readCs + 4x<(4*6+1)*32>];
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 1*$N>], s11;
+--:-:-:-:1      LDS.U.32 s42, [readCs + 4x<(4*6+2)*32>];
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 2*$N>], s12;
+--:-:-:-:1      LDS.U.32 s43, [readCs + 4x<(4*6+3)*32>];
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 3*$N>], s13;
+--:-:-:-:1      LDS.U.32 s44, [readCs + 4x<(4*6+4)*32>];
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 4*$N>], s14;
+--:-:5:-:1      LDS.U.32 s45, [readCs + 4x<(4*6+5)*32>];
+--:-:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<1*$Q*$N + 5*$N>], s15;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      LDS.U.32 s50, [readCs + 4x<(5*6+0)*32>];
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1      LDS.U.32 s51, [readCs + 4x<(5*6+1)*32>];
+04:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 0*$N>], s20;
+--:-:-:-:1      LDS.U.32 s52, [readCs + 4x<(5*6+2)*32>];
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 1*$N>], s21;
+--:-:-:-:1      LDS.U.32 s53, [readCs + 4x<(5*6+3)*32>];
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 2*$N>], s22;
+--:-:-:-:1      LDS.U.32 s54, [readCs + 4x<(5*6+4)*32>];
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 3*$N>], s23;
+--:-:6:-:1      LDS.U.32 s55, [readCs + 4x<(5*6+5)*32>];
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 4*$N>], s24;
+--:-:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<2*$Q*$N + 5*$N>], s25;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+
+08:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 0*$N>], s30;
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 1*$N>], s31;
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 2*$N>], s32;
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 3*$N>], s33;
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 4*$N>], s34;
+--:-:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<3*$Q*$N + 5*$N>], s35;
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.L.U64 pred30, pred30, 24, pred30;
+10:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 0*$N>], s40;
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 1*$N>], s41;
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 2*$N>], s42;
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 3*$N>], s43;
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 4*$N>], s44;
+--:-:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<4*$Q*$N + 5*$N>], s45;
+--:-:-:-:1      R2P PR, pred36, 0x3f;
+20:-:-:-:1  @P0 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 0*$N>], s50;
+--:-:-:-:1  @P1 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 1*$N>], s51;
+--:-:-:-:1  @P2 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 2*$N>], s52;
+--:-:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 3*$N>], s53;
+--:-:-:-:1  @P4 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 4*$N>], s54;
+--:1:-:-:1  @P5 RED.E.ADD.F16x2.FTZ.RN [Out + 2x<5*$Q*$N + 5*$N>], s55;
+</ORDERED>
+</SCHEDULE_BLOCK>
+    } : '';
++]
+
+--:-:-:-:5      RET;
+
+// RED.E.ADD.F16x2.FTZ.RN
\ No newline at end of file
diff --git a/Kernel/Convolution/Pascal/xconv_winograd_3x3_2x2_32x32.sass b/Kernel/Convolution/Pascal/xconv_winograd_3x3_2x2_32x32.sass
new file mode 100644
index 0000000..fe1dc07
--- /dev/null
+++ b/Kernel/Convolution/Pascal/xconv_winograd_3x3_2x2_32x32.sass
@@ -0,0 +1,1814 @@
+
+# Copyright 2015 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+our ($type, $IX, $D);  # declared but not assigned here; presumably supplied by the build before this block runs
+our $determ = $D;  # truthy selects the STG store in output_op() instead of the RED.E.ADD reduction
+our $dtype        = $type eq 'h' ?        '.U16' : '';  # suffix appended to the LDG.E.CI loads
+our $convert_in   = $type eq 'h' ? 'F2F.F32.F16' : '';  # conversion applied to loaded values; empty means no conversion is emitted
+our $convert_out  = $type eq 'h' ? 'F2F.F16.F32' : '';  # counterpart of $convert_in for the opposite direction
+our $vec_size     = $type eq 'h' ?          '64' : '128';  # width suffix of the vector LDS/LDG loads
+our $dtype_shift  = $type eq 'h' ?           '1' : '2';  # shift amount used when scaling offsets with LEA
+our $dtype_size   = $type eq 'h' ?           '2' : '4';  # multiplier used inside load offsets
+sub dtype       { return $dtype;       }
+sub dtype_shift { return $dtype_shift; }
+sub output_op   { return $determ ? 'STG.E.CG' : 'RED.E.ADD.F32.FTZ.RN'; }
+-]
+// Name table: addr_* are shared-memory addresses used by the STS/LDS instructions in this file; param_* name entries of constant bank c[0x0].
+<CONSTANT_MAPPING>
+
+    addr_zero  : 4x<(512*4 + 32)*4 + 0>
+    addr_blk_K : 4x<(512*4 + 32)*4 + 4>
+    addr_blk_C : 4x<(512*4 + 32)*4 + 5>
+    addr_blk_P : 4x<(512*4 + 32)*4 + 6>
+    addr_blk_Q : 4x<(512*4 + 32)*4 + 7>
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_Y            : c[0x0][0x15c]
+    param_X            : c[0x0][0x160]
+    param_P            : c[0x0][0x164]
+    param_Q            : c[0x0][0x168]
+    param_C            : c[0x0][0x16c]
+    param_K            : c[0x0][0x170]
+    param_N            : c[0x0][0x174]
+    param_pad_y        : c[0x0][0x178]
+    param_pad_x        : c[0x0][0x17c]
+    param_GY           : c[0x0][0x180]
+    param_GX           : c[0x0][0x184]
+    param_GYS          : c[0x0][0x188]
+    param_GXS          : c[0x0][0x18c]
+    param_shiftYI      : c[0x0][0x190]
+    param_shiftXI      : c[0x0][0x194]
+    param_superYI      : c[0x0][0x198]
+    param_superXI      : c[0x0][0x19c]
+    param_superNI      : c[0x0][0x1a0]
+    param_shiftY       : c[0x0][0x1a4]
+    param_shiftX       : c[0x0][0x1a8]
+    param_superY       : c[0x0][0x1ac]
+    param_superX       : c[0x0][0x1b0]
+    param_superN       : c[0x0][0x1b4]
+    param_loopXI       : c[0x0][0x1b8]
+    param_loopX        : c[0x0][0x1bc]
+    param_loopN        : c[0x0][0x1c0]
+    param_strideY      : c[0x0][0x1c4]
+    param_strideX      : c[0x0][0x1c8]
+    param_XN           : c[0x0][0x1cc]
+    param_YXN          : c[0x0][0x1d0]
+    param_QN           : c[0x0][0x1d4]
+    param_PQN          : c[0x0][0x1d8]
+    param_SK           : c[0x0][0x1dc]
+    param_RSK          : c[0x0][0x1e0]
+    param_Np           : c[0x0][0x1e4]
+    param_XNp          : c[0x0][0x1e8]
+    param_2XNp         : c[0x0][0x1ec]
+    param_QNp          : c[0x0][0x1f0]
+    param_CPQkc        : c[0x0][0x1f4]
+    param_PQkc         : c[0x0][0x1f8]
+    param_Qkc          : c[0x0][0x1fc]
+    param_kc           : c[0x0][0x200]
+    param_c            : c[0x0][0x204]
+    param_k            : c[0x0][0x208]
+    param_magic_CPQkc  : c[0x0][0x20c]
+    param_shift_CPQkc  : c[0x0][0x210]
+    param_magic_PQkc   : c[0x0][0x214]
+    param_shift_PQkc   : c[0x0][0x218]
+    param_magic_Qkc    : c[0x0][0x21c]
+    param_shift_Qkc    : c[0x0][0x220]
+    param_magic_kc     : c[0x0][0x224]
+    param_shift_kc     : c[0x0][0x228]
+    param_magic_c      : c[0x0][0x22c]
+    param_shift_c      : c[0x0][0x230]
+    param_CRSK         : c[0x0][0x234]
+</CONSTANT_MAPPING>
+// Register names. Many ranges are mapped more than once (for example 64-95 and 104-119), so several names alias the same register number.
+<REGISTER_MAPPING>
+
+       0-63 : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+
+      64-79 : j0Ex<0-7>, j0Iy<0-7>
+      80-95 : j1Ex<0-7>, j1Iy<0-7>
+
+      64-79 ~ blk_KCPQkc, blk_CPQkc, blk_PQkc, blk_Qkc, blk_kc, blk_k, blk_c, blk_K, blk_C, blk_P, magic_CPQkc, magic_PQkc, magic_Qkc
+      84-95 ~ div1, div2, div3, tidX, tidY, tid16, tid1, neg_CPQkc, neg_PQkc, neg_Qkc, neg_kc, neg_c
+
+      80-82 : init, tid, blk_Q
+         83 = blkC, blkK
+      84-95 ~ x, x<1-3>, y, super_x, super_y, tid_X, c, offsign, mask_x, mask_y
+      84-95 ~ nloop, N
+         81 = off_sign
+         64 = swapBuf
+
+     96-103 : track0<0-1>, track1<0-1>, track2<0-1>, track3<0-1>
+
+    120-127 ~ writeS, readEs, readIs, pred_bits, gys, gxs, n, offset
+
+       0-31 : m0<0-3>, m1<0-3>, m2<0-3>, m3<0-3>, t0<0-3>, t1<0-3>, t2<0-3>
+      64-72 : f0<0-2>, f1<0-2>, f2<0-2>
+      76-79 : blkKCPQ<0-3>
+      76-79 : K_blk, C_blk, P_blk, Q_blk
+      84-95 ~ CRSK, xmad_determ, PQ_blk
+     96-109 ~ alpha, writeCs, readCs, cc, RSK8, tid_1, tid_16, tid_31, tid_32, kk, trackF, K1, SK1
+    110-115 : F00_<0-1>, F01_<0-1>, F02_<0-1>,
+    116-121 : F10_<0-1>, F11_<0-1>, F12_<0-1>,
+    122-127 : F20_<0-1>, F21_<0-1>, F22_<0-1>
+[+
+    our $IX;
+    return $IX ? q{
+      96-99 : trackI<0-1>, offsetI<0-1>
+    100-103 ~ swapBuffer, gy, gx
+
+    104-119 : I0<0-3>, I1<0-3>, I2<0-3>, I3<0-3>
+    } : q{
+    // registers reordered to avoid bank conflicts
+    104 = y0x0, Y0X0, I00, Y1X0
+    105 = y0x1, Y0X1, I02, Y1X2
+    106 = y0x2, Y0X2, I13
+    107 = y0x3, Y0X3, I03, Y1X3
+    108 = y1x0, I04
+    110 = y1x1, I05
+    109 = y1x2, I06
+    111 = y1x3, I07
+    113 = y2x0, Y2X0, I08
+    112 = y2x1, Y2X1
+    119 = y2x2, Y2X2, I10
+    117 = y2x3, Y2X3, I11
+    115 = y3x0, Y3X0, I12
+    116 = y3x1, Y3X1, I14
+    114 = y3x2, Y3X2, I09
+    118 = y3x3, Y3X3, I15
+    80  = I01
+    64  = Y1X1
+    };
++]
+    // Error registers
+    104 = p0q0, E00
+    105 = p0q1, E03
+    106 = p1q0, E12
+    107 = p1q1, E15
+    108 = e0, C0, E08
+    109 = E01
+    110 = E02
+    111 = e1, C1, E11
+    112 = E13
+    113 = E14
+    114 = B0, E04
+    115 = B1, E07
+    116 = e2, E06
+    117 = e3, E10
+    118 = E05
+    119 = E09
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,        SR_TID.X;
+--:-:2:-:1      S2R blk_KCPQkc, SR_CTAID.X;
+// Setup: decompose the flat block index blk_KCPQkc into block coordinates and derive the per-thread shared-memory offsets.
+<SCHEDULE_BLOCK>
+01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 128, PT; // P0 = (tid >= 128); consumed by the branch to ERROR_SETUP after this block
+// Store zeros to addr_zero, then load them back into czero00..czero63 (16 loads of 128 bits).
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+
+--:-:-:-:1      MOV  magic_CPQkc,    param_magic_CPQkc;
+--:-:-:-:1      MOV  magic_PQkc,     param_magic_PQkc;
+--:-:-:-:1      MOV  magic_Qkc,      param_magic_Qkc;
+--:-:-:-:1      IADD neg_CPQkc, RZ, -param_CPQkc;
+--:-:-:-:1      IADD neg_PQkc,  RZ, -param_PQkc;
+--:-:-:-:1      IADD neg_Qkc,   RZ, -param_Qkc;
+--:-:-:-:1      IADD neg_kc,    RZ, -param_kc;
+--:-:-:-:1      IADD neg_c,     RZ, -param_c;
+// P1-P3: a magic value other than 1 selects the multiply-then-shift division; otherwise only the shift is applied.
+--:-:-:-:1      ISETP.NE.AND P1, PT, magic_CPQkc, 1, PT;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magic_PQkc,  1, PT;
+--:-:-:-:1      ISETP.NE.AND P3, PT, magic_Qkc,   1, PT;
+
+// blk_K = blk_KCPQkc / CPQkc
+02:-:-:-:1  @P1 XMAD     div1, blk_KCPQkc,    magic_CPQkc,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, blk_KCPQkc,    magic_CPQkc.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, blk_KCPQkc.H1, magic_CPQkc.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, blk_KCPQkc.H1, magic_CPQkc,    div1;
+--:-:-:-:1  @P1 IADD3.RS blk_K, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  blk_K, blk_K,      param_shift_CPQkc;
+--:-:-:-:1 @!P1 SHR.U32  blk_K, blk_KCPQkc, param_shift_CPQkc;
+
+// blk_CPQkc = blk_KCPQkc % CPQkc
+--:-:-:-:1      XMAD.LO2 blk_CPQkc, neg_CPQkc, blk_K, blk_KCPQkc;
+
+// blk_C = blk_CPQkc / PQkc
+--:-:-:-:1  @P2 XMAD     div1, blk_CPQkc,    magic_PQkc,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, blk_CPQkc,    magic_PQkc.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, blk_CPQkc.H1, magic_PQkc.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, blk_CPQkc.H1, magic_PQkc,    div1;
+--:-:-:-:1  @P2 IADD3.RS blk_C, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  blk_C, blk_C,     param_shift_PQkc;
+--:-:-:-:1 @!P2 SHR.U32  blk_C, blk_CPQkc, param_shift_PQkc;
+
+// blk_PQkc = blk_CPQkc % PQkc
+--:-:-:-:1      XMAD.LO2 blk_PQkc, neg_PQkc, blk_C, blk_CPQkc;
+
+// blk_P = blk_PQkc / Qkc
+--:-:-:-:1  @P3 XMAD     div1, blk_PQkc,    magic_Qkc,    RZ;
+--:-:-:-:1  @P3 XMAD     div2, blk_PQkc,    magic_Qkc.H1, RZ;
+--:-:-:-:1  @P3 XMAD     div3, blk_PQkc.H1, magic_Qkc.H1, RZ;
+--:-:-:-:1  @P3 XMAD.CHI div1, blk_PQkc.H1, magic_Qkc,    div1;
+--:-:-:-:1  @P3 IADD3.RS blk_P, div1, div2, div3;
+--:-:-:-:1  @P3 SHR.U32  blk_P, blk_P,    param_shift_Qkc;
+--:-:-:-:1 @!P3 SHR.U32  blk_P, blk_PQkc, param_shift_Qkc;
+
+// blk_Qkc = blk_PQkc % Qkc
+--:-:-:-:1      XMAD.LO2 blk_Qkc, neg_Qkc, blk_P, blk_PQkc;
+
+// blk_Q  = blk_Qkc / kc
+--:-:-:-:1      XMAD.LO2C blk_Q, blk_Qkc, param_magic_kc, RZ;
+--:-:-:-:1      SHR.U32 blk_Q, blk_Q, param_shift_kc;
+// blk_kc = blk_Qkc % kc
+--:-:-:-:1      XMAD.S16.U16  blk_kc, neg_kc, blk_Q, blk_Qkc;
+
+// blk_k = blk_kc / c
+--:-:-:-:1      XMAD    blk_k,  blk_kc, param_magic_c, RZ;
+--:-:-:-:1      SHR.U32 blk_k,  blk_k,  param_shift_c;
+// blk_c = blk_kc % c
+--:-:-:-:1      XMAD.S16.U16 blk_c, neg_c, blk_k, blk_kc;
+
+// blk_K = blk_K*param_k + blk_k
+--:-:-:-:1      XMAD blk_K, blk_K, param_k, blk_k;
+// blk_C = blk_C*param_c + blk_c
+--:-:-:-:1      XMAD blk_C, blk_C, param_c, blk_c;
+
+// Spill these block constants to shared (written by thread 0 only, via P5)
+--:-:-:-:1      ISETP.EQ.AND P5, PT, tid, RZ, PT;
+--:-:-:-:1  @P5 STS [addr_blk_K], blk_K;
+--:-:-:-:1  @P5 STS [addr_blk_C], blk_C;
+--:-:-:-:1  @P5 STS [addr_blk_P], blk_P;
+--:-:-:-:1  @P5 STS [addr_blk_Q], blk_Q;
+
+// gxs = blk_Q
+// gys = blk_P
+--:-:-:-:1      MOV gxs, blk_Q;
+--:-:-:-:1      MOV gys, blk_P;
+
+[+
+    our $IX;
+    return $IX ? '' : q{
+--:-:-:-:1      BFE.U32 n, tid, param_superN;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+
+// tidX = (tid & 127) >> 2
+// tidY = tid & 3
+// writeS = tidY*512 + tidX + (tidY << 3)
+--:-:-:-:1      BFE.U32 tidX, tid, 0x502; // 5 bits at position 2
+--:-:-:-:1      LOP.AND tidY, tid, 3;
+--:-:-:-:1      ISCADD writeS, tidY, tidX, 9;
+--:-:-:-:1      ISCADD writeS, tidY, writeS, 3;
+--:-:-:-:1      SHL    writeS, writeS,  2;
+    };
++]
+
+// readEs = ((((tid & -16) >> 1) | ((tid >> 1) & 3)) << 4) + 4x<512*4 + 32>
+// readIs = (((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND  tid16,  tid,   -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,  1;
+
+--:-:-:-:1      LOP.AND  tid1,   tid,    1;
+--:-:-:-:1      LOP.AND  readIs, tid,    8;
+--:-:-:-:1      SHR.U32  readIs, readIs, 2;
+--:-:-:-:1      LOP3.LUT readIs, readIs, tid16, tid1, 0xfe;
+--:-:-:-:1      SHL      readIs, readIs, 4;
+
+--:-:-:-:1      BFE.U32  readEs, tid,    0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readEs, readEs, tid16;
+--:-:-:-:1      ISCADD   readEs, readEs, 4x<512*4 + 32>, 4;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P0 BRA.U ERROR_SETUP;
+
+[+
+    our ($IX, $dtype_shift);  # the instructions below are emitted only when $IX is set; otherwise this expands to an empty string
+    return $IX ? qq{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV swapBuffer, 4x<(512*4 + 32)*2>;
+
+// tidY = (tid & 127) / 32
+--:-:-:-:1      BFE.U32 tidY, tid, 0x205; // 2 bits at position 5
+--:-:-:-:1      BFE.U32 n, tid, param_superNI;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+
+// writeS = (tidY*512 + (tid & 31)*4)*4
+--:-:-:-:1      LOP.AND tidX,   tid, 31;
+--:-:-:-:1      SHL     writeS, tidX, 4;
+--:-:-:-:1      ISCADD  writeS, tidY, writeS, 11;
+// offsetI = I + (tid & 31)*4
+--:-:-:-:1      LEA      offsetI0.CC, tidX, param_I[0],     1x<$dtype_shift + 2>;
+--:-:-:-:1      LEA.HI.X offsetI1,    tidX, param_I[1], RZ, 1x<$dtype_shift + 2>;
+
+</SCHEDULE_BLOCK>
+    } : '';
++]
+
+--:-:-:-:0      MOV blkC, blk_C; // blkC is the name IMAGE_OFFSET reads
+
+// IMAGE_SETUP - compute offsets, issue the first load, then compute offsets for the following load
+--:-:-:-:5      CAL IMAGE_OFFSET;
+--:-:-:-:5      CAL IMAGE_LOAD;
+--:-:-:-:5      CAL IMAGE_OFFSET;
+
+[+
+    our ($convert_in, $IX);  # nothing is emitted unless $convert_in is non-empty
+    if ($convert_in)
+    {
+        my $out = $IX ? qq{
+02:-:-:-:1      $convert_in I03, I01.H1;
+--:-:-:-:1      $convert_in I02, I01.H0;
+--:-:-:-:1      $convert_in I01, I00.H1;
+--:-:2:-:1      $convert_in I00, I00.H0;
+
+04:-:-:-:1      $convert_in I13, I11.H1;
+--:-:-:-:1      $convert_in I12, I11.H0;
+--:-:-:-:1      $convert_in I11, I10.H1;
+--:-:3:-:1      $convert_in I10, I10.H0;
+
+08:-:-:-:1      $convert_in I23, I21.H1;
+--:-:-:-:1      $convert_in I22, I21.H0;
+--:-:-:-:1      $convert_in I21, I20.H1;
+--:-:4:-:1      $convert_in I20, I20.H0;
+
+10:-:-:-:1      $convert_in I33, I31.H1;
+--:-:-:-:1      $convert_in I32, I31.H0;
+--:-:-:-:1      $convert_in I31, I30.H1;
+--:-:5:-:1      $convert_in I30, I30.H0;
+        } : qq{
+02:-:-:-:1      $convert_in y0x0, y0x0;
+--:-:-:-:1      $convert_in y0x1, y0x1;
+--:-:-:-:1      $convert_in y0x2, y0x2;
+--:-:2:-:1      $convert_in y0x3, y0x3;
+
+04:-:-:-:1      $convert_in y2x0, y2x0;
+--:-:-:-:1      $convert_in y2x1, y2x1;
+--:-:-:-:1      $convert_in y2x2, y2x2;
+--:-:3:-:1      $convert_in y2x3, y2x3;
+
+08:-:-:-:1      $convert_in y1x0, y1x0;
+--:-:-:-:1      $convert_in y1x1, y1x1;
+--:-:-:-:1      $convert_in y1x2, y1x2;
+--:-:4:-:1      $convert_in y1x3, y1x3;
+
+10:-:-:-:1      $convert_in y3x0, y3x0;
+--:-:-:-:1      $convert_in y3x1, y3x1;
+--:-:-:-:1      $convert_in y3x2, y3x2;
+--:-:5:-:1      $convert_in y3x3, y3x3;
+        };
+        return qq{
+<SCHEDULE_BLOCK>
+<ORDERED>
+$out
+</ORDERED>
+--:-:-:-:1      NOP; # we need 20 total conversions.  that's 4 short of instruction 2 cache lines
+--:-:-:-:1      NOP;
+--:-:-:-:1      NOP;
+--:-:-:-:1      NOP;
+</SCHEDULE_BLOCK>
+        };
+    }
+    return '';
++]
+
+[+
+    our $IX;  # when set, the loaded I0-I3 vectors are stored unchanged; otherwise the FADD transform runs before the stores
+    return $IX ? q{
+02:-:-:-:1      STS.128 [writeS + 4x<00*4>], I0;
+04:-:-:-:1      STS.128 [writeS + 4x<32*4>], I1;
+08:-:-:-:1      STS.128 [writeS + 4x<64*4>], I2;
+10:-:-:-:1      STS.128 [writeS + 4x<96*4>], I3;
+
+// init = bNextY ? 1 : 0
+--:-:-:-:0      SEL pred_bits, RZ, 1, !P6;
+
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeS, writeS,  swapBuffer;
+--:-:-:-:0      IADD swapBuffer, RZ, -swapBuffer;
+
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>];
+
+--:-:-:-:5      CAL IMAGE_LOAD;
+
+// init += bNextY ? 1 : 0
+--:-:-:-:0  @P6 IADD pred_bits, pred_bits, 1;
+
+--:-:-:-:5      CAL IMAGE_OFFSET;
+--:-:-:-:5      BRA.U IMAGE_LOOP;
+    } : q{
+<SCHEDULE_BLOCK>
+<ORDERED>
+06:-:-:-:1      FADD Y0X0, y0x0, -y2x0;
+--:-:-:-:1      FADD Y0X1, y0x1, -y2x1;
+--:-:-:-:1      FADD Y0X2, y0x2, -y2x2;
+--:-:-:-:1      FADD Y0X3, y0x3, -y2x3;
+--:-:-:-:1      FADD I00,  Y0X0, -Y0X2;
+--:-:-:-:1      FADD I03, -Y0X1,  Y0X3;
+--:-:-:-:1      FADD I01,  Y0X1,  Y0X2;
+--:-:-:-:1      FADD I02,  Y0X2, -Y0X1;
+--:-:-:-:1      STS [writeS + 4x<32*00>], I00;
+--:-:-:-:1      STS [writeS + 4x<32*03>], I03;
+--:-:-:-:1      STS [writeS + 4x<32*01>], I01;
+--:6:-:-:1      STS [writeS + 4x<32*02>], I02;
+18:-:-:-:1      FADD Y3X0, -y1x0, y3x0;
+--:-:-:-:1      FADD Y3X1, -y1x1, y3x1;
+--:-:-:-:1      FADD Y3X2, -y1x2, y3x2;
+--:-:-:-:1      FADD Y3X3, -y1x3, y3x3;
+--:-:-:-:1      FADD I12,  Y3X0, -Y3X2;
+--:-:-:-:1      FADD I15, -Y3X1,  Y3X3;
+--:-:-:-:1      FADD I13,  Y3X1,  Y3X2;
+--:-:-:-:1      FADD I14,  Y3X2, -Y3X1;
+--:-:-:-:1      STS [writeS + 4x<32*12>], I12;
+--:-:-:-:1      STS [writeS + 4x<32*15>], I15;
+--:-:-:-:1      STS [writeS + 4x<32*13>], I13;
+--:-:-:-:1      STS [writeS + 4x<32*14>], I14;
+20:-:-:-:1      FADD Y1X0, y1x0,  y2x0;
+--:-:-:-:1      FADD Y1X1, y1x1,  y2x1;
+--:-:-:-:1      FADD Y1X2, y1x2,  y2x2;
+--:-:-:-:1      FADD Y1X3, y1x3,  y2x3;
+--:-:-:-:1      FADD Y2X0, y2x0, -y1x0;
+--:-:-:-:1      FADD Y2X1, y2x1, -y1x1;
+--:-:-:-:1      FADD Y2X2, y2x2, -y1x2;
+--:-:-:-:1      FADD Y2X3, y2x3, -y1x3;
+--:-:-:-:1      FADD I04,  Y1X0, -Y1X2;
+--:-:-:-:1      FADD I05,  Y1X1,  Y1X2;
+--:-:-:-:1      FADD I06,  Y1X2, -Y1X1;
+--:-:-:-:1      FADD I07, -Y1X1,  Y1X3;
+--:-:-:-:1      STS [writeS + 4x<32*04>], I04;
+--:-:-:-:1      STS [writeS + 4x<32*05>], I05;
+--:-:-:-:1      STS [writeS + 4x<32*06>], I06;
+--:-:-:-:1      STS [writeS + 4x<32*07>], I07;
+--:-:-:-:1      FADD I08,  Y2X0, -Y2X2;
+--:-:-:-:1      FADD I11, -Y2X1,  Y2X3;
+--:-:-:-:1      FADD I09,  Y2X1,  Y2X2;
+--:-:-:-:1      FADD I10,  Y2X2, -Y2X1;
+--:-:-:-:1      STS [writeS + 4x<32*08>], I08;
+--:-:-:-:1      STS [writeS + 4x<32*11>], I11;
+--:-:-:-:1      STS [writeS + 4x<32*09>], I09;
+--:-:-:-:1      STS [writeS + 4x<32*10>], I10;
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+// init = bNextY ? 1 : 0
+--:-:-:-:0      SEL init, RZ, 1, !P6;
+
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      IADD writeS, writeS, 4x<(512*4 + 32)*2>;
+
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>];
+
+--:-:-:-:5      CAL IMAGE_LOAD;
+
+// init += bNextY ? 1 : 0
+--:-:-:-:0  @P6 IADD init, init, 1;
+--:-:-:-:5      CAL IMAGE_OFFSET;
+--:-:-:-:0      BFI pred_bits, init, 0x214, pred_bits; // 2 bits at position 20
+--:-:-:-:5      BRA.U IMAGE_LOOP;
+    };
++]
+
+
+IMAGE_OFFSET:
+// Derives the load addresses and bounds predicates from the current gxs, gys and n (trackI and P0 with IX; track0-3 and pred_bits otherwise).
+<SCHEDULE_BLOCK>
+[+
+    our ($dtype_shift, $IX);
+    return $IX ? qq{
+
+--:-:-:-:1      BFE.U32 super_x, tid, param_superXI;
+--:-:-:-:1      BFE.U32 super_y, tid, param_superYI;
+--:-:-:-:1      SHL gx, gxs, param_shiftXI;
+--:-:-:-:1      SHL gy, gys, param_shiftYI;
+--:-:-:-:1      IADD gx, gx, super_x;
+--:-:-:-:1      IADD gy, gy, super_y;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, gx, param_GX, P4;
+--:-:-:-:1      ISETP.LT.AND P0, PT, gy, param_GY, P0;
+
+// offset = blkC*GY*GX*N + gy*GX*N + gx*N + n
+--:-:-:-:1      XMAD.U16.U16      offset, gx,   param_N,   n;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, gy,   param_XN,  offset;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, blkC, param_YXN, offset;
+
+// trackI = offsetI + offset*512
+20:-:-:-:1      LEA      trackI0.CC, offset, offsetI0,     1x<$dtype_shift + 9>;
+--:-:-:-:0      LEA.HI.X trackI1,    offset, offsetI1, RZ, 1x<$dtype_shift + 9>;
+    } : qq{
+// Calc superblock coordinates
+01:-:-:-:1      SHL x, gxs, param_shiftX;
+--:-:-:-:1      SHL y, gys, param_shiftY;
+
+// Calc this thread's sub-block coordinates
+--:-:-:-:1      BFE.U32 super_x, tid, param_superX;
+--:-:-:-:1      BFE.U32 super_y, tid, param_superY;
+--:-:-:-:1      ISCADD x, super_x,  x, 1;
+--:-:-:-:1      ISCADD y, super_y,  y, 1;
+
+// Apply padding
+--:-:-:-:1      IADD x, x, -param_pad_x;
+--:-:-:-:1      IADD y, y, -param_pad_y;
+
+// c = blkC*32 + tidX
+--:-:-:-:1      BFE.U32 tid_X, tid, 0x502; // 5 bits at position 2
+--:-:-:-:1      ISCADD c, blkC, tid_X, 5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, c, param_C, P4;
+
+// offset = c*YXN + y*XN + x*N + n
+--:-:-:-:1      XMAD.S16.U16      offset, x, param_N,   n;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, y, param_XN,  offset;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, c, param_YXN, offset;
+--:-:-:-:1      ISET.LT.AND offsign, offset, RZ, PT;
+
+20:-:-:-:1      LEA    track00.CC, offset,  param_I[0], $dtype_shift;
+--:-:-:-:1      IADD.X track01,    offsign, param_I[1];
+--:-:-:-:1      IADD   track10.CC, track00, param_Np;
+--:-:-:-:1      IADD.X track11,    track01, RZ;
+--:-:-:-:1      IADD   track20.CC, track10, param_Np;
+--:-:-:-:1      IADD.X track21,    track11, RZ;
+--:-:-:-:1      IADD   track30.CC, track20, param_Np;
+--:-:-:-:1      IADD.X track31,    track21, RZ;
+
+--:-:-:-:1      IADD x1, x, 1;
+--:-:-:-:1      IADD x2, x, 2;
+--:-:-:-:1      IADD x3, x, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, x,  param_X, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_X, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_X, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_X, P4;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x0f;
+
+--:-:-:-:1      IADD x1, y, 1;
+--:-:-:-:1      IADD x2, y, 2;
+--:-:-:-:1      IADD x3, y, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, y,  param_Y, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_Y, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_Y, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_Y, P4;
+--:-:-:-:1      ISETP.GE.AND P0, PT, y,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+
+--:-:-:-:1      SEL pred_bits, mask_x, RZ, P0;
+--:-:-:-:1  \@P1 BFI pred_bits, mask_x, 0x404, pred_bits;
+--:-:-:-:1  \@P2 BFI pred_bits, mask_x, 0x408, pred_bits;
+--:-:-:-:1  \@P3 BFI pred_bits, mask_x, 0x40c, pred_bits;
+
+// Cache y preds in high bits
+--:-:-:-:1      P2R mask_y, PR, RZ, 0x0f;
+--:-:-:-:0      BFI pred_bits, mask_y, 0x410, pred_bits; // 4 bits at position 16
+    };
++]
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
+IMAGE_LOAD:
+// Issues the image loads for the current position (predicated-off elements are filled with zero), then advances n, gxs and gys for the next call.
+<SCHEDULE_BLOCK>
+<ORDERED>
+[+
+    our ($dtype, $dtype_shift, $IX, $vec_size, $dtype_size);
+    return $IX ? qq{
+
+--:-:2:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero];
+--:-:3:-:1 \@!P0 LDS.U.$vec_size I1, [addr_zero];
+--:-:4:-:1 \@!P0 LDS.U.$vec_size I2, [addr_zero];
+--:-:5:-:1 \@!P0 LDS.U.$vec_size I3, [addr_zero];
+
+--:-:2:-:1  \@P0 LDG.E.CG.$vec_size I0, [trackI + 4x<00 * $dtype_size>];
+--:-:3:-:1  \@P0 LDG.E.CG.$vec_size I1, [trackI + 4x<32 * $dtype_size>];
+--:-:4:-:1  \@P0 LDG.E.CG.$vec_size I2, [trackI + 4x<64 * $dtype_size>];
+--:6:5:-:1  \@P0 LDG.E.CG.$vec_size I3, [trackI + 4x<96 * $dtype_size>];
+
+    } : qq{
+--:-:-:-:1      R2P PR, pred_bits, 0x0f;
+--:-:-:-:1      SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;
+
+--:-:-:-:1 \@!P0 MOV y0x0, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI$dtype y0x0, [track0];
+--:-:-:-:1 \@!P1 MOV y0x1, RZ;
+--:-:-:-:1  \@P1 LDG.E.CI$dtype y0x1, [track1];
+--:-:-:-:1 \@!P2 MOV y0x2, RZ;
+--:-:-:-:1  \@P2 LDG.E.CI$dtype y0x2, [track2];
+--:-:-:-:1 \@!P3 MOV y0x3, RZ;
+--:6:2:-:1  \@P3 LDG.E.CI$dtype y0x3, [track3];
+--:-:-:-:1      R2P PR, pred_bits, 0x0f;
+--:-:-:-:1      SHF.L.U64 pred_bits, pred_bits, 4, pred_bits;
+20:-:-:-:1      IADD   track00.CC, track00, param_2XNp;
+--:-:-:-:1      IADD.X track01,    track01, RZ;
+--:-:-:-:1      IADD   track10.CC, track10, param_2XNp;
+--:-:-:-:1      IADD.X track11,    track11, RZ;
+--:-:-:-:1      IADD   track20.CC, track20, param_2XNp;
+--:-:-:-:1      IADD.X track21,    track21, RZ;
+--:-:-:-:1      IADD   track30.CC, track30, param_2XNp;
+--:-:-:-:1      IADD.X track31,    track31, RZ;
+
+--:-:-:-:1 \@!P0 MOV y2x0, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI$dtype y2x0, [track0];
+--:-:-:-:1 \@!P1 MOV y2x1, RZ;
+--:-:-:-:1  \@P1 LDG.E.CI$dtype y2x1, [track1];
+--:-:-:-:1 \@!P2 MOV y2x2, RZ;
+--:-:-:-:1  \@P2 LDG.E.CI$dtype y2x2, [track2];
+--:-:-:-:1 \@!P3 MOV y2x3, RZ;
+--:6:3:-:1  \@P3 LDG.E.CI$dtype y2x3, [track3];
+--:-:-:-:1      R2P PR, pred_bits, 0x0f;
+--:-:-:-:1      SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;
+20:-:-:-:1      IADD   track00.CC, track00, -param_XNp;
+--:-:-:-:1      IADD.X track01,    track01, -RZ;
+--:-:-:-:1      IADD   track10.CC, track10, -param_XNp;
+--:-:-:-:1      IADD.X track11,    track11, -RZ;
+--:-:-:-:1      IADD   track20.CC, track20, -param_XNp;
+--:-:-:-:1      IADD.X track21,    track21, -RZ;
+--:-:-:-:1      IADD   track30.CC, track30, -param_XNp;
+--:-:-:-:1      IADD.X track31,    track31, -RZ;
+
+--:-:-:-:1 \@!P0 MOV y1x0, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI$dtype y1x0, [track0];
+--:-:-:-:1 \@!P1 MOV y1x1, RZ;
+--:-:-:-:1  \@P1 LDG.E.CI$dtype y1x1, [track1];
+--:-:-:-:1 \@!P2 MOV y1x2, RZ;
+--:-:-:-:1  \@P2 LDG.E.CI$dtype y1x2, [track2];
+--:-:-:-:1 \@!P3 MOV y1x3, RZ;
+--:6:4:-:1  \@P3 LDG.E.CI$dtype y1x3, [track3];
+--:-:-:-:1      R2P PR, pred_bits, 0x0f;
+--:-:-:-:1      SHF.L.U64 pred_bits, pred_bits, 12, pred_bits;
+20:-:-:-:1      IADD   track00.CC, track00, param_2XNp;
+--:-:-:-:1      IADD.X track01,    track01, RZ;
+--:-:-:-:1      IADD   track10.CC, track10, param_2XNp;
+--:-:-:-:1      IADD.X track11,    track11, RZ;
+--:-:-:-:1      IADD   track20.CC, track20, param_2XNp;
+--:-:-:-:1      IADD.X track21,    track21, RZ;
+--:-:-:-:1      IADD   track30.CC, track30, param_2XNp;
+--:-:-:-:1      IADD.X track31,    track31, RZ;
+
+--:-:-:-:1 \@!P0 MOV y3x0, RZ;
+--:-:-:-:1  \@P0 LDG.E.CI$dtype y3x0, [track0];
+--:-:-:-:1 \@!P1 MOV y3x1, RZ;
+--:-:-:-:1  \@P1 LDG.E.CI$dtype y3x1, [track1];
+--:-:-:-:1 \@!P2 MOV y3x2, RZ;
+--:-:-:-:1  \@P2 LDG.E.CI$dtype y3x2, [track2];
+--:-:-:-:1 \@!P3 MOV y3x3, RZ;
+--:6:5:-:1  \@P3 LDG.E.CI$dtype y3x3, [track3];
+    };
++]
+</ORDERED>
+
+// Advance offset/preds - step n by loopN; when n runs out, reset it and step gxs; when gxs runs out, reset it to blk_Q and step gys. P6 = gys still below param_GYS.
+--:-:-:-:1      IADD n, n, param_loopN;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+
+--:-:-:-:1 @!P4 BFE.U32 n, tid, param_superNI; // NOTE(review): the non-IX setup initialises n from param_superN, not param_superNI - confirm this reset is intended for both paths
+--:-:-:-:1 @!P4 IADD gxs, gxs, param_strideX;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, PT;
+
+--:-:-:-:1 @!P5 MOV  gxs, blk_Q;
+--:-:-:-:1 @!P5 IADD gys, gys, param_strideY;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, gys, param_GYS, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, P6;
+--:-:-:-:0      ISETP.LT.AND P4, PT, n,   param_N,  P6;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
+ERROR_SETUP:
+// Prologue ahead of ERROR_LOOP: load the first four values, transform and store them with STS, preload the j0 operands, then branch to ERROR_LOOP.
+[+
+    our $IX;  # when set, the n / P4 / writeS setup below is emitted as well
+    return $IX ? q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      BFE.U32 n, tid, param_superN;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT;
+
+// tidX = (tid & 127) >> 2
+// tidY = tid & 3
+// writeS = tidY*512 + tidX + (tidY << 3)
+--:-:-:-:1      BFE.U32 tidX, tid, 0x502; // 5 bits at position 2
+--:-:-:-:1      LOP.AND tidY, tid, 3;
+--:-:-:-:1      ISCADD writeS, tidY, tidX, 9;
+--:-:-:-:1      ISCADD writeS, tidY, writeS, 3;
+--:-:-:-:1      SHL    writeS, writeS,  2;
+</SCHEDULE_BLOCK>
+    } : '';
++]
+
+--:-:-:-:0      MOV blkK, blk_K; // ERROR_OFFSET reads blkK when forming c
+
+--:-:-:-:5      CAL ERROR_OFFSET; // addresses and predicates for the current n/gxs/gys
+--:-:-:-:5      CAL ERROR_LOAD; // issue the four loads, then advance n/gxs/gys
+--:-:-:-:5      CAL ERROR_OFFSET; // recompute addresses and predicates for the advanced position
+
+<SCHEDULE_BLOCK>
+[+
+    our ($convert_in);  # optional instruction text used to convert each loaded value in place
+    return $convert_in ? qq{
+<ORDERED>
+02:-:2:-:1      $convert_in p0q0, p0q0;
+04:-:3:-:1      $convert_in p0q1, p0q1;
+08:-:4:-:1      $convert_in p1q1, p1q1;
+10:-:5:-:1      $convert_in p1q0, p1q0;
+</ORDERED>
+    } : '';
++]
+// Transform p0q0/p0q1/p1q1/p1q0 and store E00..E15. E00, E03, E04, E07, E08, E11, E12 and E15 are stored but never written under those names here; presumably they alias other registers in the register map (confirm).
+<ORDERED>
+02:-:-:-:1      FMUL e0,  p0q0,  0.5;
+04:-:-:-:1      FFMA E01, p0q1,  0.5,  e0;
+--:-:-:-:1      FFMA E02, p0q1, -0.5,  e0;
+08:-:-:-:1      FMUL e1,  p1q1,  0.5;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*00 + 32>], E00;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*01 + 32>], E01;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*02 + 32>], E02;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*03 + 32>], E03;
+10:-:-:-:1      FFMA E13, p1q0,  0.5,  e1;
+--:-:-:-:1      FFMA E14, p1q0,  0.5, -e1;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*12 + 32>], E12;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*15 + 32>], E15;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*13 + 32>], E13;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*14 + 32>], E14;
+--:-:-:-:1      FFMA B0,  p1q0,  0.5,  e0;
+--:-:-:-:1      FFMA C0,  p1q0, -0.5,  e0;
+--:-:-:-:1      FFMA B1,  p0q1,  0.5,  e1;
+--:-:-:-:1      FFMA C1,  p0q1,  0.5, -e1;
+--:-:-:-:1      FMUL e2,  B0,  0.5;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*04 + 32>], E04;
+--:-:-:-:1      FMUL e3,  C0,  0.5;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*08 + 32>], E08;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*07 + 32>], E07;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*11 + 32>], E11;
+--:-:-:-:1      FFMA E05, B1,  0.5,  e2;
+--:-:-:-:1      FFMA E06, B1, -0.5,  e2;
+--:-:-:-:1      FFMA E09, C1,  0.5,  e3;
+--:-:-:-:1      FFMA E10, C1, -0.5,  e3;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*05 + 32>], E05;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*06 + 32>], E06;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*09 + 32>], E09;
+--:-:-:-:1      STS [writeS + 4x<512*4 + 32*10 + 32>], E10;
+</ORDERED>
+
+</SCHEDULE_BLOCK>
+
+// init = bNextY ? 1 : 0
+--:-:-:-:0      SEL init, RZ, 1, !P6; // P6 was last set by ERROR_LOAD from gys and param_GYS
+
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      IADD writeS, writeS, 4x<(512*4 + 32)*2>; // same step that ERROR_LOOP later applies through swapBuf
+
+--:-:-:-:1      LDS.U.128 j0Iy0, [readIs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*512 + 00>];
+--:-:-:-:1      LDS.U.128 j0Iy4, [readIs + 4x<0*512 + 16>];
+--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*512 + 16>];
+
+--:-:-:-:5      CAL ERROR_LOAD; // loads for the position prepared by the second ERROR_OFFSET call
+
+// init += bNextY ? 1 : 0
+--:-:-:-:0  @P6 IADD init, init, 1;
+--:-:-:-:5      CAL ERROR_OFFSET;
+--:-:-:-:0      BFI pred_bits, init, 0x208, pred_bits; // 2 bits at position 8
+--:-:-:-:5      BRA.U ERROR_LOOP;
+
+ERROR_OFFSET:
+// Subroutine: derive x, y and c from gxs/gys/tid/blkK, build the address registers track00..track31 from param_E, and pack the bounds predicates into pred_bits. Reads P4 on entry.
+<SCHEDULE_BLOCK>
+// Calc superblock coordinates
+01:-:-:-:1      SHL x, gxs, param_shiftX;
+--:-:-:-:1      SHL y, gys, param_shiftY;
+
+// Calc this thread's sub-block coordinates
+--:-:-:-:1      BFE.U32 super_x, tid, param_superX;
+--:-:-:-:1      BFE.U32 super_y, tid, param_superY;
+--:-:-:-:1      ISCADD x, super_x,  x, 1; // x = super_x*2 + x
+--:-:-:-:1      ISCADD y, super_y,  y, 1; // y = super_y*2 + y
+
+// k = blkK*32 + tidX  (have k share register with c)
+--:-:-:-:1      BFE.U32 tid_X, tid, 0x502; // 5 bits at position 2
+--:-:-:-:1      ISCADD c, blkK, tid_X, 5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, c, param_K, P4; // P4 = (c below param_K) AND incoming P4
+
+// offset0 = k*PQN + y*QN + x*N + n
+// offset1 = offset0 + N
+// offset2 = offset0 + QN
+// offset3 = offset1 + QN
+--:-:-:-:1      XMAD.S16.U16      offset, x, param_N,   n;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, y, param_QN,  offset;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, c, param_PQN, offset;
+
+20:-:-:-:1      LEA    track00.CC, offset,  param_E[0], [+ dtype_shift() +];
+--:-:-:-:1      IADD.X track01,    RZ,      param_E[1];
+--:-:-:-:1      IADD   track10.CC, track00, param_Np; // track10 = track00 + param_Np, carry goes into track11
+--:-:-:-:1      IADD.X track11,    track01, RZ;
+--:-:-:-:1      IADD   track20.CC, track00, param_QNp; // track20 = track00 + param_QNp
+--:-:-:-:1      IADD.X track21,    track01, RZ;
+--:-:-:-:1      IADD   track30.CC, track10, param_QNp; // track30 = track10 + param_QNp
+--:-:-:-:0      IADD.X track31,    track11, RZ;
+
+--:-:-:-:1      IADD x1, x, 1;
+--:-:-:-:1      IADD x2, y, 1; // note: x2 holds y + 1, despite its name
+// P0/P1: x and x+1 lie in 0..param_Q-1; P2/P3: y and y+1 lie in 0..param_P-1; every test is also gated by P4
+--:-:-:-:1      ISETP.LT.AND P0, PT, x,  param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_Q, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y,  param_P, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x2, param_P, P4;
+
+--:-:-:-:1      ISETP.GE.AND P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, y,  RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x2, RZ, P3;
+
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x03; // mask_x takes the P0,P1 bits (the two x tests)
+--:-:-:-:1      P2R mask_y, PR, RZ, 0x0c; // mask_y takes the P2,P3 bits (the two y tests)
+
+--:-:-:-:1      SEL pred_bits, mask_x, RZ, P2; // low two bits: the x tests when P2 holds, otherwise zero
+--:-:-:-:1  @P3 BFI pred_bits, mask_x, 0x202, pred_bits; // 2 bits at position 2
+
+// Cache y preds in high bits
+--:-:-:-:0      BFI pred_bits, mask_y, 0x404, pred_bits; // 4 bits at position 4
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
+ERROR_LOAD:
+// Subroutine: issue the four predicated loads into p0q0/p0q1/p1q1/p1q0 (MOV RZ when the predicate is off), then advance n, gxs and gys and refresh P4, P5 and P6.
+<SCHEDULE_BLOCK>
+<ORDERED>
+--:-:-:-:1      R2P PR, pred_bits, 0x0f; // P0..P3 come from the low four bits packed by ERROR_OFFSET
+--:-:-:-:1 @!P0 MOV p0q0, RZ;
+--:-:2:-:1  @P0 LDG.E.CI[+ dtype() +] p0q0, [track0];
+--:-:-:-:1 @!P1 MOV p0q1, RZ;
+--:-:3:-:1  @P1 LDG.E.CI[+ dtype() +] p0q1, [track1];
+--:-:-:-:1 @!P3 MOV p1q1, RZ;
+--:-:4:-:1  @P3 LDG.E.CI[+ dtype() +] p1q1, [track3];
+--:-:-:-:1 @!P2 MOV p1q0, RZ;
+--:6:5:-:1  @P2 LDG.E.CI[+ dtype() +] p1q0, [track2];
+
+</ORDERED>
+
+// Advance offset/preds
+--:-:-:-:1      IADD n, n, param_loopN;
+--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, PT; // P4: n is still below param_N
+
+--:-:-:-:1 @!P4 BFE.U32 n, tid, param_superN; // n ran past param_N: reset it from the tid bit field
+--:-:-:-:1 @!P4 IADD gxs, gxs, param_strideX; // and step gxs
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, PT;
+
+--:-:-:-:1 @!P5 MOV  gxs, blk_Q; // gxs ran past param_GXS: restart it from blk_Q
+--:-:-:-:1 @!P5 IADD gys, gys, param_strideY; // and step gys
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, gys, param_GYS, PT; // P6: gys is still below param_GYS
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, P6; // P5 and P4 are re-evaluated, now also gated by P6
+--:-:-:-:0      ISETP.LT.AND P4, PT, n,   param_N,   P6;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
+
+
+IMAGE_LOOP:
+// Generated main loop: four unrolled iterations of 64 FFMAs each, with the instructions from the insert table woven in after chosen FFMAs.
+[+
+    our ($dtype, $dtype_shift, $dtype_size, $vec_size, $convert_in, $IX);
+    my %insert = (
+        # Keys look like j1c36: j is the unroll iteration (0-3) and c the FFMA slot (0-63); the value is emitted right after that FFMA.
+        $IX ? (
+            # $IX variant: each step stores the previously loaded vector with STS.128, then issues the next LDG (or an LDS from addr_zero when P0 is off).
+            j0c8  => "--:-:-:-:1      ISETP.LT.AND P0, PT, gx, param_GX, P6;\n",
+            j0c20 => "--:-:-:-:1      ISETP.LT.AND P0, PT, gy, param_GY, P0;\n",
+
+            j1c10 => "20:-:-:-:1  \@P0 LEA      trackI0.CC, offset, offsetI0,     1x<$dtype_shift + 9>;\n",
+            j1c15 => "--:-:-:-:1  \@P0 LEA.HI.X trackI1,    offset, offsetI1, RZ, 1x<$dtype_shift + 9>;\n",
+
+            j1c32 => "02:2:-:-:1      STS.128 [writeS + 4x<00*4>], I0;\n",
+            j1c36 => "02:-:2:-:1  \@P0 LDG.E.CG.$vec_size I0, [trackI + 4x<00 * $dtype_size>];\n",
+            j1c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I0, [addr_zero];\n",
+
+            j1c56 => "04:3:-:-:1      STS.128 [writeS + 4x<32*4>], I1;\n",
+            j1c60 => "04:-:3:-:1  \@P0 LDG.E.CG.$vec_size I1, [trackI + 4x<32 * $dtype_size>];\n",
+            j1c62 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I1, [addr_zero];\n",
+
+
+            j2c32 => "08:4:-:-:1      STS.128 [writeS + 4x<64*4>], I2;\n",
+            j2c36 => "08:-:4:-:1  \@P0 LDG.E.CG.$vec_size I2, [trackI + 4x<64 * $dtype_size>];\n",
+            j2c38 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I2, [addr_zero];\n",
+
+
+            j2c56 => "10:5:-:-:1      STS.128 [writeS + 4x<96*4>], I3;\n",
+            j2c60 => "10:6:5:-:1  \@P0 LDG.E.CG.$vec_size I3, [trackI + 4x<96 * $dtype_size>];\n",
+            j2c62 => "--:-:-:-:1 \@!P0 LDS.U.$vec_size I3, [addr_zero];\n",
+
+            $convert_in ? (
+                j1c16 => "02:-:-:-:1      $convert_in I03, I01.H1;\n",
+                j1c20 => "--:-:-:-:1      $convert_in I02, I01.H0;\n",
+                j1c24 => "--:-:-:-:1      $convert_in I01, I00.H1;\n",
+                j1c28 => "--:-:2:-:1      $convert_in I00, I00.H0;\n",
+
+                j1c40 => "04:-:-:-:1      $convert_in I13, I11.H1;\n",
+                j1c44 => "--:-:-:-:1      $convert_in I12, I11.H0;\n",
+                j1c48 => "--:-:-:-:1      $convert_in I11, I10.H1;\n",
+                j1c52 => "--:-:3:-:1      $convert_in I10, I10.H0;\n",
+
+                j2c16 => "08:-:-:-:1      $convert_in I23, I21.H1;\n",
+                j2c20 => "--:-:-:-:1      $convert_in I22, I21.H0;\n",
+                j2c24 => "--:-:-:-:1      $convert_in I21, I20.H1;\n",
+                j2c28 => "--:-:4:-:1      $convert_in I20, I20.H0;\n",
+
+                j2c40 => "10:-:-:-:1      $convert_in I33, I31.H1;\n",
+                j2c44 => "--:-:-:-:1      $convert_in I32, I31.H0;\n",
+                j2c48 => "--:-:-:-:1      $convert_in I31, I30.H1;\n",
+                j2c52 => "--:-:5:-:1      $convert_in I30, I30.H0;\n",
+            ) : (),
+
+            j2c63 => "--:-:-:-:1      IADD n,      n,      param_loopN;\n" .
+                     "--:-:-:-:0      IADD offset, offset, param_loopN;\n".
+                     "--:-:-:-:5      BAR.SYNC 0;\n" .
+                     "--:-:-:-:1      IADD readIs, readIs, -swapBuffer;\n" .
+                     "--:-:-:-:1      IADD readEs, readEs, -swapBuffer;\n" .
+                     "--:-:-:-:1      IADD writeS, writeS,  swapBuffer;\n" .
+                     "--:-:-:-:1      IADD swapBuffer, RZ, -swapBuffer;\n",
+
+            j3c8  => "--:-:-:-:1      PSETP.OR.AND P4, PT, P5, P6, PT;\n",
+            j3c21 => "--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, P4;\n",
+
+            j3c34 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n",
+
+            j3c63 => "--:-:-:Y:5  \@P4 BRA.U IMAGE_LOOP;\n",
+
+        ) : (
+            # Variant without $IX: builds I00..I15 from the 4x4 set y0x0..y3x3 with FADDs, stores them with STS, and interleaves the loads for the next tile.
+            $convert_in ? (
+                j0c37 => "02:-:-:-:1      $convert_in y0x0, y0x0;\n",
+                j0c41 => "--:-:-:-:1      $convert_in y0x1, y0x1;\n",
+                j0c45 => "--:-:-:-:1      $convert_in y0x2, y0x2;\n",
+                j0c49 => "--:-:2:-:1      $convert_in y0x3, y0x3;\n",
+
+                j0c53 => "04:-:-:-:1      $convert_in y2x0, y2x0;\n",
+                j0c57 => "--:-:-:-:1      $convert_in y2x1, y2x1;\n",
+                j0c61 => "--:-:-:-:1      $convert_in y2x2, y2x2;\n",
+                j1c1  => "--:-:3:-:1      $convert_in y2x3, y2x3;\n",
+
+                j1c5  => "08:-:-:-:1      $convert_in y1x0, y1x0;\n",
+                j1c10 => "--:-:-:-:1      $convert_in y1x1, y1x1;\n",
+                j1c14 => "--:-:-:-:1      $convert_in y1x2, y1x2;\n",
+                j1c16 => "--:-:4:-:1      $convert_in y1x3, y1x3;\n",
+
+                j1c21 => "10:-:-:-:1      $convert_in y3x0, y3x0;\n",
+                j1c23 => "--:-:-:-:1      $convert_in y3x1, y3x1;\n",
+                j1c27 => "--:-:-:-:1      $convert_in y3x2, y3x2;\n",
+                j1c29 => "--:-:5:-:1      $convert_in y3x3, y3x3;\n",
+            ) : (),
+            # Rows are combined first (Y0 = y0 - y2, Y1 = y1 + y2, Y2 = y2 - y1, Y3 = y3 - y1); the same four combinations are then taken across x to give I00..I15.
+            j1c22 => "06:-:-:-:1      FADD Y0X0, y0x0, -y2x0;\n" .
+                     "--:-:-:-:1      FADD Y0X1, y0x1, -y2x1;\n",
+
+            j1c24 => "--:-:-:-:1      FADD Y0X2, y0x2, -y2x2;\n" .
+                     "--:-:-:-:1      FADD Y0X3, y0x3, -y2x3;\n",
+
+            j1c28 => "--:-:-:-:1      FADD I00,  Y0X0, -Y0X2;\n" .
+                     "--:-:-:-:1      FADD I03, -Y0X1,  Y0X3;\n",
+            j1c30 => "--:-:-:-:1      FADD I01,  Y0X1,  Y0X2;\n" .
+                     "--:-:-:-:1      FADD I02,  Y0X2, -Y0X1;\n",
+
+            j1c31 => "--:-:-:-:1      STS [writeS + 4x<32*00>], I00;\n",
+            j1c33 => "--:-:-:-:1      STS [writeS + 4x<32*03>], I03;\n",
+            j1c35 => "--:-:-:-:1      STS [writeS + 4x<32*01>], I01;\n",
+            j1c37 => "--:2:-:-:1      STS [writeS + 4x<32*02>], I02;\n",
+
+            j1c39 => "18:-:-:-:1      FADD Y3X0, -y1x0, y3x0;\n" .
+                     "--:-:-:-:1      FADD Y3X1, -y1x1, y3x1;\n" .
+                     "--:-:-:-:1      FADD Y3X2, -y1x2, y3x2;\n" .
+                     "--:-:-:-:1      FADD Y3X3, -y1x3, y3x3;\n",
+
+            j1c43 => "--:-:-:-:1      FADD I12,  Y3X0, -Y3X2;\n" .
+                     "--:-:-:-:1      FADD I15, -Y3X1,  Y3X3;\n" .
+                     "--:-:-:-:1      FADD I13,  Y3X1,  Y3X2;\n" .
+                     "--:-:-:-:1      FADD I14,  Y3X2, -Y3X1;\n",
+
+            j1c44 => "--:-:-:-:1      STS [writeS + 4x<32*12>], I12;\n",
+            j1c46 => "--:-:-:-:1      STS [writeS + 4x<32*15>], I15;\n",
+            j1c48 => "--:-:-:-:1      STS [writeS + 4x<32*13>], I13;\n",
+            j1c50 => "--:-:-:-:1      STS [writeS + 4x<32*14>], I14;\n",
+
+            j1c52 => "--:-:-:-:1      R2P PR, pred_bits, 0x0f;\n" .
+                     "--:-:-:-:1      SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;\n",
+
+            j1c53 => "--:-:-:-:1  \@P6 ISET.LT.AND off_sign, offset, RZ, PT;\n" .
+                     "--:-:-:-:1  \@P6 LEA    track00.CC, offset,  param_I[0], $dtype_shift;\n",
+
+            j1c58 => "--:-:-:-:1  \@P6 IADD.X track01,    off_sign, param_I[1];\n" .
+                     "--:-:-:-:1  \@P6 IADD   track10.CC, track00, param_Np;\n",
+
+            j2c18 => "--:-:-:-:1      FADD Y1X0, y1x0,  y2x0;\n" .
+                     "--:-:-:-:1      FADD Y1X1, y1x1,  y2x1;\n" .
+                     "--:-:-:-:1      FADD Y1X2, y1x2,  y2x2;\n" .
+                     "--:-:-:-:1      FADD Y1X3, y1x3,  y2x3;\n" .
+                     "--:-:-:-:1      FADD Y2X0, y2x0, -y1x0;\n" .
+                     "--:-:-:-:1      FADD Y2X1, y2x1, -y1x1;\n" .
+                     "--:-:-:-:1      FADD Y2X2, y2x2, -y1x2;\n" .
+                     "--:-:-:-:1      FADD Y2X3, y2x3, -y1x3;\n" .
+                     "--:-:-:-:1      FADD I04,  Y1X0, -Y1X2;\n" .
+                     "--:-:-:-:1      FADD I05,  Y1X1,  Y1X2;\n" .
+                     "--:-:-:-:1      FADD I06,  Y1X2, -Y1X1;\n" .
+                     "--:-:-:-:1      FADD I07, -Y1X1,  Y1X3;\n",
+
+            j2c19 => "--:-:-:-:1      STS [writeS + 4x<32*04>], I04;\n",
+            j2c21 => "--:-:-:-:1      STS [writeS + 4x<32*05>], I05;\n",
+            j2c23 => "--:-:-:-:1      STS [writeS + 4x<32*06>], I06;\n",
+            j2c25 => "--:-:-:-:1      STS [writeS + 4x<32*07>], I07;\n",
+
+            j2c27 => "--:-:-:-:1  \@P6 IADD.X track11,    track01, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track20.CC, track10, param_Np;\n",
+
+            j2c31 => "--:-:-:-:1      FADD I08,  Y2X0, -Y2X2;\n" .
+                     "--:-:-:-:1      FADD I11, -Y2X1,  Y2X3;\n" .
+                     "--:-:-:-:1      FADD I09,  Y2X1,  Y2X2;\n" .
+                     "--:-:-:-:1      FADD I10,  Y2X2, -Y2X1;\n",
+
+            j2c32 => "--:-:-:-:1      STS [writeS + 4x<32*08>], I08;\n",
+            j2c34 => "--:-:-:-:1      STS [writeS + 4x<32*11>], I11;\n",
+            j2c36 => "--:-:-:-:1      STS [writeS + 4x<32*09>], I09;\n",
+            j2c38 => "--:-:-:-:1      STS [writeS + 4x<32*10>], I10;\n",
+
+            j2c40 => "--:-:-:-:1  \@P6 IADD.X track21,    track11, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track30.CC, track20, param_Np;\n",
+
+            j2c44 => "--:-:-:-:1      LOP.AND.NZ P4, RZ, pred_bits, 0x4000;\n" .
+                     "--:-:-:-:1      LOP.XOR pred_bits, pred_bits, 0x4000;\n",
+
+            j2c46 => "--:-:-:-:1  \@P6 IADD.X track31,    track21, RZ;\n" .
+                     "--:-:-:-:1      IADD n, n, param_loopN;\n" .
+                     "--:-:-:-:1      IADD offset, offset, param_loopN;\n",
+            # P4 was taken at j2c44 from bit 0x4000 of pred_bits (toggled there on every pass); it picks the sign of swapBuf, which then moves readIs/readEs one way and writeS the other.
+            j2c62 => "--:-:-:-:1  \@P4 MOV swapBuf,  4x<(512*4 + 32)*2>;\n" .
+                     "--:-:-:-:1 \@!P4 MOV swapBuf, -4x<(512*4 + 32)*2>;\n",
+
+            j2c63 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                     "--:-:-:-:0      IADD readIs, readIs, -swapBuf;\n" .
+                     "--:-:-:-:1 \@!P0 I2I.U32.U32 y0x0, RZ;\n" .
+                     "--:-:-:-:0      IADD readEs, readEs, -swapBuf;\n" .
+                     "--:-:-:-:1  \@P0 LDG.E.CI$dtype y0x0, [track0];\n" .
+                     "--:-:-:-:0      IADD writeS, writeS,  swapBuf;\n" .
+                     "--:-:-:-:1 \@!P1 I2I.U32.U32 y0x1, RZ;\n" .
+                     "--:-:-:-:1  \@P1 LDG.E.CI$dtype y0x1, [track1];\n",
+            # j3 slots: finish the y0 loads, then issue rows y2, y1 and y3; each R2P refreshes P0-P3 from pred_bits before the next row is loaded.
+            j3c0  => "--:-:-:-:1 \@!P2 I2I.U32.U32 y0x2, RZ;\n",
+            j3c1  => "--:-:-:-:1  \@P2 LDG.E.CI$dtype y0x2, [track2];\n",
+            j3c2  => "--:-:-:-:1 \@!P3 I2I.U32.U32 y0x3, RZ;\n",
+            j3c3  => "--:6:2:-:1  \@P3 LDG.E.CI$dtype y0x3, [track3];\n" .
+                     "--:-:-:Y:8      R2P PR, pred_bits, 0x0f;\n" .
+                     "20:-:-:-:1  \@P6 IADD   track00.CC, track00, param_2XNp;\n" .
+                     "--:-:-:-:1      SHF.L.U64 pred_bits, pred_bits, 4, pred_bits;\n",
+
+            j3c7  => "--:-:-:-:1 \@!P0 I2I.U32.U32 y2x0, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track01,    track01, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track10.CC, track10, param_2XNp;\n",
+
+            j3c9  => "--:-:-:-:1      PSETP.OR.AND P4, PT, P5, P6, PT;\n",
+
+            j3c11 => "--:-:-:-:1  \@P0 LDG.E.CI$dtype y2x0, [track0];\n" .
+                     "--:-:-:-:0  \@P6 IADD.X track11,    track11, RZ;\n" .
+                     "--:-:-:-:1 \@!P1 I2I.U32.U32 y2x1, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track20.CC, track20, param_2XNp;\n",
+
+            j3c12 => "--:-:-:-:1  \@P1 LDG.E.CI$dtype y2x1, [track1];\n",
+
+            j3c16 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y2x2, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track21,    track21, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track30.CC, track30, param_2XNp;\n",
+
+            j3c17 => "--:-:-:-:1  \@P2 LDG.E.CI$dtype y2x2, [track2];\n",
+
+
+            j3c21 => "--:-:-:-:1 \@!P3 I2I.U32.U32 y2x3, RZ;\n" .
+                     "--:-:-:-:2  \@P6 IADD.X track31,    track31, RZ;\n",
+
+            j3c22 => "--:6:3:-:1  \@P3 LDG.E.CI$dtype y2x3, [track3];\n" .
+                     "--:-:-:Y:8      R2P PR, pred_bits, 0x0f;\n" .
+                     "20:-:-:-:1  \@P6 IADD   track00.CC, track00, -param_XNp;\n" .
+                     "--:-:-:-:1      SHF.R.U64 pred_bits, pred_bits, 8, pred_bits;\n",
+
+            j3c23 => "--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, P4;\n",
+
+
+            j3c25 => "--:-:-:-:1 \@!P0 I2I.U32.U32 y1x0, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track01,    track01, -RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track10.CC, track10, -param_XNp;\n",
+
+            j3c26 => "--:-:-:-:1  \@P0 LDG.E.CI$dtype y1x0, [track0];\n",
+
+            j3c30 => "--:-:-:-:1 \@!P1 I2I.U32.U32 y1x1, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track11,    track11, -RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track20.CC, track20, -param_XNp;\n",
+
+            j3c31 => "--:-:-:-:1  \@P1 LDG.E.CI$dtype y1x1, [track1];\n",
+
+            j3c33 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n",
+
+            j3c35 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y1x2, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track21,    track21, -RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track30.CC, track30, -param_XNp;\n",
+
+            j3c36 => "--:-:-:-:1  \@P2 LDG.E.CI$dtype y1x2, [track2];\n",
+
+            j3c40 => "--:-:-:-:1 \@!P3 I2I.U32.U32 y1x3, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track31,    track31, -RZ;\n",
+
+            j3c42 => "--:6:4:-:1  \@P3 LDG.E.CI$dtype y1x3, [track3];\n" .
+                     "--:-:-:Y:8      R2P PR, pred_bits, 0x0f;\n" .
+                     "20:-:-:-:1  \@P6 IADD   track00.CC, track00, param_2XNp;\n" .
+                     "--:-:-:-:1      SHF.L.U64 pred_bits, pred_bits, 12, pred_bits;\n",
+
+            j3c46 => "--:-:-:-:1 \@!P0 I2I.U32.U32 y3x0, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track01,    track01, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track10.CC, track10, param_2XNp;\n",
+
+            j3c47 => "--:-:-:-:1  \@P0 LDG.E.CI$dtype y3x0, [track0];\n",
+
+            j3c51 => "--:-:-:-:1 \@!P1 I2I.U32.U32 y3x1, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track11,    track11, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track20.CC, track20, param_2XNp;\n",
+
+            j3c52 => "--:-:-:-:1  \@P1 LDG.E.CI$dtype y3x1, [track1];\n",
+
+            j3c56 => "--:-:-:-:1 \@!P2 I2I.U32.U32 y3x2, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track21,    track21, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD   track30.CC, track30, param_2XNp;\n",
+
+            j3c57 => "--:-:-:-:1  \@P2 LDG.E.CI$dtype y3x2, [track2];\n",
+
+            j3c60 => "--:-:-:-:2 \@!P3 I2I.U32.U32 y3x3, RZ;\n" .
+                     "--:-:-:-:1  \@P6 IADD.X track31,    track31, RZ;\n",
+
+            j3c62 => "--:6:5:-:1  \@P3 LDG.E.CI$dtype y3x3, [track3];\n",
+
+            j3c63 => "--:-:-:Y:5  \@P4 BRA.U IMAGE_LOOP;\n",
+        )
+    );
+    my @cOrder;  # order in which the 64 (x,y) accumulators are visited
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);  # four offsets applied to every base (x,y)
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;  # sweep y in the opposite direction for the next x
+    }
+    my $out;  # accumulated assembly text returned to the template
+    foreach my $j (0 .. 3)
+    {
+        my $odd      = $j & 1;  # operand set (j0 or j1) read by this iteration's FFMAs
+        my $nOdd     = !$odd + 0;  # the other set, filled by this iteration's LDS
+        my $rsOffset = ($j + 1) % 4;  # multiplier for the per-step stride in the LDS address
+        my $bankOffset = $IX ? 0 : 8;  # extra stride on the readIs side, only without $IX
+        # LDS slots are normally 0,2,4,6; for j == 3 without $IX they move to 4,6,8,10 because j3c0-j3c3 are already taken in the table.
+        my ($c0, $c2, $c4, $c6) = $j == 3 && !$IX ? (4,6,8,10) : (0,2,4,6);
+
+        $insert{"j${j}c$c0"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy0, [readIs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, $bankOffset;
+        $insert{"j${j}c$c2"} = sprintf "--:-:-:-:1      LDS.U.128 j%dEx0, [readEs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, 8;
+        $insert{"j${j}c$c4"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy4, [readIs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, $bankOffset;
+        $insert{"j${j}c$c6"} = sprintf "--:-:1:-:1      LDS.U.128 j%dEx4, [readEs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, 8;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Stall is 0 when the first inserted line contains one of the listed mnemonics, otherwise 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1;
+            # Yield flag on every third FFMA, but never in the last iteration or when the stall is 0.
+            my $yield  = $j < 3 && $stall && ($c % 3 == 0) ? 'Y' : '-';
+            # Only the first FFMA of an iteration carries a wait mask: 01, or 03 for j == 2 without $IX.
+            my $wait   = $c == 0 ? $j == 2 && !$IX ? '03' : '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+[+
+    our $IX;  # Picks one of two texts emitted after IMAGE_LOOP: step gxs, then gys, branching back to IMAGE_LOOP while P5 / P6 hold, then either loop again based on init or branch to END_LOOP.
+    return $IX ? q{
+// Advance x offset/preds
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD gxs,    gxs,    param_strideX;
+--:-:-:-:1      IADD offset, offset, param_loopXI;
+
+01:-:-:-:1      BFE.U32 super_x, tid, param_superXI;
+--:-:-:-:1      SHL gx, gxs, param_shiftXI;
+
+--:-:-:-:1      BFE.U32 n, tid, param_superNI;
+</SCHEDULE_BLOCK>
+--:-:-:Y:d      ISETP.LT.AND P5, PT, gxs, param_GXS, P6;
+--:-:-:-:0      IADD gx, gx, super_x;
+--:-:-:Y:5  @P5 BRA.U IMAGE_LOOP;
+
+// Advance y offset/preds
+--:-:-:-:1      IADD gys, gys, param_strideY;
+--:-:-:-:0      ISETP.LT.AND P4, PT, n, param_N, P6;
+--:-:-:-:1      LDS gxs, [addr_blk_Q];
+--:-:-:-:1      BFE.U32 super_x, tid, param_superXI;
+--:-:-:-:1      PSETP.AND.AND P5, PT, PT, PT, PT;
+--:-:-:-:0      BFE.U32 super_y, tid, param_superYI;
+--:-:1:-:2      LDS blkC, [addr_blk_C];
+--:-:-:-:1      ISETP.LT.AND P6, PT, gys, param_GYS, PT;
+<SCHEDULE_BLOCK>
+01:-:-:-:1      SHL gx, gxs, param_shiftXI;
+--:-:-:-:1      SHL gy, gys, param_shiftYI;
+--:-:-:-:1      IADD gx, gx, super_x;
+--:-:-:-:1      IADD gy, gy, super_y;
+--:-:-:-:1      XMAD.U16.U16      offset, gx,   param_N,   n;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, gy,   param_XN,  offset;
+--:-:-:-:1      XMAD.U16.U16.LO2C offset, blkC, param_YXN, offset;
+</SCHEDULE_BLOCK>
+--:-:-:Y:5  @P6 BRA.U IMAGE_LOOP;
+
+// Set n to loop remaining times
+--:-:-:-:1      LOP.AND.NZ P5, init, pred_bits,  3;
+--:-:-:-:1      MOV nloop, param_loopN;
+--:-:-:-:1      MOV N,     param_N;
+--:-:-:Y:a      LOP.AND   pred_bits, pred_bits, ~3;
+--:-:-:-:0      VMAD.U16.U16 n, -init, nloop, N;
+--:-:-:Y:5  @P5 BRA.U IMAGE_LOOP;
+--:-:-:Y:5      BRA.U END_LOOP;
+    } : q{
+// Advance x offset/preds
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD gxs, gxs, param_strideX;
+--:-:-:-:1      IADD offset, offset, param_loopX;
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, P6;
+--:-:-:-:1      SHL x, gxs, param_shiftX;
+01:-:-:-:1      BFE.U32 super_x, tid, param_superX;
+--:-:-:-:1      BFE.U32 n, tid, param_superN;
+--:-:-:-:1      ISCADD x, super_x,  x, 1;
+--:-:-:-:1      IADD x, x, -param_pad_x;
+--:-:-:-:1      IADD x1, x, 1;
+--:-:-:-:1      IADD x2, x, 2;
+--:-:-:-:1      IADD x3, x, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, x,  param_X, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_X, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_X, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_X, P6;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x0f;
+// Extract y + init + buffer bits
+--:-:-:-:1      BFE.U32 mask_y, pred_bits, 0x710;
+--:-:-:-:1      R2P PR, mask_y, 0x0f;
+--:-:-:-:1      SEL pred_bits, mask_x, RZ, P0;
+--:-:-:-:1  @P1 BFI pred_bits, mask_x, 0x404, pred_bits;
+--:-:-:-:1  @P2 BFI pred_bits, mask_x, 0x408, pred_bits;
+--:-:-:-:1  @P3 BFI pred_bits, mask_x, 0x40c, pred_bits;
+--:-:-:-:0      BFI pred_bits, mask_y, 0x710, pred_bits;
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:5  @P5 BRA.U IMAGE_LOOP;
+
+// Advance y offset/preds
+--:-:-:-:1      IADD gys, gys, param_strideY;
+--:-:-:-:0      ISETP.LT.AND P4, PT, n,  param_N, P6;
+--:-:-:-:1      LDS gxs, [addr_blk_Q];
+--:-:-:-:0      BFE.U32 init, pred_bits, 0x314;
+--:-:1:-:1      LDS blkC, [addr_blk_C];
+--:-:-:-:3      PSETP.AND.AND P5, PT, PT, PT, PT;
+--:-:-:-:0      ISETP.LT.AND P6, PT, gys, param_GYS, PT;
+--:-:-:-:5      CAL IMAGE_OFFSET;
+--:-:-:-:0      BFI pred_bits, init, 0x314, pred_bits;
+--:-:-:Y:5  @P6 BRA.U IMAGE_LOOP;
+
+
+// Set n to loop remaining times
+--:-:-:-:1      SHR.U32 pred_bits, init, 2;
+--:-:-:-:1      MOV nloop, param_loopN;
+--:-:-:-:1      MOV N, param_N;
+--:-:-:Y:c      LOP.AND.NZ P5, init, init, 3;
+--:-:-:-:1      SHL pred_bits, pred_bits, 22;
+--:-:-:-:0      VMAD.U16.U16 n, -init, nloop, N;
+--:-:-:Y:5  @P5 BRA.U IMAGE_LOOP;
+--:-:-:Y:5      BRA.U END_LOOP;
+
+    };
++]
+
+
+ERROR_LOOP:
+
+[+
+    our ($dtype, $convert_in, $dtype_shift, $IX);
+    my %insert = (
+
+        $convert_in ? (
+            j1c13 => "02:-:2:-:1      $convert_in p0q0, p0q0;\n",
+            j1c17 => "04:-:3:-:1      $convert_in p0q1, p0q1;\n",
+            j1c21 => "08:-:4:-:1      $convert_in p1q1, p1q1;\n",
+            j1c25 => "10:-:5:-:1      $convert_in p1q0, p1q0;\n",
+        ) : (),
+
+        j1c23 => "02:-:-:-:1      FMUL e0,  p0q0, 0.5;\n",
+
+        j1c28 => "04:-:-:-:1      FFMA E01, p0q1,  0.5,  e0;\n" .
+                 "--:-:-:-:1      FFMA E02, p0q1, -0.5,  e0;\n",
+
+        j1c29 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*00 + 32>], E00;\n",
+        j1c31 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*01 + 32>], E01;\n",
+        j1c33 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*02 + 32>], E02;\n",
+        j1c35 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*03 + 32>], E03;\n",
+
+        j1c37 => "08:-:-:-:1      FMUL e1,  p1q1,  0.5;\n",
+
+        j1c42 => "10:-:-:-:1      FFMA E13, p1q0,  0.5,  e1;\n" .
+                 "--:-:-:-:1      FFMA E14, p1q0,  0.5, -e1;\n",
+
+        j1c43 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*12 + 32>], E12;\n",
+        j1c45 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*15 + 32>], E15;\n",
+        j1c47 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*13 + 32>], E13;\n",
+        j1c49 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*14 + 32>], E14;\n",
+
+        j1c51 => "--:-:-:-:1      FFMA B0,  p1q0,  0.5,  e0;\n" .
+                 "--:-:-:-:1      FFMA C0,  p1q0, -0.5,  e0;\n" .
+                 "--:-:-:-:1      FFMA B1,  p0q1,  0.5,  e1;\n" .
+                 "--:-:-:-:1      FFMA C1,  p0q1,  0.5, -e1;\n",
+
+        j2c9  => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*04 + 32>], E04;\n",
+        j2c11 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*08 + 32>], E08;\n",
+        j2c13 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*07 + 32>], E07;\n",
+        j2c15 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*11 + 32>], E11;\n",
+
+        j2c17 => "--:-:-:-:1      FMUL e2,  B0,  0.5;\n" .
+                 "--:-:-:-:1      FMUL e3,  C0,  0.5;\n",
+
+        j2c21 => "--:-:-:-:1      FFMA E05, B1,  0.5,  e2;\n" .
+                 "--:-:-:-:1      FFMA E06, B1, -0.5,  e2;\n" .
+                 "--:-:-:-:1      FFMA E09, C1,  0.5,  e3;\n" .
+                 "--:-:-:-:1      FFMA E10, C1, -0.5,  e3;\n",
+
+        j2c23 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*05 + 32>], E05;\n",
+        j2c25 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*06 + 32>], E06;\n",
+        j2c27 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*09 + 32>], E09;\n",
+        j2c29 => "--:-:-:-:1      STS [writeS + 4x<512*4 + 32*10 + 32>], E10;\n",
+
+        j2c32 => "--:-:-:-:1      R2P PR, pred_bits, 0x0f;\n" .
+                 "--:-:-:-:1  \@P6 LEA    track00.CC, offset,  param_E[0], $dtype_shift;\n",
+
+        j2c37 => "--:-:-:-:1  \@P6 IADD.X track01,    RZ,      param_E[1];\n" .
+                 "--:-:-:-:1  \@P6 IADD   track10.CC, track00, param_Np;\n",
+
+        j2c42 => "--:-:-:-:1  \@P6 IADD.X track11,    track01, RZ;\n" .
+                 "--:-:-:-:1  \@P6 IADD   track20.CC, track00, param_QNp;\n",
+
+        j2c44 => "--:-:-:-:1      LOP.AND.NZ P4, RZ, pred_bits, 0x400;\n" .
+                 "--:-:-:-:1      LOP.XOR pred_bits, pred_bits, 0x400;\n",
+
+        j2c47 => "--:-:-:-:1  \@P6 IADD.X track21,    track01, RZ;\n" .
+                 "--:-:-:-:1  \@P6 IADD   track30.CC, track10, param_QNp;\n",
+
+        j2c52 => "--:-:-:-:1  \@P6 IADD.X track31,    track11, RZ;\n",
+
+        j2c61 => "--:-:-:-:1  \@P4 MOV swapBuf,  4x<(512*4 + 32)*2>;\n" .
+                 "--:-:-:-:1 \@!P4 MOV swapBuf, -4x<(512*4 + 32)*2>;\n",
+
+        j2c62 => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD readEs, readEs, -swapBuf;\n" .
+                 "--:-:-:-:1      IADD writeS, writeS,  swapBuf;\n",
+
+        j3c8  => "--:-:2:-:1  \@P0 LDG.E.CI$dtype p0q0, [track0];\n",
+        j3c10 => "--:-:3:-:1  \@P1 LDG.E.CI$dtype p0q1, [track1];\n",
+        j3c12 => "--:-:4:-:1  \@P3 LDG.E.CI$dtype p1q1, [track3];\n",
+        j3c14 => "--:6:5:-:1  \@P2 LDG.E.CI$dtype p1q0, [track2];\n",
+
+        j3c15 => "--:-:-:-:1      PSETP.OR.AND P4, PT, P5, P6, PT;\n" .
+                 "--:-:-:-:1      IADD n, n, param_loopN;\n" .
+                 "--:-:-:-:1      IADD offset, offset, param_loopN;\n",
+
+        j3c16 => "--:-:-:-:1 \@!P0 I2I.U32.U32 p0q0, RZ;\n",
+        j3c20 => "--:-:-:-:1 \@!P1 I2I.U32.U32 p0q1, RZ;\n",
+        j3c24 => "--:-:-:-:1 \@!P2 I2I.U32.U32 p1q0, RZ;\n",
+        j3c28 => "--:-:-:-:1 \@!P3 I2I.U32.U32 p1q1, RZ;\n",
+
+        j3c25 => "--:-:-:-:1      ISETP.LT.AND P4, PT, n, param_N, P4;\n",
+
+
+        j3c38 => "--:-:1:-:1 \@!P4 S2R tid, SR_TID.X;\n",
+
+
+        j3c63 => "--:-:-:Y:5  \@P4 BRA.U ERROR_LOOP;\n",
+    );
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out;
+    foreach my $j (0 .. 3)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 4;
+        my $bankOffset = $IX ? 0 : 8;
+
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy0, [readIs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, $bankOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1      LDS.U.128 j%dEx0, [readEs + 4x<%d*(512 + %d) + 00>];\n", $nOdd, $rsOffset, 8;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1      LDS.U.128 j%dIy4, [readIs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, $bankOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1      LDS.U.128 j%dEx4, [readEs + 4x<%d*(512 + %d) + 16>];\n", $nOdd, $rsOffset, 8;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA|S2R)/ ? 0 : 1;
+
+            my $yield  = $j < 3 && $stall && ($c % 3 == 0) ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dIy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+// Loop tail: three stages (x, y, n); each updates its counter and branches back to ERROR_LOOP while its predicate holds.
+// Advance x offset/preds
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD gxs, gxs, param_strideX;
+--:-:-:-:1      IADD offset, offset, param_loopX;
+// Extract y + init + buffer bits
+--:-:-:-:1      BFE.U32 mask_y, pred_bits, 0x704; // 7 bits at position 4
+--:-:-:-:1      R2P PR, mask_y, 0x0c; // NOTE(review): mask 0x0c presumably restores only P2 and P3 - confirm
+--:-:-:-:1      ISETP.LT.AND P5, PT, gxs, param_GXS, P6; // P5 = (gxs < param_GXS) && P6
+--:-:-:-:1      SHL x, gxs, param_shiftX;
+01:-:-:-:1      BFE.U32 super_x, tid, param_superX; // wait mask 01 before the first read of tid
+--:-:-:-:1      ISCADD x, super_x,  x, 1; // x = (super_x << 1) + x
+--:-:-:-:1      BFE.U32 n, tid, param_superN;
+--:-:-:-:1      IADD x1, x, 1;
+--:-:-:-:1      ISETP.LT.AND P0, PT, x,  param_Q, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_Q, P6;
+--:-:-:-:1      ISETP.GE.AND P0, PT, x,  RZ, P0; // P0 = (0 <= x  < param_Q) && P6
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1; // P1 = (0 <= x1 < param_Q) && P6
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x03; // NOTE(review): mask 0x03 presumably packs P0 and P1 - confirm
+--:-:-:-:1      SEL pred_bits, mask_x, RZ, P2;
+--:-:-:-:1  @P3 BFI pred_bits, mask_x, 0x202, pred_bits; // 2 bits at position 2
+--:-:-:-:0      BFI pred_bits, mask_y, 0x704, pred_bits; // put the saved 7 bits back at position 4
+</SCHEDULE_BLOCK>
+
+--:-:-:Y:5  @P5 BRA.U ERROR_LOOP; // taken while gxs < param_GXS (and P6), see P5 above
+
+// Advance y offset/preds
+--:-:-:-:1      IADD gys, gys, param_strideY;
+--:-:-:-:0      ISETP.LT.AND P4, PT, n,  param_N, P6; // P4 = (n < param_N) && P6
+--:-:-:-:1      LDS gxs, [addr_blk_Q]; // reload gxs from the value stored at addr_blk_Q
+--:-:-:-:0      BFE.U32 init, pred_bits, 0x308; // 3 bits at position 8
+--:-:1:-:1      LDS blkK, [addr_blk_K];
+--:-:-:-:2      PSETP.AND.AND P5, PT, PT, PT, PT; // P5 = true
+--:-:-:-:0      ISETP.LT.AND P6, PT, gys, param_GYS, PT; // P6 = (gys < param_GYS)
+--:-:-:-:5      CAL ERROR_OFFSET; // subroutine defined outside this section
+--:-:-:-:0      BFI pred_bits, init, 0x308, pred_bits; // write init back at position 8
+--:-:-:Y:5  @P6 BRA.U ERROR_LOOP;
+
+// Set n to loop remaining times
+--:-:-:-:1      SHR.U32 pred_bits, init, 2; // uses init before it is masked below
+--:-:-:-:1      MOV nloop, param_loopN;
+--:-:-:-:1      MOV N, param_N;
+--:-:-:Y:c      LOP.AND.NZ P5, init, init, 3; // init &= 3; P5 = (init != 0)
+--:-:-:-:1      SHL pred_bits, pred_bits, 10;
+--:-:-:-:0      VMAD.U16.U16 n, -init, nloop, N; // n = N - init*nloop
+--:-:-:Y:5  @P5 BRA.U ERROR_LOOP;
+
+END_LOOP:
+// Epilogue setup: derive the shared-memory offsets writeCs/readCs and the nine 64-bit output pointers F00..F22 used by OUTPUT_TRANSFORM.
+// K_blk, C_blk, P_blk, Q_blk
+--:-:1:-:1      LDS.U.128 blkKCPQ, [addr_blk_K];
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha, param_alpha;
+
+// Strip double buffering offsets, and the batch dimension on readIs
+// This gives us the shared memory write mapping for the thread's registers:
+// readEs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readIs = ((tid &  8) >> 2)  | (tid & 1)
+--:-:-:-:1      LOP.AND  tid_16,  tid,   -16;
+--:-:-:-:1      SHR.U32  tid_16,  tid_16,  1;
+
+--:-:-:-:1      LOP.AND  tid_1,  tid,    1;
+--:-:-:-:1      LOP.AND  readIs, tid,    8;
+--:-:-:-:1      SHR.U32  readIs, readIs, 2;
+--:-:-:-:1      LOP.OR   readIs, readIs, tid_1;
+--:-:-:-:1      SHL      readIs, readIs, 4; // scale by 16
+
+--:-:-:-:1      BFE.U32  readEs, tid,    0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readEs, readEs, tid_16;
+--:-:-:-:1      SHL      readEs, readEs, 4; // scale by 16
+
+// writeCs = readIs * 512 + readEs;
+--:-:-:-:1      ISCADD  writeCs, readIs, readEs, 9;
+
+// readCs = tid//32 * 512 + tid & 31
+--:-:-:-:1      LOP.AND tid_31, tid, 31;
+--:-:-:-:1      SHR.U32 tid_32, tid,  5;
+--:-:-:-:1      ISCADD  readCs, tid_32, tid_31, 9;
+--:-:-:-:1      SHL     readCs, readCs, 2; // scale by 4
+
+// kk = K_blk*32 + tid&31
+01:-:-:-:1      ISCADD  kk, K_blk, tid_31, 5; // wait mask 01: NOTE(review) presumably the blkKCPQ load above (barrier 1) - confirm K_blk aliases it
+
+// cc = C_blk*32 + tid//32
+--:-:-:-:1      ISCADD  cc, C_blk, tid_32, 5;
+
+// F00 = c*RSK + r*SK + s*K + k
+--:-:-:-:1      XMAD.LO2C trackF, cc, param_RSK, kk; // trackF = cc*RSK + kk (r = s = 0)
+
+[+
+    our $determ; # when set, emit code adding PQ_blk*CRSK to trackF, with PQ_blk = P_blk*param_strideX + Q_blk
+    if ($determ)
+    {
+        return q{
+--:-:-:-:1      MOV CRSK, param_CRSK;
+01:-:-:-:1      XMAD PQ_blk, P_blk,  param_strideX, Q_blk;
+--:-:-:-:1      XMAD.LO trackF, PQ_blk, CRSK, trackF, xmad_determ;
+        };
+    }
+    return ''; # otherwise emit nothing
++]
+
+--:-:-:-:1      LEA      F00_0.CC, trackF, param_F[0],     2; // F00 = param_F + trackF*4 (low word, sets carry)
+--:-:-:-:1      LEA.HI.X F00_1,    trackF, param_F[1], RZ, 2; // high word, consumes carry
+
+--:-:-:-:1      MOV K1, param_K;
+--:-:-:-:1      SHL K1, K1, 2; // K1 = K*4
+
+--:-:-:-:1      MOV SK1, param_SK;
+--:-:-:-:1      SHL SK1, SK1, 2; // SK1 = SK*4
+
+--:-:-:-:1      MOV RSK8, param_RSK;
+--:-:-:-:1      SHL RSK8, RSK8, 5; // RSK8 = RSK*32; OUTPUT_TRANSFORM adds it to every pointer per call
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, kk, param_K, PT; // P0 = (kk < K)
+</SCHEDULE_BLOCK>
+// Row 0: F01 = F00 + K1, F02 = F01 + K1 (each IADD.X propagates the carry into the high word).
+--:-:-:-:6      IADD   F01_0.CC, F00_0, K1;
+--:-:-:-:1      IADD.X F01_1,    F00_1, RZ;
+--:-:-:-:6      IADD   F02_0.CC, F01_0, K1;
+--:-:-:-:1      IADD.X F02_1,    F01_1, RZ;
+// Row 1: F1s = F0s + SK1.
+--:-:-:-:6      IADD   F10_0.CC, F00_0, SK1;
+--:-:-:-:1      IADD.X F10_1,    F00_1, RZ;
+--:-:-:-:6      IADD   F11_0.CC, F01_0, SK1;
+--:-:-:-:1      IADD.X F11_1,    F01_1, RZ;
+--:-:-:-:6      IADD   F12_0.CC, F02_0, SK1;
+--:-:-:-:1      IADD.X F12_1,    F02_1, RZ;
+// Row 2: F2s = F1s + SK1.
+--:-:-:-:6      IADD   F20_0.CC, F10_0, SK1;
+--:-:-:-:1      IADD.X F20_1,    F10_1, RZ;
+--:-:-:-:6      IADD   F21_0.CC, F11_0, SK1;
+--:-:-:-:1      IADD.X F21_1,    F11_1, RZ;
+--:-:-:-:6      IADD   F22_0.CC, F12_0, SK1;
+--:-:-:-:1      IADD.X F22_1,    F12_1, RZ;
+
+// Scale the accumulators cx*y* by alpha and stage them at writeCs in two passes (rows y0-y3, then y4-y7); each pass calls OUTPUT_TRANSFORM twice.
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y1;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:0      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y1;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:0      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*512 + 00>], shuffle_x0y2;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*512 + 16>], shuffle_x4y2;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*512 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeCs+4x<3*512 + 16>], shuffle_x4y3;
+--:-:-:-:5      BAR.SYNC 0;
+// First pass staged: transform once at readCs, then once more with readCs toggled by XOR 4x<8*512>.
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+
+--:-:-:-:0      LOP.XOR readCs, readCs, 4x<8*512>;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+// Second pass: rows y4-y7 reuse the same writeCs offsets.
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha;
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:0      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:5      BAR.SYNC 0; // NOTE(review): presumably lets the previous pass's shared reads finish before the stores below overwrite them
+--:-:-:-:0      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 00>], shuffle_x0y4;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:0      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*512 + 16>], shuffle_x4y4;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*512 + 00>], shuffle_x0y5;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*512 + 16>], shuffle_x4y5;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*512 + 00>], shuffle_x0y6;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*512 + 16>], shuffle_x4y6;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*512 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeCs+4x<3*512 + 16>], shuffle_x4y7;
+--:-:-:-:5      BAR.SYNC 0;
+// readCs was left toggled by the first pass: toggle it back for the first call, then toggle again for the second.
+--:-:-:-:0      LOP.XOR readCs, readCs, 4x<8*512>;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+
+--:-:-:-:0      LOP.XOR readCs, readCs, 4x<8*512>;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+
+--:-:-:-:5      EXIT;
+
+OUTPUT_TRANSFORM:
+// Subroutine: load 16 values m<r><c> from [readCs + 4x<(r*4 + c)*32>], reduce them to nine values f<i><j>, emit f<i><j> to [F<i><j>_0] when P1 holds, then advance cc by 8 and every pointer by RSK8.
+--:-:-:-:0      ISETP.LT.AND P1, PT, cc, param_C, P0; // cc < C && kk < K
+--:-:-:-:1      LDS m00, [readCs + 4x< 0*32>];
+--:-:-:-:1      LDS m10, [readCs + 4x< 4*32>];
+--:-:-:-:1      LDS m01, [readCs + 4x< 1*32>];
+--:-:1:-:1      LDS m11, [readCs + 4x< 5*32>];
+
+--:-:-:-:0      IADD cc, cc, 8; // cc for the next call; P1 above used the old value
+--:-:-:-:1      LDS m21, [readCs + 4x< 9*32>];
+--:-:-:-:1      LDS m02, [readCs + 4x< 2*32>];
+--:-:-:-:1      LDS m12, [readCs + 4x< 6*32>];
+--:-:2:-:1      LDS m22, [readCs + 4x<10*32>];
+
+--:-:-:-:1      LDS m31, [readCs + 4x<13*32>];
+--:-:-:-:1      LDS m20, [readCs + 4x< 8*32>];
+--:-:-:-:1      LDS m32, [readCs + 4x<14*32>];
+--:-:3:-:1      LDS m03, [readCs + 4x< 3*32>];
+
+--:-:-:-:1      LDS m13, [readCs + 4x< 7*32>];
+--:-:-:-:1      LDS m23, [readCs + 4x<11*32>];
+--:-:-:-:1      LDS m30, [readCs + 4x<12*32>];
+--:-:4:-:1      LDS m33, [readCs + 4x<15*32>];
+// First pass (per column c): t0c = m0c + m1c + m2c, t1c = m1c - m2c, t2c = m1c + m2c + m3c. Second pass (per row i): fi0 = ti0 + ti1 + ti2, fi1 = ti1 - ti2, fi2 = ti1 + ti2 + ti3.
+01:-:-:-:1      FADD t00, m00, m10;
+--:-:-:-:1      FADD t01, m01, m11;
+02:-:-:-:1      FADD t21, m11, m21;
+--:-:-:-:1      FADD t02, m02, m12;
+--:-:-:-:1      FADD t11, m11, -m21;
+--:-:-:-:1      FADD t22, m12, m22;
+--:-:-:-:1      FADD t12, m12, -m22;
+--:-:-:-:1      FADD t01, t01, m21;
+04:-:-:-:1      FADD t21, t21, m31;
+--:-:-:-:1      FADD t02, t02, m22;
+--:-:-:-:1      FADD t20, m10, m20;
+--:-:-:-:1      FADD t22, t22, m32;
+--:-:-:-:1      FADD t00, t00, m20;
+08:-:-:-:1      FADD t03, m03, m13;
+--:-:-:-:1      FADD t10, m10, -m20;
+--:-:-:-:1      FADD t23, m13, m23;
+--:-:-:-:1      FADD t20, t20, m30;
+--:-:-:-:1      FADD t13, m13, -m23;
+--:-:-:-:1      FADD f00, t00, t01;
+--:-:-:-:1      FADD t03, t03, m23;
+--:-:-:-:1      FADD f02, t01, t02;
+--:-:-:-:1      FADD t23, t23, m33;
+--:-:-:-:1      FADD f10, t10, t11;
+--:-:-:-:1      FADD f12, t11, t12;
+--:-:-:-:1      FADD f20, t20, t21;
+--:-:-:-:1      FADD f22, t21, t22;
+--:-:-:-:1      FADD f00, f00, t02;
+--:-:-:-:1      FADD f01, t01, -t02;
+--:-:-:-:0      FADD f02, f02, t03;
+--:-:-:-:1  @P1 [+ output_op() +] [F00_0], f00;
+--:-:-:-:0      FADD f10, f10, t12;
+--:-:-:-:1  @P1 [+ output_op() +] [F01_0], f01;
+--:-:-:-:0      FADD f11, t11, -t12;
+--:1:-:-:1  @P1 [+ output_op() +] [F02_0], f02;
+--:-:-:-:0      FADD f12, f12, t13;
+--:-:-:-:1  @P1 [+ output_op() +] [F10_0], f10;
+--:-:-:-:0      FADD f20, f20, t22;
+--:-:-:-:1  @P1 [+ output_op() +] [F11_0], f11;
+--:-:-:-:0      FADD f21, t21, -t22;
+--:2:-:-:1  @P1 [+ output_op() +] [F12_0], f12;
+--:-:-:-:0      FADD f22, f22, t23;
+--:-:-:-:1  @P1 [+ output_op() +] [F20_0], f20;
+--:-:-:-:1  @P1 [+ output_op() +] [F21_0], f21;
+--:3:-:-:1  @P1 [+ output_op() +] [F22_0], f22;
+// Advance all nine pointers by RSK8. Wait masks 01/02/04 correspond to barriers 1-3 set on the output ops above; NOTE(review): presumably this keeps the address registers intact until those ops have read them.
+01:-:-:-:6      IADD   F00_0.CC, F00_0, RSK8;
+--:-:-:-:1      IADD.X F00_1,    F00_1, RZ;
+--:-:-:-:6      IADD   F01_0.CC, F01_0, RSK8;
+--:-:-:-:1      IADD.X F01_1,    F01_1, RZ;
+--:-:-:-:6      IADD   F02_0.CC, F02_0, RSK8;
+--:-:-:-:1      IADD.X F02_1,    F02_1, RZ;
+02:-:-:-:6      IADD   F10_0.CC, F10_0, RSK8;
+--:-:-:-:1      IADD.X F10_1,    F10_1, RZ;
+--:-:-:-:6      IADD   F11_0.CC, F11_0, RSK8;
+--:-:-:-:1      IADD.X F11_1,    F11_1, RZ;
+--:-:-:-:6      IADD   F12_0.CC, F12_0, RSK8;
+--:-:-:-:1      IADD.X F12_1,    F12_1, RZ;
+04:-:-:-:6      IADD   F20_0.CC, F20_0, RSK8;
+--:-:-:-:1      IADD.X F20_1,    F20_1, RZ;
+--:-:-:-:6      IADD   F21_0.CC, F21_0, RSK8;
+--:-:-:-:1      IADD.X F21_1,    F21_1, RZ;
+--:-:-:-:6      IADD   F22_0.CC, F22_0, RSK8;
+--:-:-:-:0      IADD.X F22_1,    F22_1, RZ;
+
+--:-:-:-:5      RET;
+
diff --git a/Kernel/Convolution/Pascal/xconv_winograd_3x3_4x4_32x32.sass b/Kernel/Convolution/Pascal/xconv_winograd_3x3_4x4_32x32.sass
new file mode 100644
index 0000000..20e8a9d
--- /dev/null
+++ b/Kernel/Convolution/Pascal/xconv_winograd_3x3_4x4_32x32.sass
@@ -0,0 +1,1047 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+# Build-time settings: $type 'h' selects the F16 values, any other value the F32 ones.
+our ($type, $D);
+our $determ = $D;  # selects the opcode returned by output_op
+our ($convert_in, $convert_out, $dtype_shift, $dtype_size, $vec_size) =
+    $type eq 'h' ? ('F2F.F32.F16', 'F2F.F16.F32', '1', '2',  '64')
+                 : ('',            '',            '2', '4', '128');
+# Accessors used by inline code expressions in the assembly below.
+sub dtype_shift { return $dtype_shift; }
+sub vec_size    { return $vec_size;    }
+sub output_op   { return $determ ? 'STG.E.CG' : 'RED.E.ADD.F32.FTZ.RN'; }
+-]
+# Symbolic names: addr_* are shared-memory scratch addresses (used with STS/LDS below); param_* are kernel parameters at fixed offsets in constant bank c[0x0].
+<CONSTANT_MAPPING>
+
+    addr_zero   : 4x<32*36*2*4 + 64 + 0>
+    addr_rYXN   : 4x<32*36*2*4 + 64 + 4>
+    addr_iYXN   : 4x<32*36*2*4 + 64 + 5>
+    addr_idx_K  : 4x<32*36*2*4 + 64 + 6>
+    addr_idx_C  : 4x<32*36*2*4 + 64 + 7>
+
+    param_F[0]         : c[0x0][0x140]
+    param_F[1]         : c[0x0][0x144]
+    param_I[0]         : c[0x0][0x148]
+    param_I[1]         : c[0x0][0x14c]
+    param_E[0]         : c[0x0][0x150]
+    param_E[1]         : c[0x0][0x154]
+    param_alpha        : c[0x0][0x158]
+    param_K            : c[0x0][0x15c]
+    param_C            : c[0x0][0x160]
+    param_k            : c[0x0][0x164]
+    param_c            : c[0x0][0x168]
+    param_kc           : c[0x0][0x16c]
+    param_magic_kc     : c[0x0][0x170]
+    param_shift_kc     : c[0x0][0x174]
+    param_magic_c      : c[0x0][0x178]
+    param_shift_c      : c[0x0][0x17c]
+    param_YXN2         : c[0x0][0x180]
+    param_sYXN         : c[0x0][0x184]
+    param_magic_sYXN   : c[0x0][0x188]
+    param_shift_sYXN   : c[0x0][0x18c]
+    param_stride_YXNp  : c[0x0][0x190]
+    param_YXN          : c[0x0][0x194]
+    param_YXN_1152     : c[0x0][0x198]
+    param_RSK          : c[0x0][0x19c]
+    param_CRSK         : c[0x0][0x1a0]
+    param_Kp           : c[0x0][0x1a4]
+    param_SKp          : c[0x0][0x1a8]
+    param_RSK15_SK2p   : c[0x0][0x1ac]
+
+</CONSTANT_MAPPING>
+# Names for registers 0-95. Several entries reuse the same register numbers (e.g. 32-43, 32-86 and 32-39), so names sharing a register cannot be live at the same time. NOTE(review): the ':', '~' and '=' separators are interpreted by the assembler - confirm their meaning in its documentation.
+<REGISTER_MAPPING>
+
+       0-63 : czero<00-63>
+
+     3, 2,11,10 : clx<0-3>y0
+     7, 6,15,14 : clx<0-3>y1
+     1, 0, 9, 8 : clx<0-3>y2
+     5, 4,13,12 : clx<0-3>y3
+    19,18,27,26 : clx<0-3>y4
+    23,22,31,30 : clx<0-3>y5
+    17,16,25,24 : clx<0-3>y6
+    21,20,29,28 : clx<0-3>y7
+
+      32-43 : jl0Fx<0-3>, jl0Iy<0-7>
+      44-51 : jl1Fx<0-3>, jl1Iy<4-7>
+      36-39 : jl1Iy<0-3>
+
+      52-87 : T0<0-3>, T1<0-3>, T2<0-3>, T3<0-3>, T4<0-3>, T5<0-3>, T6<0-3>, T7<0-3>, T8<0-3>
+      88-89 : track<0-1>
+      90-91 ~ writeS
+
+      32-86 ~ idx_YXNkc, idx_K, idx_C, idx_YXN, div<1-3>, magic_kc, neg_kc, idx_kc, idx_k, idx_c, YXN2_idx, neg_sYXN, magic_sYXN, remainder, yxn, offset, offset2, tid32_2, tid1, tid31
+         87 = tid
+
+      32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1
+      48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16
+
+
+     3, 2,11,10,19,18,27,26 : ccx<0-7>y0
+     7, 6,15,14,23,22,31,30 : ccx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2
+     5, 4,13,12,21,20,29,28 : ccx<0-7>y3
+    35,34,43,42,51,50,59,58 : ccx<0-7>y4
+    39,38,47,46,55,54,63,62 : ccx<0-7>y5
+    33,32,41,40,49,48,57,56 : ccx<0-7>y6
+    37,36,45,44,53,52,61,60 : ccx<0-7>y7
+
+      64-79 : jc0Fx<0-7>, jc0Iy<0-7>
+      80-91 : jc1Fx<4-7>, jc1Iy<0-7>
+      64-67 : jc1Fx<0-3>
+
+      64-86 ~ tid16, tid_1, tid128
+
+      92-95 ~ reduce_YXN, swapBuf, readFs, readIs
+
+
+      64-89 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxC, idxK, idxI, readFs2, readIs2, offsetF, k, CRSK, xmad_determ
+      86-89 : Out1<0-1>, Out2<0-1>
+      90-91 : Out0<0-1>
+      92-95 ~ alpha, writeCs, readCs, c
+
+      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1
+
+      84-85 ~ t<0-1>
+
+       3, 2,11,19,10,18 : m<0-5>0
+       1, 9, 0, 8,17,16 : m<0-5>1
+      27,26,25,24,64,65 : m<0-5>2
+      66,67,68,69,70,71 : m<0-5>3
+      72,73,74,75,76,77 : m<0-5>4
+      78,79,80,81,82,83 : m<0-5>5
+
+       3, 2,11 : w00, w10, w20
+       1, 9, 0 : w01, w11, w21
+      27,26,25 : w02, w12, w22
+      66,67,68 : w03, w13, w23
+      72,73,74 : w04, w14, w24
+      78,79,80 : w05, w15, w25
+
+      19,10,18,69,70,71 ~ s00, s10, s20
+       8,17,16,75,76,77 ~ s02, s12, s22
+      24,64,65,81,82,83 ~ s01, s11, s21
+
+</REGISTER_MAPPING>
+// Kernel entry: threads with tid >= 128 branch to COMPUTE_SETUP; the remaining threads fall through to LOAD_SETUP.
+--:-:-:-:0      MOV swapBuf, 4x<32*36*2*2>;
+--:-:1:-:1      S2R tid, SR_TID.X;
+--:-:-:-:1      STS.128 [addr_zero], RZ; // store RZ at addr_zero; later LDS.U.128 reads from addr_zero use it as a zero source
+01:-:-:Y:d      ISETP.GE.AND P0, PT, tid, 128, PT; // wait mask 01: tid from the S2R above (barrier 1)
+--:-:-:-:5  @P0 BRA.U COMPUTE_SETUP;
+
+##############################################################
+LOAD_SETUP:
+// Threads with tid < 128: decode the block indices, compute reduce_YXN and the shared/global offsets, then split into IMAGE_SETUP (tid < 64) and FILTER_SETUP (tid >= 64).
+--:-:1:-:1      S2R idx_YXNkc, SR_CTAID.X;
+--:-:2:-:1      S2R idx_K,     SR_CTAID.Z;
+--:-:3:-:1      S2R idx_C,     SR_CTAID.Y;
+
+<SCHEDULE_BLOCK>
+// Emits eight LDS.U.128 loads from addr_zero into czero00, czero04, ..., czero28.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +]
+
+--:-:-:-:1      ISETP.EQ.AND P0, PT, tid, RZ, PT; // P0 = (tid == 0)
+--:-:-:-:1      ISETP.GE.AND P1, PT, tid, 64, PT; // P1 = (tid >= 64)
+
+// idx_YXN = idx_YXNkc / blk_kc
+--:-:-:-:1      MOV  magic_kc, param_magic_kc;
+--:-:-:-:1      IADD neg_kc, RZ, -param_kc;
+--:-:-:-:1      ISETP.NE.AND P2, PT, magic_kc, 1, PT; // P2: magic != 1 takes the multiply path, otherwise only the shift is applied
+01:-:-:-:1  @P2 XMAD     div1, idx_YXNkc,    magic_kc,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, idx_YXNkc,    magic_kc.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, idx_YXNkc.H1, magic_kc.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, idx_YXNkc.H1, magic_kc,    div1;
+--:-:-:-:1  @P2 IADD3.RS idx_YXN, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  idx_YXN, idx_YXN,   param_shift_kc;
+--:-:-:-:1 @!P2 SHR.U32  idx_YXN, idx_YXNkc, param_shift_kc;
+
+// idx_kc = idx_YXNkc % blk_kc
+--:-:-:-:1      XMAD.LO2 idx_kc, neg_kc, idx_YXN, idx_YXNkc; // idx_kc = idx_YXNkc - param_kc*idx_YXN
+
+// idx_k = idx_kc / blk_c
+// idx_c = idx_kc % blk_c
+--:-:-:-:1      XMAD    idx_k,  idx_kc, param_magic_c, RZ;
+--:-:-:-:1      SHR.U32 idx_k,  idx_k,  param_shift_c;
+--:-:-:-:1      XMAD    idx_c,  idx_k,  param_c, RZ;
+--:-:-:-:1      IADD    idx_c, -idx_c,  idx_kc;
+
+// idx_K = idx_K * blk_k + idx_k
+// idx_C = idx_C * blk_c + idx_c
+02:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k; // wait mask 02: idx_K from S2R (barrier 2)
+04:-:-:-:1      XMAD idx_C, idx_C, param_c, idx_c; // wait mask 04: idx_C from S2R (barrier 3)
+
+// reduce_YXN  = ceil((YXN2 - idx_YXN) / sYXN)
+--:-:-:-:1      IADD YXN2_idx, -idx_YXN, param_YXN2;
+--:-:-:-:1      IADD neg_sYXN, RZ, -param_sYXN;
+--:-:-:-:1      MOV  magic_sYXN, param_magic_sYXN;
+--:-:-:-:1      ISETP.NE.AND P3, PT, magic_sYXN, 1, PT; // same magic-number division scheme as for idx_YXN above
+--:-:-:-:1  @P3 XMAD     div1, YXN2_idx,    magic_sYXN,    RZ;
+--:-:-:-:1  @P3 XMAD     div2, YXN2_idx,    magic_sYXN.H1, RZ;
+--:-:-:-:1  @P3 XMAD     div3, YXN2_idx.H1, magic_sYXN.H1, RZ;
+--:-:-:-:1  @P3 XMAD.CHI div1, YXN2_idx.H1, magic_sYXN,    div1;
+--:-:-:-:1  @P3 IADD3.RS reduce_YXN, div1, div2, div3;
+--:-:-:-:1  @P3 SHR.U32  reduce_YXN, reduce_YXN, param_shift_sYXN;
+--:-:-:-:1 @!P3 SHR.U32  reduce_YXN, YXN2_idx,   param_shift_sYXN;
+
+--:-:-:-:1      XMAD.LO2  remainder, neg_sYXN, reduce_YXN, YXN2_idx; // remainder = YXN2_idx - sYXN*reduce_YXN
+--:-:-:-:1      IMNMX.U32 remainder, remainder, 1, PT; // clamp to at most 1
+--:-:-:-:1      IADD reduce_YXN, reduce_YXN, remainder; // round the quotient up (the ceil above)
+// Thread 0 (P0) publishes the indices and reduce_YXN to shared memory; COMPUTE_SETUP reads addr_rYXN.
+--:-:-:-:1  @P0 STS [addr_iYXN],  idx_YXN;
+--:-:-:-:1  @P0 STS [addr_idx_K], idx_K;
+--:-:-:-:1  @P0 STS [addr_idx_C], idx_C;
+--:6:-:-:1  @P0 STS [addr_rYXN],  reduce_YXN;
+
+// yxn = (tid & 63) >> 5
+--:-:-:-:1      BFE.U32 yxn, tid, 0x105; // 1 bit at position 5
+
+// offset2 = (idx_YXN + (reduce_YXN - 1)*sYXN)*2 + yxn   (offset holds reduce_YXN - 1 as an intermediate)
+--:-:-:-:1      IADD     offset,  reduce_YXN, -1;
+--:-:-:-:1      XMAD     offset2, offset,    param_sYXN, idx_YXN;
+--:-:-:-:1      XMAD.PSL offset2, offset.H1, param_sYXN, offset2;
+--:-:-:-:1      ISCADD   offset2, offset2,   yxn, 1;
+
+// P6 = offset2 < YXN
+--:-:-:-:1      ISETP.LT.AND P6, PT, offset2, param_YXN, PT;
+
+// P5 = reduce_YXN > 1
+--:-:-:-:1      ISETP.GT.AND P5, PT, reduce_YXN, 1, PT;
+
+--:-:-:-:1      LOP.AND  tid32_2,  tid,    -32;
+--:-:-:-:1      SHR.U32  tid32_2,  tid32_2, 2; // tid32_2 = (tid & -32) >> 2
+
+// readFs = ((tid & -32) >> 2) | ((tid >> 1) & 7)
+--:-:-:-:1      BFE.U32 readFs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readFs, readFs, tid32_2;
+--:-:-:-:1      ISCADD  readFs, readFs, 4x<32*36*2>, 4; // readFs = readFs*16 + 4x<32*36*2>
+
+// readIs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readIs, tid,    16;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      IADD3   readIs, readIs, tid1, tid32_2; // the three terms occupy disjoint bits, so the add acts as the OR above
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// writeS = (yxn*32*36 + (tid & 31)*4)*4
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      SHL writeS, tid31, 4;
+--:-:-:-:1      XMAD writeS, yxn, 4x<32*36>, writeS;
+
+// offset = offset2*32*36 + (tid & 31)*4
+--:-:-:-:1      SHL tid31, tid31, 2;
+--:-:-:-:0      XMAD.LO2 offset, offset2, 1x<32*36>, tid31;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:6  @P1 BRA.U FILTER_SETUP;
+
+##############################################################
+IMAGE_SETUP:
+// Reached by threads with tid < 64 (P1 clear): build the 64-bit global pointer track from param_I, then jump to LOAD.
+<SCHEDULE_BLOCK>
+// (GC32,GY,GX,N,6,6,32)
+// offset += idx_C * YXN*32*36
+--:-:-:-:1      XMAD.LO2C offset, idx_C, param_YXN_1152, offset;
+
+--:-:-:-:1      LEA      track0.CC, offset, param_I[0],     [+ dtype_shift() +];
+--:-:-:-:0      LEA.HI.X track1,    offset, param_I[1], RZ, [+ dtype_shift() +];
+</SCHEDULE_BLOCK>
+
+--:-:-:-:6      BRA.U LOAD;
+
+##############################################################
+FILTER_SETUP:
+// Reached by threads with 64 <= tid < 128 (P1 set): move writeS past the first 4x<32*36*2> bytes, build track from param_E, then fall through to LOAD.
+<SCHEDULE_BLOCK>
+// writeS += 32*36*2*4
+--:-:-:-:1      IADD writeS, writeS, 4x<32*36*2>;
+
+// (GK32,GY,GX,N,6,6,32)
+// offset += idx_K * YXN*32*36
+--:-:-:-:1      XMAD.LO2C offset, idx_K, param_YXN_1152, offset;
+
+--:-:-:-:1      LEA      track0.CC, offset, param_E[0],     [+ dtype_shift() +];
+--:-:-:-:2      LEA.HI.X track1,    offset, param_E[1], RZ, [+ dtype_shift() +];
+</SCHEDULE_BLOCK>
+
+##############################################################
+LOAD:
+// First load: fill T0-T8 from global memory when P6 holds (zeros otherwise), store them at writeS, swap buffers, start the first shared reads, prefetch the next T0-T8 when P5 holds, then enter LOAD_LOOP.
+20:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T0, [track + 4x<0*32 * $dtype_size>];
+--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T1, [track + 4x<1*32 * $dtype_size>];
+--:-:2:-:1  @P6 LDG.E.[+ vec_size() +] T2, [track + 4x<2*32 * $dtype_size>];
+// Threads with P6 clear read from addr_zero instead; each LDS group sets the same barrier number as the LDG group it replaces.
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T0, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T1, [addr_zero];
+--:-:2:-:1 @!P6 LDS.U.[+ vec_size() +] T2, [addr_zero];
+
+--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T3, [track + 4x<3*32 * $dtype_size>];
+--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T4, [track + 4x<4*32 * $dtype_size>];
+--:-:3:-:1  @P6 LDG.E.[+ vec_size() +] T5, [track + 4x<5*32 * $dtype_size>];
+
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T3, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T4, [addr_zero];
+--:-:3:-:1 @!P6 LDS.U.[+ vec_size() +] T5, [addr_zero];
+
+--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T6, [track + 4x<6*32 * $dtype_size>];
+--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T7, [track + 4x<7*32 * $dtype_size>];
+--:-:4:-:1  @P6 LDG.E.[+ vec_size() +] T8, [track + 4x<8*32 * $dtype_size>];
+
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T6, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T7, [addr_zero];
+--:-:4:-:1 @!P6 LDS.U.[+ vec_size() +] T8, [addr_zero];
+
+[+
+    our $convert_in; # true selects the first variant, which runs F2F.F32.F16 on each half of T<n>0/T<n>1 before the STS.128; false selects the plain stores
+    return $convert_in ? q{
+
+02:-:-:-:1      F2F.F32.F16 T03, T01.H1;
+--:-:-:-:1      F2F.F32.F16 T02, T01.H0;
+--:-:-:-:1      F2F.F32.F16 T01, T00.H1;
+--:-:2:-:1      F2F.F32.F16 T00, T00.H0;
+
+--:-:-:-:1      F2F.F32.F16 T13, T11.H1;
+--:-:-:-:1      F2F.F32.F16 T12, T11.H0;
+--:-:-:-:1      F2F.F32.F16 T11, T10.H1;
+--:-:5:-:1      F2F.F32.F16 T10, T10.H0;
+
+--:-:-:-:1      F2F.F32.F16 T23, T21.H1;
+--:-:-:-:1      F2F.F32.F16 T22, T21.H0;
+--:-:-:-:1      F2F.F32.F16 T21, T20.H1;
+--:-:6:-:1      F2F.F32.F16 T20, T20.H0;
+
+02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;
+
+04:-:-:-:1      F2F.F32.F16 T33, T31.H1;
+--:-:-:-:1      F2F.F32.F16 T32, T31.H0;
+--:-:-:-:1      F2F.F32.F16 T31, T30.H1;
+--:-:3:-:1      F2F.F32.F16 T30, T30.H0;
+
+10:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;
+
+--:-:-:-:1      F2F.F32.F16 T43, T41.H1;
+--:-:-:-:1      F2F.F32.F16 T42, T41.H0;
+--:-:-:-:1      F2F.F32.F16 T41, T40.H1;
+--:-:5:-:1      F2F.F32.F16 T40, T40.H0;
+
+20:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;
+
+--:-:-:-:1      F2F.F32.F16 T53, T51.H1;
+--:-:-:-:1      F2F.F32.F16 T52, T51.H0;
+--:-:-:-:1      F2F.F32.F16 T51, T50.H1;
+--:-:6:-:1      F2F.F32.F16 T50, T50.H0;
+
+04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;
+
+08:-:-:-:1      F2F.F32.F16 T63, T61.H1;
+--:-:-:-:1      F2F.F32.F16 T62, T61.H0;
+--:-:-:-:1      F2F.F32.F16 T61, T60.H1;
+--:-:4:-:1      F2F.F32.F16 T60, T60.H0;
+
+10:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;
+
+--:-:-:-:1      F2F.F32.F16 T73, T71.H1;
+--:-:-:-:1      F2F.F32.F16 T72, T71.H0;
+--:-:-:-:1      F2F.F32.F16 T71, T70.H1;
+--:-:5:-:1      F2F.F32.F16 T70, T70.H0;
+
+20:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;
+
+--:-:-:-:1      F2F.F32.F16 T83, T81.H1;
+--:-:-:-:1      F2F.F32.F16 T82, T81.H0;
+--:-:-:-:1      F2F.F32.F16 T81, T80.H1;
+--:-:6:-:1      F2F.F32.F16 T80, T80.H0;
+
+08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
+10:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
+20:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;
+
+    } : q{
+02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;
+--:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;
+--:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;
+04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;
+--:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;
+--:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;
+08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
+--:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
+--:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;
+    };
++]
+// Step the 64-bit track pointer back by param_stride_YXNp: low word here, high word by the IADD.X below.
+--:-:-:-:0      IADD   track0.CC, track0, -param_stride_YXNp;
+
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeS, writeS, swapBuf; // the next stores go to the other buffer
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf; // flip the sign so the following swap goes back
+
+--:-:-:-:0      IADD.X track1,    track1, -RZ; // high word of the track0.CC subtraction above
+
+--:-:-:-:1      LDS.U.128 jl0Iy0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jl0Fx0, [readFs + 4x<0*32*36 + 00>];
+--:-:1:-:1      LDS.U.128 jl0Iy4, [readIs + 4x<0*32*36 + 16>];
+// Prefetch the next T0-T8 only when P5 holds (P5 = reduce_YXN > 1, set in LOAD_SETUP).
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T0, [track + 4x<0*32 * $dtype_size>];
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T1, [track + 4x<1*32 * $dtype_size>];
+--:-:2:-:1  @P5 LDG.E.[+ vec_size() +] T2, [track + 4x<2*32 * $dtype_size>];
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T3, [track + 4x<3*32 * $dtype_size>];
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T4, [track + 4x<4*32 * $dtype_size>];
+--:-:3:-:1  @P5 LDG.E.[+ vec_size() +] T5, [track + 4x<5*32 * $dtype_size>];
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T6, [track + 4x<6*32 * $dtype_size>];
+--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T7, [track + 4x<7*32 * $dtype_size>];
+--:6:4:-:1  @P5 LDG.E.[+ vec_size() +] T8, [track + 4x<8*32 * $dtype_size>];
+
+--:-:-:-:5      BRA.U LOAD_LOOP;
+
+##############################################################
+
+COMPUTE_SETUP:
+// Threads with tid >= 128: load czero00..czero60 from addr_zero (sixteen LDS.U.128), derive readFs/readIs from tid - 128, wait at the barrier, then issue the first jc0 shared loads.
+<SCHEDULE_BLOCK>
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+--:-:-:-:1      IADD tid128, tid, -128;
+
+// readFs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readIs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid16,  tid128, -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,   1;
+
+--:-:-:-:1      BFE.U32  readFs, tid128, 0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readFs, readFs, tid16;
+--:-:-:-:1      ISCADD   readFs, readFs, 4x<32*4 + 32*36*2>, 4; // readFs = readFs*16 + 4x<32*4 + 32*36*2>
+
+--:-:-:-:1      LOP.AND  tid_1,  tid128, 1;
+--:-:-:-:1      LOP.AND  readIs, tid128, 8;
+--:-:-:-:1      SHR.U32  readIs, readIs, 2;
+--:-:-:-:1      IADD3    readIs, readIs, tid16, tid_1; // the three terms occupy disjoint bits, so the add acts as the OR above
+--:-:-:-:0      ISCADD   readIs, readIs, 4x<32*4>, 4; // readIs = readIs*16 + 4x<32*4>
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      LDS reduce_YXN, [addr_rYXN]; // value stored by thread 0 in LOAD_SETUP
+
+--:-:-:-:1      LDS.U.128 jc0Iy0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Fx0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Iy4, [readIs + 4x<0*32*36 + 16>];
+--:-:1:-:2      LDS.U.128 jc0Fx4, [readFs + 4x<0*32*36 + 16>];
+
+COMPUTE_LOOP:
+[+
+    my %insert = (
+        # Extra instructions keyed "jPcN": the text is emitted right after FFMA number N of pass P. Here: set P0 and decrement the loop counter.
+        j0c33 => "--:-:-:-:1      ISETP.GT.AND P0, PT, reduce_YXN, 1, PT;\n" .
+                 "--:-:-:-:1      IADD reduce_YXN, reduce_YXN, -1;\n",
+        # Barrier, then add swapBuf to both read offsets and negate swapBuf so the next iteration moves the other way.
+        j0c62 => "02:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readIs, readIs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD readFs, readFs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD swapBuf, RZ,   -swapBuf;\n",
+        # Loop again while P0 (computed at j0c33) holds, otherwise continue at COMPUTE_FINISH.
+        j1c63 => "--:-:-:Y:5  \@P0 BRA.U COMPUTE_LOOP;\n" .
+                 "--:-:-:Y:5      BRA.U COMPUTE_FINISH;\n",
+    );
+    my @cOrder; # [x,y] accumulator coordinates in FFMA emission order (4 x-steps * 4 y-steps * 4 swirl offsets = 64)
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y; # walk the y list in the opposite direction for the next x step
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        my $odd    = $j;
+        my $nOdd   = 1 - $j;
+        my $rsPred = $j == 1 ? '@P0' : '   ';
+        my $bar    = $j == 0 ? '2' : '-';
+        # While pass $j multiplies operand set jc$odd, the four loads below refill set jc$nOdd; in pass 1 they are predicated on P0.
+        $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dIy4, [readIs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFx4, [readFs + 4x<%d*32*36 + 16>];\n", $rsPred, $nOdd, $nOdd;
+        $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dIy0, [readIs + 4x<%d*32*36 + 00>];\n", $rsPred, $nOdd, $nOdd;
+
+        $insert{"j${j}c31"} = sprintf "--:%s:1:-:1  %s LDS.U.128 jc%dFx0, [readFs + 4x<%d*32*36 + 00>];\n", $bar, $rsPred, $nOdd, $nOdd;
+
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Stall field of the FFMA is 0 when the first inserted line uses one of these opcodes, otherwise 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+            # Yield flag on every 10th FFMA, but only when that FFMA has stall 1.
+            my $yield  = $c % 10 == 0 && $stall ? 'Y' : '-';
+            # The first FFMA of each pass carries wait mask 01; all others wait on nothing.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA ccx%dy%d, jc%dFx%d, jc%dIy%d, ccx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+LOAD_LOOP:
+--:-:-:-:1      ISETP.GT.AND P0, PT, reduce_YXN, 1, PT; // P0 = reduce_YXN > 1 (value before the decrement below)
+20:-:-:-:1      IADD track0.CC, track0, -param_stride_YXNp;
+--:-:-:-:1      ISETP.GT.AND P1, PT, reduce_YXN, 2, PT; // P1 = reduce_YXN > 2 (value before the decrement below)
+--:-:-:-:1      IADD reduce_YXN, reduce_YXN, -1;
+[+
+    our ($vec_size, $dtype_size, $convert_in);
+    my %insert = (
+        # Extra instructions keyed "jPcN": the text is emitted right after FFMA number N of pass P (see the loop at the bottom).
+        j0c3 => "--:-:-:-:1      IADD.X track1, track1, -RZ;\n",
+        # Pass 0 loads operand set jl1; pass 1 reloads set jl0 for the next iteration, only under P0.
+        j0c0  => "--:-:-:-:1      LDS.U.128 jl1Iy4, [readIs + 4x<1*32*36 + 16>];\n",
+        j0c2  => "--:-:-:-:1      LDS.U.128 jl1Fx0, [readFs + 4x<1*32*36 + 00>];\n",
+        j0c18 => "--:-:1:-:1      LDS.U.128 jl1Iy0, [readIs + 4x<1*32*36 + 00>];\n",
+
+        j1c12 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Iy4, [readIs + 4x<0*32*36 + 16>];\n",
+        j1c14 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Fx0, [readFs + 4x<0*32*36 + 00>];\n",
+        j1c16 => "--:-:1:-:1  \@P0 LDS.U.128 jl0Iy0, [readIs + 4x<0*32*36 + 00>];\n",
+
+        $convert_in ? (
+            # convert_in set: each T vector gets four F2F.F32.F16 conversions before its STS.128 (under P0); the LDG refills run under P1.
+            j0c1  => "02:-:-:-:1      F2F.F32.F16 T03, T01.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T02, T01.H0;\n",
+            j0c4  => "--:-:-:-:1      F2F.F32.F16 T01, T00.H1;\n" .
+                     "--:-:2:-:1      F2F.F32.F16 T00, T00.H0;\n",
+
+            j0c5  => "--:-:-:-:1      F2F.F32.F16 T13, T11.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T12, T11.H0;\n",
+            j0c6  => "--:-:-:-:1      F2F.F32.F16 T11, T10.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T10, T10.H0;\n",
+
+            j0c7  => "--:-:-:-:1      F2F.F32.F16 T23, T21.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T22, T21.H0;\n",
+            j0c8  => "--:-:-:-:1      F2F.F32.F16 T21, T20.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T20, T20.H0;\n",
+
+            j0c9  => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n",
+            j0c10 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
+            j0c11 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",
+
+            j0c13 => "02:-:-:-:1  \@P1 LDG.E.$vec_size T0, [track + 4x<0*32 * $dtype_size>];\n",
+            j0c14 => "10:-:-:-:1  \@P1 LDG.E.$vec_size T1, [track + 4x<1*32 * $dtype_size>];\n",
+            j0c15 => "20:-:2:-:1  \@P1 LDG.E.$vec_size T2, [track + 4x<2*32 * $dtype_size>];\n",
+
+            j0c16 => "04:-:-:-:1      F2F.F32.F16 T33, T31.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T32, T31.H0;\n",
+            j0c17 => "--:-:-:-:1      F2F.F32.F16 T31, T30.H1;\n" .
+                     "--:-:3:-:1      F2F.F32.F16 T30, T30.H0;\n",
+
+            j0c19 => "--:-:-:-:1      F2F.F32.F16 T43, T41.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T42, T41.H0;\n",
+            j0c20 => "--:-:-:-:1      F2F.F32.F16 T41, T40.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T40, T40.H0;\n",
+
+            j0c21 => "--:-:-:-:1      F2F.F32.F16 T53, T51.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T52, T51.H0;\n",
+            j0c22 => "--:-:-:-:1      F2F.F32.F16 T51, T50.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T50, T50.H0;\n",
+
+            j0c23 => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
+            j0c24 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
+            j0c25 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",
+
+            j0c27 => "04:-:-:-:1  \@P1 LDG.E.$vec_size T3, [track + 4x<3*32 * $dtype_size>];\n",
+            j0c28 => "10:-:-:-:1  \@P1 LDG.E.$vec_size T4, [track + 4x<4*32 * $dtype_size>];\n",
+            j0c29 => "20:-:3:-:1  \@P1 LDG.E.$vec_size T5, [track + 4x<5*32 * $dtype_size>];\n",
+
+            j0c30 => "08:-:-:-:1      F2F.F32.F16 T63, T61.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T62, T61.H0;\n",
+            j0c31 => "--:-:-:-:1      F2F.F32.F16 T61, T60.H1;\n" .
+                     "--:-:4:-:1      F2F.F32.F16 T60, T60.H0;\n",
+
+            j1c0  => "--:-:-:-:1      F2F.F32.F16 T73, T71.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T72, T71.H0;\n",
+            j1c1  => "--:-:-:-:1      F2F.F32.F16 T71, T70.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T70, T70.H0;\n",
+
+            j1c2  => "--:-:-:-:1      F2F.F32.F16 T83, T81.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T82, T81.H0;\n",
+            j1c3  => "--:-:-:-:1      F2F.F32.F16 T81, T80.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T80, T80.H0;\n",
+
+            j1c4  => "08:4:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
+            j1c5  => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
+            j1c6  => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",
+
+            j1c8  => "08:-:-:-:1  \@P1 LDG.E.$vec_size T6, [track + 4x<6*32 * $dtype_size>];\n",
+            j1c9  => "10:-:-:-:1  \@P1 LDG.E.$vec_size T7, [track + 4x<7*32 * $dtype_size>];\n",
+            j1c10 => "20:6:4:-:1  \@P1 LDG.E.$vec_size T8, [track + 4x<8*32 * $dtype_size>];\n",
+
+        ) : (
+            # convert_in not set: T vectors are stored as loaded (STS.128), then refilled by LDG under P1 for the next iteration.
+            j0c6  => "02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;\n", # NOTE(review): the only STS in this template without the P0 predicate - confirm this is intended
+            j0c8  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
+            j0c10 => "--:2:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",
+
+            j0c12 => "02:-:-:-:1  \@P1 LDG.E.$vec_size T0, [track + 4x<0*32 * $dtype_size>];\n",
+            j0c14 => "--:-:-:-:1  \@P1 LDG.E.$vec_size T1, [track + 4x<1*32 * $dtype_size>];\n",
+            j0c16 => "--:-:2:-:1  \@P1 LDG.E.$vec_size T2, [track + 4x<2*32 * $dtype_size>];\n",
+
+            j0c20 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
+            j0c22 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
+            j0c24 => "--:3:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",
+
+            j0c26 => "04:-:-:-:1  \@P1 LDG.E.$vec_size T3, [track + 4x<3*32 * $dtype_size>];\n",
+            j0c28 => "--:-:-:-:1  \@P1 LDG.E.$vec_size T4, [track + 4x<4*32 * $dtype_size>];\n",
+            j0c30 => "--:-:3:-:1  \@P1 LDG.E.$vec_size T5, [track + 4x<5*32 * $dtype_size>];\n",
+
+            j1c0  => "08:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
+            j1c2  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
+            j1c4  => "--:4:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",
+
+            j1c6  => "08:-:-:-:1  \@P1 LDG.E.$vec_size T6, [track + 4x<6*32 * $dtype_size>];\n",
+            j1c8  => "--:-:-:-:1  \@P1 LDG.E.$vec_size T7, [track + 4x<7*32 * $dtype_size>];\n",
+            j1c10 => "--:6:4:-:1  \@P1 LDG.E.$vec_size T8, [track + 4x<8*32 * $dtype_size>];\n",
+        ),
+        # Barrier, then (under P0) move both read offsets back by swapBuf, advance writeS by swapBuf and negate swapBuf.
+        j1c11 => "--:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeS, writeS,  swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,    -swapBuf;\n",
+        # Repeat while P0; otherwise execution falls through to the code that follows this template.
+        j1c31 => "--:-:-:Y:5  \@P0 BRA.U LOAD_LOOP;\n",
+    );
+
+    my @cOrder; # [x,y] accumulator coordinates in FFMA emission order (8 base pairs * 4 swirl offsets = 32 per pass)
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
+    {
+        my ($x, $y) = @$xy;
+        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # The first FFMA of each pass carries wait mask 01.
+            my $wait   = $c == 0 ? "01" : '--';
+            # Stall field of the FFMA is 0 when the first inserted line uses one of these opcodes, otherwise 1.
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:-:$stall";
+
+            $out .= sprintf "%s      FFMA clx%dy%d, jl%dFx%d, jl%dIy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+--:-:1:-:2      S2R Tid, SR_TID.X;
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha16, param_alpha;
+// Reached when the LOAD_LOOP branch is not taken: compute write16Cs, then in four rounds scale the clx accumulators by alpha16 and store them with STS.128.
+01:-:-:-:1      LOP.AND  Tid32_2,  Tid,    -32;
+--:-:-:-:1      SHR.U32  Tid32_2,  Tid32_2, 2;
+
+// readIs = ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND Tid1,   Tid,    1;
+01:-:-:-:1      LOP.AND readIs, Tid,    16;
+--:-:-:-:1      SHR.U32 readIs, readIs, 3;
+--:-:-:-:1      IADD    readIs, readIs, Tid1;
+
+// readFs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readIs << 2)
+--:-:-:-:1      BFE.U32 readFs, Tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readFs, readFs, Tid32_2;
+--:-:-:-:1      ISCADD  readFs, readIs, readFs, 2;
+// Scale before combining: readFs is shifted left by 4, readIs by 3.
+--:-:-:-:1      SHL     readFs, readFs, 4;
+--:-:-:-:1      SHL     readIs, readIs, 3;
+
+// write16Cs = readIs * 32*36 + readFs;
+--:-:-:-:1      XMAD write16Cs, readIs, 1x<32*36>, readFs;
+</SCHEDULE_BLOCK>
+// Round 1 of 4: accumulators y0 and y2 (x0-x3). This first round stores without a preceding barrier.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y2, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y2, alpha16;
+--:-:-:-:4      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Round 2 of 4: accumulators y1 and y3; from here on each round has a barrier before and after its two stores.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y3, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y3, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Round 3 of 4: accumulators y4 and y6.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y6, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y6, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Round 4 of 4: accumulators y5 and y7.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y7, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y7, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      EXIT;
+
+COMPUTE_FINISH:
+// Entered from COMPUTE_LOOP once P0 is false: derive writeCs, the store offset used by the STS.128 rounds that follow.
+--:-:1:-:2      S2R tid_128, SR_TID.X;
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+01:-:-:-:1      IADD tid_128, tid_128, -128;
+// P4 (tid_128 >= 256, tested after the subtraction above) guards the "BRA.U SKIPn" branches further down.
+--:-:-:-:1      ISETP.GE.AND P4, PT, tid_128, 256, PT;
+
+// readIs2 = ((tid_128 &  8) >> 2) | (tid_128 & 1)
+--:-:-:-:1      LOP.AND  Tid_1,   tid_128, 1;
+--:-:-:-:1      LOP.AND  readIs2, tid_128, 8;
+--:-:-:-:1      SHR.U32  readIs2, readIs2, 2;
+--:-:-:-:1      IADD     readIs2, readIs2, Tid_1;
+
+// readFs2 = ((tid_128 & -16) >> 1) | ((tid_128 >> 1) & 3) | (readIs2 << 2)
+--:-:-:-:1      LOP.AND  tid_16,   tid_128, -16;
+--:-:-:-:1      SHR.U32  tid_16,   tid_16,   1;
+--:-:-:-:1      BFE.U32  readFs2,  tid_128,  0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readFs2,  readFs2,  tid_16;
+--:-:-:-:1      ISCADD   readFs2,  readIs2, readFs2, 2;
+// Scale before combining: readFs2 is shifted left by 4 and biased by a constant, readIs2 is shifted left by 3.
+--:-:-:-:1      ISCADD   readFs2, readFs2,  4x<32*4>, 4;
+--:-:-:-:1      SHL      readIs2, readIs2, 3;
+
+// writeCs = readIs2 * 32*36 + readFs2;
+--:-:-:-:0      XMAD writeCs, readIs2, 1x<32*36>, readFs2;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P4 BRA.U SKIP0;
+// Threads without P4 compute their coordinates k and c, the read offset readCs, the Out0 pointer and the bound predicate P0.
+--:-:1:-:1      LDS idxK, [addr_idx_K];
+--:-:2:-:1      LDS idxC, [addr_idx_C];
+[+ our $determ; return $determ ? q{--:-:3:-:1      LDS idxI, [addr_iYXN];} : ''; +]
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      LOP.AND tid_31, tid_128, 31;
+--:-:-:-:1      SHR.U32 tid_32, tid_128,  5;
+--:-:-:-:1      SHR.U32 tid_64, tid_128,  6;
+
+// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16
+--:-:-:-:1      XMAD   readCs, tid_32, 1x<32*36>, tid_31;
+--:-:-:-:1      ISCADD readCs, tid_64, readCs, 4;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// k = K_blk*32 + tid_31
+// c = C_blk*32 + tid_32<<1
+--:-:-:-:1      SHL tid_32, tid_32, 1;
+01:-:-:-:1      ISCADD  k, idxK, tid_31, 5;
+02:-:-:-:1      ISCADD  c, idxC, tid_32, 5;
+
+// NOTE(review): the XMAD below only forms c*RSK + k; the r*SK and s*K terms are presumably applied later through the Out pointer increments - confirm.
+// offsetF = c*RSK + r*SK + s*K + k
+--:-:-:-:1      XMAD.LO2C offsetF, c, param_RSK, k;
+// When determ is set, idxI * CRSK is added to offsetF as well.
+[+
+    our $determ;
+    return $determ ? q{
+--:-:-:-:1      MOV CRSK, param_CRSK;
+04:-:-:-:1      XMAD.LO offsetF, idxI, CRSK, offsetF, xmad_determ;
+    } : '';
++]
+
+--:-:-:-:1      LEA      Out00.CC, offsetF, param_F[0],     2;
+--:-:-:-:1      LEA.HI.X Out01,    offsetF, param_F[1], RZ, 2;
+
+// P0 (k < param_K) is combined with the c bound into P1 at the top of OUTPUT_TRANSFORM.
+--:-:-:-:1      ISETP.LT.AND P0, PT, k, param_K, PT;
+</SCHEDULE_BLOCK>
+
+SKIP0:
+// Round 1 of 4: derive Out1 and Out2 from Out0 (adding param_Kp each time), scale ccx y0/y2 by alpha and store them via writeCs.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD     Out10.CC, Out00, param_Kp;
+--:-:-:-:1      IADD.X   Out11,    Out01, RZ;
+--:-:-:-:1      IADD     Out20.CC, Out10, param_Kp;
+--:-:-:-:1      IADD.X   Out21,    Out11, RZ;
+
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y0, alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, ccx7y0, alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y2, alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, ccx3y2, alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y2, alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, ccx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+// Threads without P4 call OUTPUT_TRANSFORM, then advance the three Out pointers by param_SKp and c by 1.
+--:-:-:-:5  @P4 BRA.U SKIP1;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+01:-:-:-:5      IADD   Out00.CC, Out00, param_SKp;
+--:-:-:-:1      IADD c, c, 1;
+--:-:-:-:1      IADD.X Out01,    Out01, RZ;
+02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
+--:-:-:-:1      IADD.X Out11,    Out11, RZ;
+04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
+--:-:-:-:1      IADD.X Out21,    Out21, RZ;
+
+SKIP1:
+// Round 2 of 4: ccx y1 and y3; the FMULs are interleaved with the stores after the barrier.
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y1, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y1, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y1, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+// After round 2 the pointers advance by param_RSK15_SK2p and c by 15 (rounds 1 and 3 use param_SKp and 1).
+--:-:-:-:5  @P4 BRA.U SKIP2;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+01:-:-:-:5      IADD   Out00.CC, Out00, param_RSK15_SK2p;
+--:-:-:-:1      IADD c, c, 15;
+--:-:-:-:1      IADD.X Out01,    Out01, RZ;
+02:-:-:-:6      IADD   Out10.CC, Out10, param_RSK15_SK2p;
+--:-:-:-:1      IADD.X Out11,    Out11, RZ;
+04:-:-:-:6      IADD   Out20.CC, Out20, param_RSK15_SK2p;
+--:-:-:-:1      IADD.X Out21,    Out21, RZ;
+
+SKIP2:
+// Round 3 of 4: ccx y4 and y6.
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y4, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y4, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y4, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y6, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP3;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+01:-:-:-:5      IADD   Out00.CC, Out00, param_SKp;
+--:-:-:-:1      IADD c, c, 1;
+--:-:-:-:1      IADD.X Out01,    Out01, RZ;
+02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
+--:-:-:-:1      IADD.X Out11,    Out11, RZ;
+04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
+--:-:-:-:1      IADD.X Out21,    Out21, RZ;
+
+SKIP3:
+// Round 4 of 4: ccx y5 and y7; no pointer update follows the final OUTPUT_TRANSFORM call.
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y5, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y5, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y5, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP4;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP4:
+
+--:-:-:-:5      EXIT;
+
+OUTPUT_TRANSFORM:
+// Subroutine (entered with CAL, left with RET): loads 36 values mJI from readCs, reduces them to nine values s00..s22 and stores those under P1.
+--:-:-:-:0      ISETP.LT.AND P1, PT, c, param_C, P0;
+// 36 loads, six per value of i; the last load of group i names barrier i+1 in its third control field.
+[+
+    my $out;
+    foreach my $i (0 .. 5)
+    {
+        foreach my $j (0 .. 5)
+        {
+            my $b = $j == 5 ? $i + 1 : '-'; # barrier number, only on the sixth load of each group
+            $out .= "--:-:$b:-:1      LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n";
+        }
+    }
+    return $out;
++]
+<SCHEDULE_BLOCK>
+[+
+    my $out;
+    foreach my $i (0 .. 5)
+    {
+        my $w = sprintf "%02x", 1 << $i; # wait mask with bit i set, carried by the first two FADDs of group i
+        $out .= qq{
+$w:-:-:-:1      FADD t0,   m1$i,  m2$i;
+$w:-:-:-:1      FADD t1,   m3$i,  m4$i;
+--:-:-:-:1      FADD m1$i, m1$i, -m2$i;
+--:-:-:-:1      FADD m3$i, m3$i, -m4$i;
+--:-:-:-:1      FADD w0$i, m0$i,  t0;
+--:-:-:-:1      FADD w0$i, w0$i,  t1;
+--:-:-:-:1      FMUL w1$i, m1$i,  0.625;
+--:-:-:-:1      FFMA w1$i, m3$i,  1.5,      w1$i;
+--:-:-:-:1      FFMA w2$i, t1,    2.25,     m5$i;
+--:-:-:-:1      FFMA w2$i, t0,    0.390625, w2$i;
+        };
+    }
+    return $out;
++]
+</SCHEDULE_BLOCK>
+// Both passes compute: out0 = a0+a1+a2+a3+a4, out1 = 0.625*(a1-a2) + 1.5*(a3-a4), out2 = 0.390625*(a1+a2) + 2.25*(a3+a4) + a5. The pass below applies it to w rows 0..2.
+<SCHEDULE_BLOCK>
+[+
+    my $out;
+    foreach my $i (0 .. 2)
+    {
+        $out .= qq{
+--:-:-:-:1      FADD t0,     w${i}1,  w${i}2;
+--:-:-:-:1      FADD t1,     w${i}3,  w${i}4;
+--:-:-:-:1      FADD w${i}1, w${i}1, -w${i}2;
+--:-:-:-:1      FADD w${i}3, w${i}3, -w${i}4;
+--:-:-:-:1      FADD s${i}0, w${i}0,  t0;
+--:-:-:-:1      FADD s${i}0, s${i}0,  t1;
+--:-:-:-:1      FMUL s${i}1, w${i}1,  0.625;
+--:-:-:-:1      FFMA s${i}1, w${i}3,  1.5,      s${i}1;
+--:-:-:-:1      FFMA s${i}2, t1,      2.25,     w${i}5;
+--:-:-:-:1      FFMA s${i}2, t0,      0.390625, s${i}2;
+        };
+    }
+    return $out;
++]
+
+//--:-:1:-:1      I2F.F32.S32 temp, c;
+// Store the nine results three at a time through Out0/Out1/Out2; after each of the first two groups the pointers advance by param_SKp.
+<ORDERED>
+--:1:-:-:1  @P1 [+ output_op() +] [Out0], s00;
+--:2:-:-:1  @P1 [+ output_op() +] [Out1], s01;
+--:3:-:-:1  @P1 [+ output_op() +] [Out2], s02;
+01:-:-:-:6      IADD   Out00.CC, Out00, param_SKp;
+--:-:-:-:1      IADD.X Out01,    Out01, RZ;
+02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
+--:-:-:-:1      IADD.X Out11,    Out11, RZ;
+04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
+--:-:-:-:1      IADD.X Out21,    Out21, RZ;
+</ORDERED>
+
+<ORDERED>
+--:1:-:-:1  @P1 [+ output_op() +] [Out0], s10;
+--:2:-:-:1  @P1 [+ output_op() +] [Out1], s11;
+--:3:-:-:1  @P1 [+ output_op() +] [Out2], s12;
+01:-:-:-:6      IADD   Out00.CC, Out00, param_SKp;
+--:-:-:-:1      IADD.X Out01,    Out01, RZ;
+02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
+--:-:-:-:1      IADD.X Out11,    Out11, RZ;
+04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
+--:-:-:-:1      IADD.X Out21,    Out21, RZ;
+</ORDERED>
+
+<ORDERED>
+--:1:-:-:1  @P1 [+ output_op() +] [Out0], s20;
+--:2:-:-:1  @P1 [+ output_op() +] [Out1], s21;
+--:3:-:-:1  @P1 [+ output_op() +] [Out2], s22;
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      RET;
diff --git a/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32.sass b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32.sass
new file mode 100644
index 0000000..d4b2941
--- /dev/null
+++ b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32.sass
@@ -0,0 +1,1237 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+# Element-type settings: $type 'h' selects the 16-bit values, any other $type the 32-bit ones.
+our $type;
+my $half = $type eq 'h';
+our ($dtype, $convert_in, $convert_out) = $half ? ('U16', 'F2F.F32.F16', 'F2F.F16.F32') : ('32', '', '');
+our ($dshift, $dsize, $vsize)           = $half ? (  '1',   '2',  '64') : ( '2',  '4', '128');
+
+# Accessor subs returning the package variables set above.
+sub dtype  { $dtype  }
+sub dsize  { $dsize  }
+sub dshift { $dshift }
+sub vsize  { $vsize  }
+-]
+
+<CONSTANT_MAPPING>
+
+    addr_zero   : 4x<32*36*2*4 + 64 + 0>
+    addr_idx_Y  : 4x<32*36*2*4 + 64 + 4>
+    addr_idx_X  : 4x<32*36*2*4 + 64 + 5>
+    addr_idx_K  : 4x<32*36*2*4 + 64 + 6>
+
+    param_S[0]           : c[0x0][0x140]
+    param_S[1]           : c[0x0][0x144]
+    param_X[0]           : c[0x0][0x148]
+    param_X[1]           : c[0x0][0x14c]
+    param_O[0]           : c[0x0][0x150]
+    param_O[1]           : c[0x0][0x154]
+    param_I[0]           : c[0x0][0x158]
+    param_I[1]           : c[0x0][0x15c]
+    param_F[0]           : c[0x0][0x160]
+    param_F[1]           : c[0x0][0x164]
+    param_alpha          : c[0x0][0x168]
+    param_beta           : c[0x0][0x16c]
+    param_flags          : c[0x0][0x170]
+    param_C              : c[0x0][0x174]
+    param_K              : c[0x0][0x178]
+    param_N              : c[0x0][0x17c]
+    param_Y              : c[0x0][0x180]
+    param_W              : c[0x0][0x184]
+    param_YXN            : c[0x0][0x188]
+    param_XN             : c[0x0][0x18c]
+    param_Y2             : c[0x0][0x190]
+    param_GX             : c[0x0][0x194]
+    param_Xk             : c[0x0][0x198]
+    param_k              : c[0x0][0x19c]
+    param_magic_Xk       : c[0x0][0x1a0]
+    param_shift_Xk       : c[0x0][0x1a4]
+    param_magic_k        : c[0x0][0x1a8]
+    param_shift_k        : c[0x0][0x1ac]
+    param_P              : c[0x0][0x1b0]
+    param_Q              : c[0x0][0x1b4]
+    param_QN             : c[0x0][0x1b8]
+    param_PQN            : c[0x0][0x1bc]
+    param_PQN15          : c[0x0][0x1c0]
+    param_maskN          : c[0x0][0x1c4]
+    param_shiftX         : c[0x0][0x1c8]
+    param_shiftY         : c[0x0][0x1cc]
+    param_superX         : c[0x0][0x1d0]
+    param_superY         : c[0x0][0x1d4]
+    param_pad_x          : c[0x0][0x1d8]
+    param_pad_y          : c[0x0][0x1dc]
+    param_RSK            : c[0x0][0x1e0]
+    param_RSK2p          : c[0x0][0x1e4]
+    param_YXN2p          : c[0x0][0x1e8]
+    param_gridN          : c[0x0][0x1ec]
+    param_gridQN         : c[0x0][0x1f0]
+    param_gridPQN        : c[0x0][0x1f4]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+       0-63 : czero<00-63>
+
+     3, 2,11,10 : clx<0-3>y0
+     7, 6,15,14 : clx<0-3>y1
+     1, 0, 9, 8 : clx<0-3>y2
+     5, 4,13,12 : clx<0-3>y3
+    19,18,27,26 : clx<0-3>y4
+    23,22,31,30 : clx<0-3>y5
+    17,16,25,24 : clx<0-3>y6
+    21,20,29,28 : clx<0-3>y7
+
+      32-43 : jl0Ix<0-3>, jl0Fy<0-7>
+      44-51 : jl1Ix<0-3>, jl1Fy<4-7>
+      36-39 : jl1Fy<0-3>
+
+      32-43 ~ swapBuff
+
+      88-89 : track<0-1>
+      90-92 : writeS, pred30, pred36
+
+      // Image Transform
+      44-51 ~ ti<0-5>
+
+      52,53,54,56,57,55 : i<0-5>0
+      59,60,61,63,58,62 : i<0-5>1
+      66,67,68,64,65,69 : i<0-5>2
+      73,74,75,71,72,70 : i<0-5>3
+      87,82,83,85,86,84 : i<0-5>4
+      80,81,76,78,79,77 : i<0-5>5
+
+      52,53,54,56,57,55 : TI<0-5>0
+      59,60,61,63,58,62 : TI<0-5>1
+      66,67,68,64,65,69 : TI<0-5>2
+      73,74,75,71,72,70 : TI<0-5>3
+      87,82,83,85,86,84 : TI<0-5>4
+      80,81,76,78,79,77 : TI<0-5>5
+
+      52,53,54,56,57,55 : I<0-5>0
+      59,60,61,63,58,62 : I<0-5>1
+      66,67,68,64,65,69 : I<0-5>2
+      73,74,75,71,72,70 : I<0-5>3
+      87,82,83,85,86,84 : I<0-5>4
+      80,81,76,78,79,77 : I<0-5>5
+
+      // Filter Transform
+      44-47 ~ rcp6, rcp8, rcp12, rcp24
+
+      52,53,54 : f<0-2>0
+      55,56,57 : f<0-2>1
+      58,59,60 : f<0-2>2
+
+      61,62,63 : tf<0-2>0
+      64,65,66 : tf<0-2>1
+      67,68,69 : tf<0-2>2
+
+      70,71,72,73,74,54 : TF<0-5>0
+      76,77,78,79,80,57 : TF<0-5>1
+      82,83,84,85,86,60 : TF<0-5>2
+
+      61,64,48,49,50,51 : ff<0-5>0
+      52,53,55,56,58,59 : ff<0-5>1
+      61,64,48,49,50,51 : ff<0-5>2
+
+      70,71,72,73,74,54 : F<0-5>0
+      62,63,65,66,67,68 : F<0-5>1
+      52,53,55,56,58,59 : F<0-5>2
+      69,75,81,87,76,77 : F<0-5>3
+      61,64,78,79,80,57 : F<0-5>4
+      82,83,84,85,86,60 : F<0-5>5
+
+      32-39 ~ partialC, idx_K, idx_Y, idx_X
+      40-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, idx_Y2, idx_X2, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, tid31, gx, gy, c, kk, offset, sign, idx_N, nn, x<1-5>, mask_x, super_x, super_y, partC
+
+      32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1
+      48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16
+
+
+     3, 2,11,10,19,18,27,26 : ccx<0-7>y0
+     7, 6,15,14,23,22,31,30 : ccx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2
+     5, 4,13,12,21,20,29,28 : ccx<0-7>y3
+    35,34,43,42,51,50,59,58 : ccx<0-7>y4
+    39,38,47,46,55,54,63,62 : ccx<0-7>y5
+    33,32,41,40,49,48,57,56 : ccx<0-7>y6
+    37,36,45,44,53,52,61,60 : ccx<0-7>y7
+
+      64-79 : jc0Ix<0-7>, jc0Fy<0-7>
+      80-91 : jc1Ix<4-7>, jc1Fy<0-7>
+      64-67 : jc1Ix<0-3>
+
+      64-86 ~ tid16, tid_1, tid128
+         92 = swapBuf
+
+         87 = tid
+      93-95 ~ C, readFs, readIs
+
+      64-85 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, z<1-3>, mask_q
+      86-95 ~ alpha, one, writeCs, readCs, k, preds, offsetO, bias, bsum_offset
+
+      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1
+
+      // t00 80      r00 78
+      // t10 m10     r01 w01
+      // t20 m20     r02 w02
+      // t30 m30     r03 w03
+      // w00 m00     s00 w00
+      // w30 m40     s01 w01
+      // w10 m10     s02 w02
+      // w20 m20     s03 w04
+
+      78 = t0<0-5>, r<0-3>0
+      79 = temp
+
+       3, 2,11,10,19,18 : m<0-5>0
+       1, 9, 0, 8,17,16 : m<0-5>1
+      27,26,25,24,64,65 : m<0-5>2
+                2,11,10 : t10, t20, t30
+                9, 0, 8 : t11, t21, t31
+               26,25,24 : t12, t22, t32
+             3, 2,11,19 : w00, w10, w20, w30
+             1, 9, 0,17 : w01, w11, w21, w31
+            27,26,25,64 : w02, w12, w22, w32
+
+      66,67,68,69,70,71 : m<0-5>3
+      72,73,74,75,76,77 : m<0-5>4
+       8,24,10,65,16,18 : m<0-5>5
+               67,68,69 : t13, t23, t33
+               73,74,75 : t14, t24, t34
+               24,10,65 : t15, t25, t35
+            66,67,68,70 : w03, w13, w23, w33
+            72,73,74,76 : w04, w14, w24, w34
+             8,24,10,16 : w05, w15, w25, w35
+
+                1,27,66 : r01, r02, r03
+                9,26,67 : r11, r12, r13
+                0,25,68 : r21, r22, r23
+               17,64,70 : r31, r32, r33
+             3, 1,27,72 : s00, s01, s02, s03
+             2, 9,26,73 : s10, s11, s12, s13
+            11, 0,25,74 : s20, s21, s22, s23
+            19,17,64,76 : s30, s31, s32, s33
+
+                  80-83 ~ xx<0-3>
+                  78-81 ~ sum<0-3>
+                  82-83 : Sum<0-1>
+                  84-85 : Out<0-1>
+
+             8,10,16,18 ~ b0<0-3>
+            24,65,66,67 ~ b1<0-3>
+            68,69,70,71 ~ b2<0-3>
+            75,77,78,79 ~ b3<0-3>
+
+</REGISTER_MAPPING>
+
+// Kernel entry: fetch the count C and the thread id, store RZ (128 bits) at addr_zero, then send
+// threads with tid >= 128 to COMPUTE_SETUP; all other threads fall through to LOAD_SETUP.
+--:-:-:-:0      MOV C,   param_C;
+--:-:1:-:2      S2R tid, SR_TID.X;
+// P0 = (tid >= 128)
+01:-:-:-:0      ISETP.GE.AND P0, PT, tid, 128, PT;
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+// partialC = C & 1; adding it back rounds C up to an even value.
+--:-:-:Y:c      LOP.AND partialC, C, 1;
+--:-:-:-:0      IADD C, C, partialC;
+--:-:-:-:5  @P0 BRA.U COMPUTE_SETUP;
+
+##############################################################
+LOAD_SETUP:
+// Setup shared by the loader threads (only tid < 128 reaches this label): read the block ids,
+// remap them to (idx_X, idx_Y, idx_K), derive this thread's origin (gx, gy) and its readIs /
+// readFs / writeS offsets, then branch to FILTER_SETUP when tid >= 64 or fall through to
+// IMAGE_SETUP otherwise.
+
+--:-:1:-:1      S2R idx_YXk, SR_CTAID.X;
+--:-:2:-:1      S2R idx_K,   SR_CTAID.Y;
+
+<SCHEDULE_BLOCK>
+
+// Emits eight LDS.U.128 loads from addr_zero into czero00, czero04, ..., czero28.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +]
+
+// P0 = (tid == 0): only that thread stores idx_Y / idx_X / idx_K below.
+// P1 = (tid >= 64): selects FILTER_SETUP in the branch that follows this block.
+--:-:-:-:1      ISETP.EQ.AND P0, PT, tid, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND P1, PT, tid, 64, PT;
+
+// idx_Y2   = idx_YXk / blk_Xk
+// P3 = (magic_Xk != 1): multiply by the magic number (XMAD sequence) before shifting;
+// otherwise idx_YXk is shifted directly.
+--:-:-:-:1      MOV  magic_Xk, param_magic_Xk;
+--:-:-:-:1      IADD negXk, RZ, -param_Xk;
+--:-:-:-:1      ISETP.NE.AND P3, PT, magic_Xk, 1, PT;
+01:-:-:-:1  @P3 XMAD     div1, idx_YXk,    magic_Xk,    RZ;
+--:-:-:-:1  @P3 XMAD     div2, idx_YXk,    magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD     div3, idx_YXk.H1, magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk,    div1;
+--:-:-:-:1  @P3 IADD3.RS idx_Y2, div1, div2, div3;
+--:-:-:-:1  @P3 SHR.U32  idx_Y2, idx_Y2,  param_shift_Xk;
+--:-:-:-:1 @!P3 SHR.U32  idx_Y2, idx_YXk, param_shift_Xk;
+
+// idx_Xk  = idx_YXk % blk_Xk
+// (computed as idx_YXk + negXk * idx_Y2)
+--:-:-:-:1      XMAD.LO2 idx_Xk, negXk, idx_Y2, idx_YXk;
+
+// idx_X2   = idx_Xk / blk_k
+// idx_k   = idx_Xk % blk_k
+--:-:-:-:1      XMAD    idx_X2,  idx_Xk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32 idx_X2,  idx_X2, param_shift_k;
+--:-:-:-:1      XMAD    idx_k,   idx_X2, param_k, RZ;
+--:-:-:-:1      IADD    idx_k,  -idx_k,  idx_Xk;
+
+// idx_K = idx_K * blk_k + idx_k
+02:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+// gx = x2
+// gy = y2 * 2
+--:-:-:-:1      MOV idx_X, idx_X2;
+--:-:-:-:1      SHL idx_Y, idx_Y2, 1;
+
+// Implement a square wave block id remapping (for all but last row (if odd number of rows))
+// if y2 != Y2:
+//     gy += (gx&1) ^ ((gx&2)>>1)
+//     gx /= 2
+--:-:-:-:1      ISETP.NE.AND P4, PT, idx_Y2, param_Y2, PT;
+--:-:-:-:1  @P4 LOP.AND x1, idx_X, 1;
+--:-:-:-:1  @P4 BFE.U32 x2, idx_X, 0x101; // 1 bit at position 1
+--:-:-:-:1  @P4 LOP.XOR x1, x1, x2;
+--:-:-:-:1  @P4 IADD idx_Y, idx_Y, x1;
+--:-:-:-:1  @P4 SHR.U32 idx_X, idx_X, 1;
+
+// Scan backwards on odd rows
+// if y2 & 1:
+//     gx = gridX - gx - 1
+--:-:-:-:1      LOP.AND.NZ P5, RZ, idx_Y2, 1;
+--:-:-:-:1  @P5 IADD idx_X, -idx_X,  param_GX;
+--:-:-:-:1  @P5 IADD idx_X,  idx_X, -1;
+
+// Thread 0 (P0) stores the remapped indices at addr_idx_Y / addr_idx_X / addr_idx_K.
+--:6:-:-:1  @P0 STS [addr_idx_Y], idx_Y;
+--:6:-:-:1  @P0 STS [addr_idx_X], idx_X;
+--:6:-:-:1  @P0 STS [addr_idx_K], idx_K;
+
+// x = gx << shiftX
+// y = gy << shiftY
+--:-:-:-:1      SHL gx, idx_X, param_shiftX;
+--:-:-:-:1      SHL gy, idx_Y, param_shiftY;
+
+// Distribute the 8|4|2|1 blocks of the super block among 4|8|16|32 threads each of the warp
+// (gx = gx + super_x * 4, gy = gy + super_y * 4)
+--:-:-:-:1      BFE.U32 super_x, tid, param_superX;
+--:-:-:-:1      BFE.U32 super_y, tid, param_superY;
+--:-:-:-:1      ISCADD gx, super_x,  gx, 2;
+--:-:-:-:1      ISCADD gy, super_y,  gy, 2;
+
+// tid32_2 = (tid & -32) >> 2, shared by the readIs and readFs computations below.
+--:-:-:-:1      LOP.AND  tid32_2,  tid,   -32;
+--:-:-:-:1      SHR.U32  tid32_2,  tid32_2, 2;
+
+// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7)
+// (then shifted left by 4)
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid32_2;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1)
+// (then shifted left by 4 and offset by 4x<32*36*2>)
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      IADD3   readFs, readFs, tid1, tid32_2;
+--:-:-:-:1      ISCADD  readFs, readFs, 4x<32*36*2>, 4;
+
+// c = (tid & 32) >> 5
+--:-:-:-:1      BFE.U32 c, tid, 0x105; // 1 bit at position 5
+
+// writeS = c*32*36 + tid & 31
+// (1152 = 32*36; the SHL by 2 then scales the index by 4)
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      XMAD writeS, c, 1152, tid31;
+--:-:-:-:1      SHL writeS, writeS, 2;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P1 BRA.U FILTER_SETUP;
+
+##############################################################
+IMAGE_SETUP:
+// Image-loader setup (reached only when neither earlier branch was taken, i.e. tid < 64):
+// zero four words at writeS, compute the global offset and the bounds predicates for a 6x6 patch,
+// issue the first 36 loads into i00..i55, then jump to IMAGE_LOOP.
+
+--:-:1:-:1      S2R idx_N, SR_CTAID.Z;
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      STS [writeS + 4x<32*0>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*1>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*2>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*3>], RZ;
+
+// n = idx_N*32 + tid & maskN
+--:-:-:-:1      LOP.AND nn, tid, param_maskN;
+01:-:-:-:1      ISCADD  nn, idx_N, nn, 5;
+
+// n < N
+--:-:-:-:1      ISETP.LT.AND P6, PT, nn, param_N, PT;
+
+// Subtract off the padding
+--:-:-:-:1      IADD gx, gx, -param_pad_x;
+--:-:-:-:1      IADD gy, gy, -param_pad_y;
+
+// offset = c*YXN + y0*XN + x0*N + n;
+// sign is produced by the ISET (offset < 0) and is added into the high word of the address.
+--:-:-:-:1      XMAD.S16.U16      offset, gx, param_N,   nn;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, gy, param_XN,  offset;
+--:-:-:-:1      XMAD.S16.U16.LO2C offset, c,  param_YXN, offset;
+--:-:-:-:1      ISET.LT.AND sign, offset, RZ, PT;
+
+--:-:-:-:1      LEA    track0.CC, offset, param_I[0], [+ dshift() +];
+--:-:-:-:1      IADD.X track1,    sign,   param_I[1];
+
+// P0..P5 = (0 <= gx + k < param_W) for k = 0..5, packed into the low 6 bits of mask_x.
+--:-:-:-:1      IADD x1, gx, 1;
+--:-:-:-:1      IADD x2, gx, 2;
+--:-:-:-:1      IADD x3, gx, 3;
+--:-:-:-:1      IADD x4, gx, 4;
+--:-:-:-:1      IADD x5, gx, 5;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, gx, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, x4, param_W, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, x5, param_W, PT;
+--:-:-:-:1      ISETP.GE.AND P0, PT, gx, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, x4, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, x5, RZ, P5;
+--:-:-:-:1      P2R mask_x, PR, RZ, 0x3f;
+
+// Same range test for gy + k against param_Y (x1..x5 are reused as temporaries), additionally
+// ANDed with P6 (nn < param_N).
+--:-:-:-:1      IADD x1, gy, 1;
+--:-:-:-:1      IADD x2, gy, 2;
+--:-:-:-:1      IADD x3, gy, 3;
+--:-:-:-:1      IADD x4, gy, 4;
+--:-:-:-:1      IADD x5, gy, 5;
+--:-:-:-:1      ISETP.LT.AND P0, PT, gy, param_Y, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_Y, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_Y, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_Y, P6;
+--:-:-:-:1      ISETP.LT.AND P4, PT, x4, param_Y, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, x5, param_Y, P6;
+--:-:-:-:1      ISETP.GE.AND P0, PT, gy, RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, x1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, x2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, x3, RZ, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, x4, RZ, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, x5, RZ, P5;
+
+// pred30 collects mask_x once per valid row: row 0 through SEL, rows 1-4 through BFI with the
+// immediates 0x606 / 0x60c / 0x612 / 0x618; pred36 receives mask_x for row 5.
+// NOTE(review): the BFI immediates presumably use the same size/position encoding as the BFE
+// immediates commented elsewhere in this file (6 bits at positions 6, 12, 18, 24) - confirm.
+--:-:-:-:1      SEL pred30, mask_x, RZ, P0;
+--:-:-:-:1  @P1 BFI pred30, mask_x, 0x606, pred30;
+--:-:-:-:1  @P2 BFI pred30, mask_x, 0x60c, pred30;
+--:-:-:-:1  @P3 BFI pred30, mask_x, 0x612, pred30;
+--:-:-:-:1  @P4 BFI pred30, mask_x, 0x618, pred30;
+--:-:-:-:1      SEL pred36, mask_x, RZ, P5;
+
+// P6 = c == partialC == 1
+// partialC is then replaced by partialC * param_YXN (XMAD / XMAD.PSL pair) shifted left by dshift().
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, 1, PT;
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, partialC, P6;
+--:-:-:-:1      XMAD     partC,    partialC, param_YXN,       RZ;
+--:-:-:-:1      XMAD.PSL partialC, partialC, param_YXN.H1, partC;
+--:-:-:-:1      SHL  partialC, partialC, [+ dshift() +];
+
+// Row 0: set P0..P5 from the low 6 bits of pred30 (all cleared when P6), move pred30 on by 6 bits,
+// write RZ into the registers whose predicate is off and load the others.
+// NOTE(review): with both data operands equal to pred30 the SHF.R.U64 presumably acts as a
+// 32-bit rotate - confirm against the SHF semantics.
+--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30;
+20:-:-:-:1 @!P0 MOV i00, RZ;
+--:-:-:-:1 @!P1 MOV i01, RZ;
+--:-:-:-:1 @!P2 MOV i02, RZ;
+--:-:-:-:1 @!P3 MOV i03, RZ;
+--:-:-:-:1 @!P4 MOV i04, RZ;
+--:-:-:-:1 @!P5 MOV i05, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i04, [track + [+ dsize() +]x<0*$W*$N + 4*$N>];
+--:-:1:-:1  @P5 LDG.E.CI.[+ dtype() +] i05, [track + [+ dsize() +]x<0*$W*$N + 5*$N>];
+
+// Row 1 (same pattern).
+--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1 @!P0 MOV i10, RZ;
+--:-:-:-:1 @!P1 MOV i11, RZ;
+--:-:-:-:1 @!P2 MOV i12, RZ;
+--:-:-:-:1 @!P3 MOV i13, RZ;
+--:-:-:-:1 @!P4 MOV i14, RZ;
+--:-:-:-:1 @!P5 MOV i15, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i14, [track + [+ dsize() +]x<1*$W*$N + 4*$N>];
+--:-:2:-:1  @P5 LDG.E.CI.[+ dtype() +] i15, [track + [+ dsize() +]x<1*$W*$N + 5*$N>];
+
+// Row 2 (same pattern).
+--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1 @!P0 MOV i20, RZ;
+--:-:-:-:1 @!P1 MOV i21, RZ;
+--:-:-:-:1 @!P2 MOV i22, RZ;
+--:-:-:-:1 @!P3 MOV i23, RZ;
+--:-:-:-:1 @!P4 MOV i24, RZ;
+--:-:-:-:1 @!P5 MOV i25, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() +]x<2*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i24, [track + [+ dsize() +]x<2*$W*$N + 4*$N>];
+--:-:3:-:1  @P5 LDG.E.CI.[+ dtype() +] i25, [track + [+ dsize() +]x<2*$W*$N + 5*$N>];
+
+// Row 3 (same pattern).
+--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P6 SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1 @!P0 MOV i30, RZ;
+--:-:-:-:1 @!P1 MOV i31, RZ;
+--:-:-:-:1 @!P2 MOV i32, RZ;
+--:-:-:-:1 @!P3 MOV i33, RZ;
+--:-:-:-:1 @!P4 MOV i34, RZ;
+--:-:-:-:1 @!P5 MOV i35, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i34, [track + [+ dsize() +]x<3*$W*$N + 4*$N>];
+--:-:4:-:1  @P5 LDG.E.CI.[+ dtype() +] i35, [track + [+ dsize() +]x<3*$W*$N + 5*$N>];
+
+// Row 4: here pred30 is shifted left by 24 instead of right by 6.
+// NOTE(review): presumably this undoes the four 6-bit right shifts so pred30 is back in its
+// initial alignment when IMAGE_LOOP reuses it - confirm.
+--:-:-:-:1 @!P6 R2P PR, pred30, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P6 SHF.L.U64 pred30, pred30, 24, pred30;
+--:-:-:-:1 @!P0 MOV i40, RZ;
+--:-:-:-:1 @!P1 MOV i41, RZ;
+--:-:-:-:1 @!P2 MOV i42, RZ;
+--:-:-:-:1 @!P3 MOV i43, RZ;
+--:-:-:-:1 @!P4 MOV i44, RZ;
+--:-:-:-:1 @!P5 MOV i45, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i40, [track + [+ dsize() +]x<4*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i41, [track + [+ dsize() +]x<4*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i42, [track + [+ dsize() +]x<4*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i43, [track + [+ dsize() +]x<4*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i44, [track + [+ dsize() +]x<4*$W*$N + 4*$N>];
+--:-:5:-:1  @P5 LDG.E.CI.[+ dtype() +] i45, [track + [+ dsize() +]x<4*$W*$N + 5*$N>];
+
+// Row 5: predicates come from pred36 rather than pred30.
+--:-:-:-:1 @!P6 R2P PR, pred36, 0x3f;
+--:-:-:-:1  @P6 R2P PR,     RZ, 0x3f;
+--:-:-:-:1 @!P0 MOV i50, RZ;
+--:-:-:-:1 @!P1 MOV i51, RZ;
+--:-:-:-:1 @!P2 MOV i52, RZ;
+--:-:-:-:1 @!P3 MOV i53, RZ;
+--:-:-:-:1 @!P4 MOV i54, RZ;
+--:-:-:-:1 @!P5 MOV i55, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i50, [track + [+ dsize() +]x<5*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i51, [track + [+ dsize() +]x<5*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i52, [track + [+ dsize() +]x<5*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i53, [track + [+ dsize() +]x<5*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i54, [track + [+ dsize() +]x<5*$W*$N + 4*$N>];
+--:-:6:-:1  @P5 LDG.E.CI.[+ dtype() +] i55, [track + [+ dsize() +]x<5*$W*$N + 5*$N>];
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+// After the barrier: step the 64-bit track pointer back by partialC (borrow propagated through
+// IADD.X) and move writeS by 4x<32*36*2*2>, then enter the image loop.
+3f:-:-:-:5      IADD   track0.CC, track0, -partialC;
+--:-:-:-:1      IADD   writeS,    writeS, 4x<32*36*2*2>;
+--:-:-:-:0      IADD.X track1,    track1, -RZ;
+
+--:-:-:-:5      BRA.U IMAGE_LOOP;
+
+##############################################################
+FILTER_SETUP:
+// Filter-loader setup (reached from LOAD_SETUP when tid >= 64): offset writeS, compute the
+// filter address for this thread's k, zero four words at writeS, issue the first nine loads
+// into f00..f22, then jump to FILTER_LOOP.
+
+<SCHEDULE_BLOCK>
+// writeS += 32*36*2*4
+--:-:-:-:1      IADD writeS, writeS, 4x<32*36*2>;
+
+--:-:-:-:1      MOV swapBuf, 4x<32*36*2*2>;
+
+// P6 = c == partialC == 1
+// partialC is then replaced by partialC * param_RSK (XMAD / XMAD.PSL pair) shifted left by dshift().
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, 1, PT;
+--:-:-:-:1      ISETP.EQ.AND P6, PT, c, partialC, P6;
+--:-:-:-:1      XMAD     partC,    partialC, param_RSK, RZ;
+--:-:-:-:1      XMAD.PSL partialC, partialC, param_RSK.H1, partC;
+--:-:-:-:1      SHL  partialC, partialC, [+ dshift() +];
+
+// k = idx_K*32 + tid & 31
+// P6 becomes (kk < param_K) AND NOT the previous P6; P2 = (kk < param_K).
+--:-:-:-:1      ISCADD  kk, idx_K, tid31,  5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, kk, param_K, !P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, kk, param_K, PT;
+
+// a0 = c*RSK + k
+--:-:-:-:1      XMAD.LO2C offset, c, param_RSK, kk;
+--:-:-:-:1      LEA      track0.CC, offset, param_F[0],     [+ dshift() +];
+--:-:-:-:1      LEA.HI.X track1,    offset, param_F[1], RZ, [+ dshift() +];
+
+--:-:-:-:1      STS [writeS + 4x<32*0>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*1>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*2>], RZ;
+--:-:-:-:1      STS [writeS + 4x<32*3>], RZ;
+
+// Three groups of three values: write RZ when P6 is off, otherwise load from track.
+20:-:-:-:1 @!P6 MOV f00, RZ;
+--:-:-:-:1 @!P6 MOV f01, RZ;
+--:-:-:-:1 @!P6 MOV f02, RZ;
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f00, [track + [+ dsize() +]x<0*3*$K + 0*$K>];
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f01, [track + [+ dsize() +]x<0*3*$K + 1*$K>];
+--:-:1:-:1  @P6 LDG.E.CI.[+ dtype() +] f02, [track + [+ dsize() +]x<0*3*$K + 2*$K>];
+
+--:-:-:-:1 @!P6 MOV f10, RZ;
+--:-:-:-:1 @!P6 MOV f11, RZ;
+--:-:-:-:1 @!P6 MOV f12, RZ;
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f10, [track + [+ dsize() +]x<1*3*$K + 0*$K>];
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f11, [track + [+ dsize() +]x<1*3*$K + 1*$K>];
+--:-:2:-:1  @P6 LDG.E.CI.[+ dtype() +] f12, [track + [+ dsize() +]x<1*3*$K + 2*$K>];
+
+--:-:-:-:1 @!P6 MOV f20, RZ;
+--:-:-:-:1 @!P6 MOV f21, RZ;
+--:-:-:-:1 @!P6 MOV f22, RZ;
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f20, [track + [+ dsize() +]x<2*3*$K + 0*$K>];
+--:-:-:-:1  @P6 LDG.E.CI.[+ dtype() +] f21, [track + [+ dsize() +]x<2*3*$K + 1*$K>];
+--:5:3:-:1  @P6 LDG.E.CI.[+ dtype() +] f22, [track + [+ dsize() +]x<2*3*$K + 2*$K>];
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+// After the barrier: step track back by partialC, advance writeS by swapBuf and negate swapBuf,
+// then enter the filter loop.
+10:-:-:-:4      IADD   track0.CC, track0, -partialC;
+--:-:-:-:1      IADD   writeS, writeS, swapBuf;
+--:-:-:-:1      IADD   swapBuf, RZ, -swapBuf;
+--:-:-:-:0      IADD.X track1,    track1, -RZ;
+
+--:-:-:-:5      BRA.U FILTER_LOOP;
+
+##############################################################
+
+COMPUTE_SETUP:
+// Setup for the compute threads (tid >= 128 branch here from the kernel entry): initialise
+// swapBuf, load the czero registers from addr_zero, derive readIs / readFs from tid - 128,
+// pass two barriers and issue the first four operand loads before falling into COMPUTE_LOOP.
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV swapBuf, 4x<32*36*2*2>;
+
+// Emits sixteen LDS.U.128 loads from addr_zero into czero00, czero04, ..., czero60.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+--:-:-:-:1      IADD tid128, tid, -128;
+
+// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readFs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
+// (tid here is tid128 = tid - 128; both results are then shifted left by 4 and offset by the
+// constants in the ISCADD instructions)
+--:-:-:-:1      LOP.AND  tid16,  tid128, -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,   1;
+
+--:-:-:-:1      BFE.U32  readIs, tid128, 0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs, readIs, tid16;
+--:-:-:-:1      ISCADD   readIs, readIs, 4x<32*4 + 32*36*2*2>, 4;
+
+--:-:-:-:1      LOP.AND  tid_1,  tid128, 1;
+--:-:-:-:1      LOP.AND  readFs, tid128, 8;
+--:-:-:-:1      SHR.U32  readFs, readFs, 2;
+--:-:-:-:1      IADD3    readFs, readFs, tid16, tid_1;
+--:-:-:-:0      ISCADD   readFs, readFs, 4x<32*4 + 32*36*2*3>, 4;
+</SCHEDULE_BLOCK>
+
+// NOTE(review): this barrier presumably pairs with the BAR.SYNC that ends IMAGE_SETUP /
+// FILTER_SETUP - confirm.
+--:-:-:-:5      BAR.SYNC 0;
+
+// Let Load loop run once to transform initial load and store to shared.
+--:-:-:-:5      BAR.SYNC 0;
+
+// First operand loads for the jc0 register set (x0-3 / y0-3 at offset 00, x4-7 / y4-7 at offset 16).
+--:-:-:-:1      LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>];
+--:-:1:-:2      LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>];
+
+COMPUTE_LOOP:
+[+
+    # Generates the body of COMPUTE_LOOP: two passes (j = 0, 1) of 64 FFMAs, each accumulating
+    # jc<j>Ix<x> * jc<j>Fy<y> into ccx<x>y<y>.  The operand loads for the other register set
+    # (jc<1-j>), the loop counter update, the barrier with the buffer swap and the loop branch
+    # are spliced in after specific FFMAs.
+
+    # Text appended after FFMA number <c> of pass <j>, keyed as "j<j>c<c>".
+    my %after = (
+        j0c33 => join('',
+            "--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;\n",
+            "--:-:-:-:1      IADD C, C, -2;\n",
+        ),
+        j0c62 => join('',
+            "02:-:-:Y:5      BAR.SYNC 0;\n",
+            "--:-:-:-:1      IADD readFs, readFs, -swapBuf;\n",
+            "--:-:-:-:1      IADD readIs, readIs, -swapBuf;\n",
+            "--:-:-:-:1      IADD swapBuf, RZ,    -swapBuf;\n",
+        ),
+        j1c63 => join('',
+            "--:-:-:Y:5  \@P0 BRA.U COMPUTE_LOOP;\n",
+            "--:-:-:Y:5      BRA.U COMPUTE_FINISH;\n",
+        ),
+    );
+
+    # Accumulator visiting order: groups of four (x, y) offsets, walked over columns 0,2,4,6
+    # and rows 0,1,4,5, with the row direction reversed after every column.
+    my @group = ([0,2],[1,2],[1,0],[0,0]);
+    my @rows  = (0,1,4,5);
+    my @order;
+    for my $col (0,2,4,6)
+    {
+        for my $row (@rows)
+        {
+            push @order, map { [$col + $_->[0], $row + $_->[1]] } @group;
+        }
+        @rows = reverse @rows;
+    }
+
+    my $asm = '';
+    for my $j (0, 1)
+    {
+        my $other = 1 - $j;
+
+        # Pass 1 predicates its loads on P0; only pass 0 puts '2' in the second control field
+        # of the load spliced after FFMA 31.
+        my ($guard, $field2) = $j == 1 ? ('@P0', '-') : ('   ', '2');
+
+        # Each entry: FFMA index, 2nd and 3rd control fields, destination suffix, base, offset.
+        my @loads = (
+            [ 0, '-',     '-', 'Fy4', 'readFs', '16'],
+            [ 2, '-',     '-', 'Ix4', 'readIs', '16'],
+            [ 4, '-',     '-', 'Fy0', 'readFs', '00'],
+            [31, $field2, '1', 'Ix0', 'readIs', '00'],
+        );
+        for my $ld (@loads)
+        {
+            my ($at, $f2, $f3, $dst, $base, $off) = @$ld;
+            $after{"j${j}c$at"} = sprintf "--:%s:%s:-:1  %s LDS.U.128 jc%d%s, [%s + 4x<%d*32*36 + %s>];\n",
+                $f2, $f3, $guard, $other, $dst, $base, $other, $off;
+        }
+
+        for my $c (0 .. 63)
+        {
+            my ($x, $y) = @{ $order[$c] };
+            my $tail    = $after{"j${j}c$c"} || '';
+
+            # Stall count is 0 when the first spliced line is one of the listed ops, else 1.
+            my $stall = $tail =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            # Only the first FFMA of each pass carries the 01 wait mask.
+            my $wait = $c ? '--' : '01';
+
+            $asm .= sprintf("%s:-:-:-:%s      FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n",
+                $wait, $stall,  $x, $y,  $j, $x,  $j, $y,  $x, $y) . $tail;
+        }
+    }
+    return $asm;
++]
+
+IMAGE_LOOP:
+// Head of the image-loader loop.  When $convert_in is set, the 36 loaded values i00..i55 are
+// converted in place with F2F.F32.F16, six per group, each group led by one of the wait masks
+// 01, 02, 04, 08, 10, 20; otherwise nothing is emitted here.
+
+[+
+    our $convert_in; return $convert_in ? q{
+01:-:-:-:1      F2F.F32.F16 i00, i00;
+--:-:-:-:1      F2F.F32.F16 i01, i01;
+--:-:-:-:1      F2F.F32.F16 i02, i02;
+--:-:-:-:1      F2F.F32.F16 i03, i03;
+--:-:-:-:1      F2F.F32.F16 i04, i04;
+--:-:1:-:1      F2F.F32.F16 i05, i05;
+
+02:-:-:-:1      F2F.F32.F16 i10, i10;
+--:-:-:-:1      F2F.F32.F16 i11, i11;
+--:-:-:-:1      F2F.F32.F16 i12, i12;
+--:-:-:-:1      F2F.F32.F16 i13, i13;
+--:-:-:-:1      F2F.F32.F16 i14, i14;
+--:-:2:-:1      F2F.F32.F16 i15, i15;
+
+04:-:-:-:1      F2F.F32.F16 i20, i20;
+--:-:-:-:1      F2F.F32.F16 i21, i21;
+--:-:-:-:1      F2F.F32.F16 i22, i22;
+--:-:-:-:1      F2F.F32.F16 i23, i23;
+--:-:-:-:1      F2F.F32.F16 i24, i24;
+--:-:3:-:1      F2F.F32.F16 i25, i25;
+
+08:-:-:-:1      F2F.F32.F16 i30, i30;
+--:-:-:-:1      F2F.F32.F16 i31, i31;
+--:-:-:-:1      F2F.F32.F16 i32, i32;
+--:-:-:-:1      F2F.F32.F16 i33, i33;
+--:-:-:-:1      F2F.F32.F16 i34, i34;
+--:-:4:-:1      F2F.F32.F16 i35, i35;
+
+10:-:-:-:1      F2F.F32.F16 i40, i40;
+--:-:-:-:1      F2F.F32.F16 i41, i41;
+--:-:-:-:1      F2F.F32.F16 i42, i42;
+--:-:-:-:1      F2F.F32.F16 i43, i43;
+--:-:-:-:1      F2F.F32.F16 i44, i44;
+--:-:5:-:1      F2F.F32.F16 i45, i45;
+
+20:-:-:-:1      F2F.F32.F16 i50, i50;
+--:-:-:-:1      F2F.F32.F16 i51, i51;
+--:-:-:-:1      F2F.F32.F16 i52, i52;
+--:-:-:-:1      F2F.F32.F16 i53, i53;
+--:-:-:-:1      F2F.F32.F16 i54, i54;
+--:-:6:-:2      F2F.F32.F16 i55, i55;
+        } : '';
++]
+<SCHEDULE_BLOCK>
+// First half of the loop body: transform the loaded 6x6 values in two passes, store the
+// results I00..I42 at writeS, and start reloading rows 0 and 1 for the next iteration.
+[+
+    # First pass: for each second index $i, combine i0$i..i5$i into TI0$i..TI5$i.
+    # Only the very first FFMA carries the 3f wait mask.
+    my $out;
+    foreach my $i (0 .. 5)
+    {
+        my $w = $i == 0 ? '3f' : '--';
+        $out .= qq{
+$w:-:-:-:1      FFMA ti4,   i2$i, -2.640625,   i4$i;
+--:-:-:-:1      FFMA ti5,   i3$i, -2.640625,   i5$i;
+--:-:-:-:1      FFMA ti0,   i2$i, -2.25,       i4$i;
+--:-:-:-:1      FFMA ti1,   i1$i, -2.25,       i3$i;
+--:-:-:-:1      FFMA ti2,   i2$i, -0.390625,   i4$i;
+--:-:-:-:1      FFMA ti3,   i1$i, -0.390625,   i3$i;
+--:-:-:-:1      FFMA TI0$i, i0$i,  0.87890625, ti4;
+--:-:-:-:1      FFMA TI5$i, i1$i,  0.87890625, ti5;
+--:-:-:-:1      FFMA TI1$i, ti1,   0.625,      ti0;
+--:-:-:-:1      FFMA TI2$i, ti1,  -0.625,      ti0;
+--:-:-:-:1      FFMA TI3$i, ti3,   1.5,        ti2;
+--:-:-:-:1      FFMA TI4$i, ti3,  -1.5,        ti2;
+        };
+    }
+    return $out;
++]
+
+// Operand loads for the jl0 register set used by the clx FFMAs later in this loop.
+--:-:-:-:1      LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];
+--:-:-:-:1      LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];
+
+// P0 = (C > 2); when it is false pred30 is cleared and RZ is inserted into pred36 with the
+// 0x600 immediate, so the reloads below are predicated off.
+// NOTE(review): 0x600 presumably means 6 bits at position 0, matching the BFE immediates
+// commented elsewhere in this file - confirm.
+--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;
+
+// Advance the 64-bit track pointer by param_YXN2p.
+--:-:-:-:1      IADD   track0.CC, track0, param_YXN2p;
+--:-:-:-:1      IADD.X track1,    track1, RZ;
+
+//--:-:-:-:1      LOP32I.AND pred30, pred30, 0xffffff;
+--:-:-:-:1 @!P0 BFI pred36, RZ, 0x600, pred36;
+--:-:-:-:1 @!P0 MOV pred30, RZ;
+
+// Set P0..P5 for row 0 from the low 6 bits of pred30, then move pred30 on by 6 bits.
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+
+[+
+    # Second pass: the same coefficients applied along the second index, combining
+    # TI${i}0..TI${i}5 into I${i}0..I${i}5 for each first index $i.
+    my $out;
+    foreach my $i (0 .. 5)
+    {
+        $out .= qq{
+--:-:-:-:1      FFMA ti4,    TI${i}2, -2.640625,   TI${i}4;
+--:-:-:-:1      FFMA ti5,    TI${i}3, -2.640625,   TI${i}5;
+--:-:-:-:1      FFMA ti0,    TI${i}2, -2.25,       TI${i}4;
+--:-:-:-:1      FFMA ti1,    TI${i}1, -2.25,       TI${i}3;
+--:-:-:-:1      FFMA ti2,    TI${i}2, -0.390625,   TI${i}4;
+--:-:-:-:1      FFMA ti3,    TI${i}1, -0.390625,   TI${i}3;
+--:-:-:-:1      FFMA I${i}0, TI${i}0,  0.87890625, ti4;
+--:-:-:-:1      FFMA I${i}5, TI${i}1,  0.87890625, ti5;
+--:-:-:-:1      FFMA I${i}1, ti1,      0.625,      ti0;
+--:-:-:-:1      FFMA I${i}2, ti1,     -0.625,      ti0;
+--:-:-:-:1      FFMA I${i}3, ti3,      1.5,        ti2;
+--:-:-:-:1      FFMA I${i}4, ti3,     -1.5,        ti2;
+        };
+    }
+    return $out;
++]
+<ORDERED>
+// Store element I<r><c> at writeS + 4x<32*(r*6 + c)>.  The last store of each row group sets the
+// second control field (1, 2, 3, 4) and the reloads of rows 0 / 1 below lead with wait masks 01 / 02.
+// NOTE(review): the i registers are reloaded only after these stores - check the register map for
+// aliasing with the I registers before reordering anything here.
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 3)>], I03;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 4)>], I04;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 0)>], I00;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 5)>], I05;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 1)>], I01;
+--:1:-:-:1      STS [writeS + 4x<32*(0*6 + 2)>], I02;
+
+
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 0)>], I10;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 5)>], I15;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 3)>], I13;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 1)>], I11;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 2)>], I12;
+--:2:-:-:1      STS [writeS + 4x<32*(1*6 + 4)>], I14;
+
+// Reload row 0 for the next iteration; registers whose predicate is off receive RZ through I2I.
+01:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i00, [track + [+ dsize() +]x<0*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i01, [track + [+ dsize() +]x<0*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i02, [track + [+ dsize() +]x<0*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i03, [track + [+ dsize() +]x<0*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i04, [track + [+ dsize() +]x<0*$W*$N + 4*$N>];
+--:-:1:-:1  @P5 LDG.E.CI.[+ dtype() +] i05, [track + [+ dsize() +]x<0*$W*$N + 5*$N>];
+--:-:-:-:1 @!P0 I2I.U32.U32 i00, RZ;
+--:-:-:-:1 @!P1 I2I.U32.U32 i01, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i02, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i03, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i04, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i05, RZ;
+// Predicates for row 1.
+--:-:-:-:1      R2P PR, pred30, 0x3f;
+
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 0)>], I20;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 5)>], I25;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 1)>], I21;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 2)>], I22;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 3)>], I23;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:3:-:-:1      STS [writeS + 4x<32*(2*6 + 4)>], I24;
+
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 0)>], I30;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 5)>], I35;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 1)>], I31;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 2)>], I32;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 3)>], I33;
+--:4:-:-:1      STS [writeS + 4x<32*(3*6 + 4)>], I34;
+
+// Reload row 1.
+02:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i10, [track + [+ dsize() +]x<1*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i11, [track + [+ dsize() +]x<1*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i12, [track + [+ dsize() +]x<1*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i13, [track + [+ dsize() +]x<1*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i14, [track + [+ dsize() +]x<1*$W*$N + 4*$N>];
+--:-:2:-:1  @P5 LDG.E.CI.[+ dtype() +] i15, [track + [+ dsize() +]x<1*$W*$N + 5*$N>];
+--:-:-:-:1 @!P0 I2I.U32.U32 i10, RZ;
+--:-:-:-:1 @!P1 I2I.U32.U32 i11, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i12, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i13, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i14, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i15, RZ;
+
+// Predicates for row 2 (consumed in the next SCHEDULE_BLOCK).
+--:-:-:-:5      R2P PR, pred30, 0x3f; // FORCE
+
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 0)>], I40;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 5)>], I45;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 1)>], I41;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 2)>], I42;
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+// Second half of the loop body: finish the stores (I43..I55), reload rows 2-5 for the next
+// iteration, run the generated clx FFMA block, and prepare P0 / P1 / swapBuff for the loop tail.
+<ORDERED>
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 3)>], I43;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 4)>], I44;
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+
+// Operand loads for the jl1 register set.
+--:-:-:-:1      LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];
+--:-:-:-:1      LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];
+
+// Reload row 2; registers whose predicate is off receive RZ through I2I.
+04:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i20, [track + [+ dsize() +]x<2*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i21, [track + [+ dsize() +]x<2*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i22, [track + [+ dsize() +]x<2*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i23, [track + [+ dsize() +]x<2*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i24, [track + [+ dsize() +]x<2*$W*$N + 4*$N>];
+--:-:3:-:1  @P5 LDG.E.CI.[+ dtype() +] i25, [track + [+ dsize() +]x<2*$W*$N + 5*$N>];
+--:-:-:-:1 @!P0 I2I.U32.U32 i20, RZ;
+--:-:-:-:1 @!P1 I2I.U32.U32 i21, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i22, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i23, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i24, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i25, RZ;
+// Predicates for row 3.
+--:-:-:-:6      R2P PR, pred30, 0x3f; // FORCE
+
+--:-:-:-:1      SHF.R.U64 pred30, pred30, 6, pred30;
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 0)>], I50;
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 5)>], I55;
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 1)>], I51;
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 2)>], I52;
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 3)>], I53;
+--:6:-:-:1      STS [writeS + 4x<32*(5*6 + 4)>], I54;
+
+// Reload row 3.
+08:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i30, [track + [+ dsize() +]x<3*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i31, [track + [+ dsize() +]x<3*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i32, [track + [+ dsize() +]x<3*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i33, [track + [+ dsize() +]x<3*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i34, [track + [+ dsize() +]x<3*$W*$N + 4*$N>];
+--:-:4:-:1  @P5 LDG.E.CI.[+ dtype() +] i35, [track + [+ dsize() +]x<3*$W*$N + 5*$N>];
+--:-:-:-:1 @!P0 I2I.U32.U32 i30, RZ;
+--:-:-:-:1 @!P1 I2I.U32.U32 i31, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i32, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i33, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i34, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i35, RZ;
+// Predicates for row 4.
+--:-:-:-:c      R2P PR, pred30, 0x3f; // FORCE
+
+// Reload row 4, then shift pred30 left by 24.
+// NOTE(review): presumably this undoes the four 6-bit right shifts so pred30 starts the next
+// iteration in its initial alignment - confirm.
+--:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i40, [track + [+ dsize() +]x<4*$W*$N + 0*$N>];
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i41, [track + [+ dsize() +]x<4*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i42, [track + [+ dsize() +]x<4*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i43, [track + [+ dsize() +]x<4*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i44, [track + [+ dsize() +]x<4*$W*$N + 4*$N>];
+--:-:5:-:1  @P5 LDG.E.CI.[+ dtype() +] i45, [track + [+ dsize() +]x<4*$W*$N + 5*$N>];
+--:-:-:-:1      SHF.L.U64 pred30, pred30, 24, pred30;
+
+--:-:-:-:1 @!P0 I2I.U32.U32 i40, RZ;
+--:-:-:-:1 @!P1 I2I.U32.U32 i41, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i42, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i43, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i44, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i45, RZ;
+// Predicates for row 5 come from pred36.
+--:-:-:-:a      R2P PR, pred36, 0x3f; // FORCE
+
+// Reload row 5.
+20:-:-:-:1  @P0 LDG.E.CI.[+ dtype() +] i50, [track + [+ dsize() +]x<5*$W*$N + 0*$N>];
+--:-:-:-:1 @!P0 I2I.U32.U32 i50, RZ;
+--:-:-:-:1  @P1 LDG.E.CI.[+ dtype() +] i51, [track + [+ dsize() +]x<5*$W*$N + 1*$N>];
+--:-:-:-:1  @P2 LDG.E.CI.[+ dtype() +] i52, [track + [+ dsize() +]x<5*$W*$N + 2*$N>];
+--:-:-:-:1  @P3 LDG.E.CI.[+ dtype() +] i53, [track + [+ dsize() +]x<5*$W*$N + 3*$N>];
+--:-:-:-:1  @P4 LDG.E.CI.[+ dtype() +] i54, [track + [+ dsize() +]x<5*$W*$N + 4*$N>];
+--:-:6:-:1  @P5 LDG.E.CI.[+ dtype() +] i55, [track + [+ dsize() +]x<5*$W*$N + 5*$N>];
+--:-:-:-:1 @!P1 I2I.U32.U32 i51, RZ;
+--:-:-:-:1 @!P2 I2I.U32.U32 i52, RZ;
+--:-:-:-:1 @!P3 I2I.U32.U32 i53, RZ;
+--:-:-:-:1 @!P4 I2I.U32.U32 i54, RZ;
+--:-:-:-:1 @!P5 I2I.U32.U32 i55, RZ;
+</ORDERED>
+
+<ORDERED>
+[+
+    # Emits two passes (j = 0, 1) of 32 FFMAs accumulating jl<j>Ix<x> * jl<j>Fy<y> into
+    # clx<x>y<y>, visiting (x, y) in groups of four offsets around the listed base points.
+    # The jl1Fy0 load is spliced in after FFMA 15 of pass 0, and the first FFMA of pass 1
+    # carries the 10 wait mask.
+    our ($vsize, $dsize, $convert_in);
+    my %insert = (
+        j0c15 => "--:-:5:-:1      LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n",
+    );
+
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
+    {
+        my ($x, $y) = @$xy;
+        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $wait   = $c == 0 && $j == 1 ? "10" : '--';
+
+            my $ctrl   = "$wait:-:-:-:1";
+
+            $out .= sprintf "%s      FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+</ORDERED>
+// P0 = ((pred36 & 0x100) == 0); the XOR then toggles that bit, so P0 alternates between
+// iterations and selects the sign of swapBuff.
+--:-:-:-:1      LOP.AND.Z P0, RZ, pred36, 0x100;
+--:-:-:-:1      LOP.XOR pred36, pred36, 0x100;
+
+// P1 = (C > 0): loop condition tested by the branch after this block.
+--:-:-:-:1      ISETP.GT.AND P1, PT, C, 0, PT;
+
+--:-:-:-:1  @P0 MOV32I swapBuff,  4x<32*36*2*2>;
+
+</SCHEDULE_BLOCK>
+// Loop tail: finish selecting swapBuff (negative value when P0 is off), decrement C by 2,
+// pass the barrier, then move readFs / readIs by +swapBuff and writeS by -swapBuff.
+// Loop again while P1 (computed from C before the decrement) holds, else go to LOAD_FINISH.
+--:-:-:-:1 @!P0 MOV32I swapBuff, -4x<32*36*2*2>;
+--:-:-:-:0      IADD C, C, -2;
+--:-:-:Y:5      BAR.SYNC 0;
+--:-:-:-:1      IADD readFs, readFs,  swapBuff;
+--:-:-:-:1      IADD readIs, readIs,  swapBuff;
+--:-:-:-:1      IADD writeS, writeS, -swapBuff;
+--:-:-:Y:5  @P1 BRA.U IMAGE_LOOP;
+--:-:-:Y:5      BRA.U LOAD_FINISH;
+
+FILTER_LOOP:
+// Loop top: when convert_in is set, the nine loaded values f00..f22 are converted in place from F16 to F32; otherwise nothing is emitted here.
+[+
+    our $convert_in; return '' unless $convert_in; return q{
+01:-:-:-:1      F2F.F32.F16 f00, f00;
+--:-:-:-:1      F2F.F32.F16 f01, f01;
+--:-:1:-:1      F2F.F32.F16 f02, f02;
+
+02:-:-:-:1      F2F.F32.F16 f10, f10;
+--:-:-:-:1      F2F.F32.F16 f11, f11;
+--:-:2:-:1      F2F.F32.F16 f12, f12;
+
+04:-:-:-:1      F2F.F32.F16 f20, f20;
+--:-:-:-:1      F2F.F32.F16 f21, f21;
+--:-:3:-:1      F2F.F32.F16 f22, f22;
+        };
++]
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV32I rcp6,  0.688403361344538; // Filter transform: expands the 3x3 values f00..f22 into the 6x6 values F00..F55 in two passes
+--:-:-:-:1      MOV32I rcp8,  0.430252100840336;
+--:-:-:-:1      MOV32I rcp24, 0.119514472455649;
+--:-:-:-:1      MOV32I rcp12, 0.179271708683473;
+07:-:-:-:1      FMUL32I tf00, f20, 0.26890756302521; // wait mask 07 (presumably barriers 1-3); tf0c = f2c * 0.26890756302521 for c = 0..2
+--:-:-:-:1      FMUL32I tf01, f21, 0.26890756302521;
+--:-:-:-:1      FMUL32I tf02, f22, 0.26890756302521;
+--:-:-:-:1      FFMA tf10, f00, -rcp6, -tf00; // tf1c = -(f0c * rcp6) - tf0c
+--:-:-:-:1      FFMA tf20, f00,  rcp24, tf00; // tf2c = f0c * rcp24 + tf0c
+--:-:-:-:1      FFMA tf11, f01, -rcp6, -tf01;
+--:-:-:-:1      FFMA tf21, f01,  rcp24, tf01;
+--:-:-:-:1      FFMA tf12, f02, -rcp6, -tf02;
+--:-:-:-:1      FFMA tf22, f02,  rcp24, tf02;
+// First pass: each column c yields TF0c..TF4c below; TF5c has no instruction (its MOV is commented out), so it presumably shares a register with f2c - verify in the register map.
+--:-:-:-:1      FMUL32I TF00, f00,  1.13777777777778;
+--:-:-:-:1      FFMA TF10, f10, -rcp8,  tf10;
+--:-:-:-:1      FFMA TF20, f10,  rcp8,  tf10;
+--:-:-:-:1      FFMA TF30, f10,  rcp12, tf20;
+--:-:-:-:1      FFMA TF40, f10, -rcp12, tf20;
+//--:-:-:-:1      MOV  TF50, f20;
+
+--:-:-:-:1      FMUL32I TF02, f02,  1.13777777777778;
+--:-:-:-:1      FFMA TF12, f12, -rcp8,  tf12;
+--:-:-:-:1      FFMA TF22, f12,  rcp8,  tf12;
+--:-:-:-:1      FFMA TF32, f12,  rcp12, tf22;
+--:-:-:-:1      FFMA TF42, f12, -rcp12, tf22;
+//--:-:-:-:1      MOV  TF52, f22;
+
+--:-:-:-:1      FMUL32I TF01, f01,  1.13777777777778;
+--:-:-:-:1      FFMA TF11, f11, -rcp8,  tf11;
+--:-:-:-:1      FFMA TF21, f11,  rcp8,  tf11;
+--:-:-:-:1      FFMA TF31, f11,  rcp12, tf21;
+--:-:-:-:1      FFMA TF41, f11, -rcp12, tf21;
+//--:-:-:-:1      MOV  TF51, f21;
+// Second pass: the same arithmetic applied along each row r of TF (TFr0..TFr2), giving Fr0..Fr4; Fr5 likewise has only a commented-out MOV from TFr2.
+--:-:-:-:1      FMUL32I ff00, TF02, 0.26890756302521;
+--:-:-:-:1      FMUL32I ff10, TF12, 0.26890756302521;
+--:-:-:-:1      FMUL32I ff20, TF22, 0.26890756302521;
+--:-:-:-:1      FMUL32I ff30, TF32, 0.26890756302521;
+--:-:-:-:1      FMUL32I ff40, TF42, 0.26890756302521;
+--:-:-:-:1      FMUL32I ff50, TF52, 0.26890756302521;
+--:-:-:-:1      FFMA ff01, TF00, -rcp6, -ff00;
+--:-:-:-:1      FFMA ff02, TF00,  rcp24, ff00;
+--:-:-:-:1      FFMA ff11, TF10, -rcp6, -ff10;
+--:-:-:-:1      FFMA ff12, TF10,  rcp24, ff10;
+--:-:-:-:1      FFMA ff21, TF20, -rcp6, -ff20;
+--:-:-:-:1      FFMA ff22, TF20,  rcp24, ff20;
+--:-:-:-:1      FFMA ff31, TF30, -rcp6, -ff30;
+--:-:-:-:1      FFMA ff32, TF30,  rcp24, ff30;
+--:-:-:-:1      FFMA ff41, TF40, -rcp6, -ff40;
+--:-:-:-:1      FFMA ff42, TF40,  rcp24, ff40;
+--:-:-:-:1      FFMA ff51, TF50, -rcp6, -ff50;
+--:-:-:-:1      FFMA ff52, TF50,  rcp24, ff50;
+
+--:-:-:-:1      FMUL32I F00, TF00,  1.13777777777778;
+--:-:-:-:1      FFMA F01, TF01, -rcp8,  ff01;
+--:-:-:-:1      FFMA F02, TF01,  rcp8,  ff01;
+--:-:-:-:1      FFMA F03, TF01,  rcp12, ff02;
+--:-:-:-:1      FFMA F04, TF01, -rcp12, ff02;
+//--:-:-:-:1      MOV  F05, TF02;
+
+--:-:-:-:1      FMUL32I F10, TF10,  1.13777777777778;
+--:-:-:-:1      FFMA F11, TF11, -rcp8,  ff11;
+--:-:-:-:1      FFMA F12, TF11,  rcp8,  ff11;
+--:-:-:-:1      FFMA F13, TF11,  rcp12, ff12;
+--:-:-:-:1      FFMA F14, TF11, -rcp12, ff12;
+//--:-:-:-:1      MOV  F15, TF12;
+
+--:-:-:-:1      FMUL32I F20, TF20,  1.13777777777778;
+--:-:-:-:1      FFMA F21, TF21, -rcp8,  ff21;
+--:-:-:-:1      FFMA F22, TF21,  rcp8,  ff21;
+--:-:-:-:1      FFMA F23, TF21,  rcp12, ff22;
+--:-:-:-:1      FFMA F24, TF21, -rcp12, ff22;
+//--:-:-:-:1      MOV  F25, TF22;
+
+--:-:-:-:1      FMUL32I F30, TF30,  1.13777777777778;
+--:-:-:-:1      FFMA F31, TF31, -rcp8,  ff31;
+--:-:-:-:1      FFMA F32, TF31,  rcp8,  ff31;
+--:-:-:-:1      FFMA F33, TF31,  rcp12, ff32;
+--:-:-:-:1      FFMA F34, TF31, -rcp12, ff32;
+//--:-:-:-:1      MOV  F35, TF32;
+
+--:-:-:-:1      FMUL32I F40, TF40,  1.13777777777778;
+--:-:-:-:1      FFMA F41, TF41, -rcp8,  ff41;
+--:-:-:-:1      FFMA F42, TF41,  rcp8,  ff41;
+--:-:-:-:1      FFMA F43, TF41,  rcp12, ff42;
+--:-:-:-:1      FFMA F44, TF41, -rcp12, ff42;
+//--:-:-:-:1      MOV  F45, TF42;
+
+--:-:-:-:1      FMUL32I F50, TF50,  1.13777777777778;
+--:-:-:-:1      FFMA F51, TF51, -rcp8,  ff51;
+--:-:-:-:1      FFMA F52, TF51,  rcp8,  ff51;
+--:-:-:-:1      FFMA F53, TF51,  rcp12, ff52;
+--:-:-:-:1      FFMA F54, TF51, -rcp12, ff52;
+//--:-:-:-:1      MOV  F55, TF52;
+// Loop predicates: P0 = (C greater than 2) AND P2, P1 = (C greater than 0); C then drops by 2.
+--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, P2;
+--:-:-:-:1      ISETP.GT.AND P1, PT, C, 0, PT;
+--:-:-:-:1      IADD C, C, -2;
+// Load the jl0 operands from shared memory; the last load sets barrier 6.
+--:-:-:-:1      LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];
+--:-:-:-:1      LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:6:-:1      LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];
+// Store Frc at writeS + 4*32*(r*6 + c).  F50..F54 are not stored here; their stores sit in the insert table of the following block.
+--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 5)>], F55;
+
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 0)>], F00;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 1)>], F01;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 2)>], F02;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 3)>], F03;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 4)>], F04;
+--:-:-:-:1      STS [writeS + 4x<32*(0*6 + 5)>], F05;
+
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 0)>], F10;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 1)>], F11;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 2)>], F12;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 3)>], F13;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 4)>], F14;
+--:-:-:-:1      STS [writeS + 4x<32*(1*6 + 5)>], F15;
+
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 0)>], F20;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 1)>], F21;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 2)>], F22;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 3)>], F23;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 4)>], F24;
+--:-:-:-:1      STS [writeS + 4x<32*(2*6 + 5)>], F25;
+
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 0)>], F30;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 1)>], F31;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 2)>], F32;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 3)>], F33;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 4)>], F34;
+--:-:-:-:1      STS [writeS + 4x<32*(3*6 + 5)>], F35;
+
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 0)>], F40;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 1)>], F41;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 2)>], F42;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 3)>], F43;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 4)>], F44;
+--:-:-:-:1      STS [writeS + 4x<32*(4*6 + 5)>], F45;
+</SCHEDULE_BLOCK>
+
+<SCHEDULE_BLOCK>
+20:-:-:-:1      IADD   track0.CC, track0, param_RSK2p;
+--:-:-:-:1      IADD.X track1,    track1, RZ;
+<ORDERED>
+[+
+    # Generates two passes (j = 0, 1) of 32 FFMAs each,
+    # clx{x}y{y} += jl{j}Ix{x} * jl{j}Fy{y}, with loads and stores spliced in
+    # after selected FFMAs.  The returned text replaces this code section.
+    our ($dtype, $dsize, $SK, $K);
+
+    # Text appended after a given FFMA, keyed "j{pass}c{index}".
+    my %after = (
+        # pass-1 operand loads from shared memory (the last one sets barrier 5)
+        j0c0  => "--:-:-:-:1      LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n",
+        j0c1  => "--:-:-:-:1      LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n",
+        j0c15 => "--:-:5:-:1      LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n",
+        # stores of F50..F54 to writeS
+        j0c5  => "--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 0)>], F50;\n",
+        j0c7  => "--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 1)>], F51;\n",
+        j0c9  => "--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 2)>], F52;\n",
+        j0c11 => "--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 3)>], F53;\n",
+        j0c13 => "--:-:-:-:1      STS [writeS + 4x<32*(5*6 + 4)>], F54;\n",
+        # loads of f00..f22 through track, predicated on P0 (barriers 1, 2, 3)
+        j1c1  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f00, [track + ${dsize}x<0*3*$K + 0*$K>];\n",
+        j1c2  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f01, [track + ${dsize}x<0*3*$K + 1*$K>];\n",
+        j1c3  => "--:-:1:-:1  \@P0 LDG.E.CI.$dtype f02, [track + ${dsize}x<0*3*$K + 2*$K>];\n",
+
+        j1c4  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f10, [track + ${dsize}x<1*3*$K + 0*$K>];\n",
+        j1c5  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f11, [track + ${dsize}x<1*3*$K + 1*$K>];\n",
+        j1c6  => "--:-:2:-:1  \@P0 LDG.E.CI.$dtype f12, [track + ${dsize}x<1*3*$K + 2*$K>];\n",
+
+        j1c7  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f20, [track + ${dsize}x<2*3*$K + 0*$K>];\n",
+        j1c8  => "--:-:-:-:1  \@P0 LDG.E.CI.$dtype f21, [track + ${dsize}x<2*3*$K + 1*$K>];\n",
+        j1c9  => "--:-:3:-:1  \@P0 LDG.E.CI.$dtype f22, [track + ${dsize}x<2*3*$K + 2*$K>];\n",
+    );
+
+    my @offsets = ([0,2],[1,2],[1,0],[0,0]);
+    my @origins = ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4]);
+    my @coords  = map { my ($ox, $oy) = @$_; map { [$ox + $_->[0], $oy + $_->[1]] } @offsets } @origins;
+
+    my $text = '';
+    for my $pass (0, 1)
+    {
+        for my $idx (0 .. $#coords)
+        {
+            my ($x, $y) = @{ $coords[$idx] };
+            # Only the first FFMA of pass 1 carries a wait mask ("10").
+            my $wait = ($pass == 1 && $idx == 0) ? '10' : '--';
+            $text .= sprintf "%s:-:-:-:1      FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n",
+                $wait, $x, $y, $pass, $x, $pass, $y, $x, $y;
+            $text .= $after{"j${pass}c$idx"} || '';
+        }
+    }
+    return $text;
++]
+</ORDERED>
+
+</SCHEDULE_BLOCK>
+--:-:-:-:1      IADD readFs, readFs, -swapBuf; // both read pointers step by -swapBuf
+--:-:-:-:1      IADD readIs, readIs, -swapBuf;
+--:-:-:-:0      IADD writeS, writeS,  swapBuf; // the write pointer steps the opposite way
+--:-:-:Y:5      BAR.SYNC 0;
+--:-:-:-:0      IADD swapBuf, RZ,    -swapBuf; // negate swapBuf so the next iteration steps back again
+--:-:-:Y:5  @P1 BRA.U FILTER_LOOP; // repeat while P1, which the loop body sets from C greater than 0
+
+
+LOAD_FINISH:
+
+[-
+    # String constants published as package variables; nothing in the visible part
+    # of this file reads them (presumably the included common file does; verify).
+    our ($trans1, $trans2, $trans3) = ("0.244140625", "0.625", "0.390625");
+-]
+
+<INCLUDE file="xconv_winograd_4x4_3x3_32x32_common.sass"/>
diff --git a/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_X.sass b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_X.sass
new file mode 100644
index 0000000..15a0f0b
--- /dev/null
+++ b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_X.sass
@@ -0,0 +1,687 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+# Element-type settings.  $type eq 'h' selects the first value of every pair below,
+# any other value of $type selects the second.
+our $type;
+my $is_h = $type eq 'h';
+our ($dtype, $dshift, $dsize, $vsize) = $is_h ? ('U16', '1', '2', '64') : ('32', '2', '4', '128');
+our ($convert_in, $convert_out)       = $is_h ? ('F2F.F32.F16', 'F2F.F16.F32') : ('', '');
+# Accessors called from the inline code sections of this file.
+sub dtype  { $dtype  }
+sub dsize  { $dsize  }
+sub dshift { $dshift }
+sub vsize  { $vsize  }
+-]
+
+<CONSTANT_MAPPING>
+
+    addr_zero   : 4x<32*36*2*4 + 64 + 0>
+    addr_idx_Y  : 4x<32*36*2*4 + 64 + 4>
+    addr_idx_X  : 4x<32*36*2*4 + 64 + 5>
+    addr_idx_K  : 4x<32*36*2*4 + 64 + 6>
+
+    param_S[0]           : c[0x0][0x140]
+    param_S[1]           : c[0x0][0x144]
+    param_X[0]           : c[0x0][0x148]
+    param_X[1]           : c[0x0][0x14c]
+    param_O[0]           : c[0x0][0x150]
+    param_O[1]           : c[0x0][0x154]
+    param_I[0]           : c[0x0][0x158]
+    param_I[1]           : c[0x0][0x15c]
+    param_F[0]           : c[0x0][0x160]
+    param_F[1]           : c[0x0][0x164]
+    param_alpha          : c[0x0][0x168]
+    param_beta           : c[0x0][0x16c]
+    param_flags          : c[0x0][0x170]
+    param_C              : c[0x0][0x174]
+    param_K              : c[0x0][0x178]
+    param_N              : c[0x0][0x17c]
+    param_Xk             : c[0x0][0x180]
+    param_k              : c[0x0][0x184]
+    param_magic_Xk       : c[0x0][0x188]
+    param_shift_Xk       : c[0x0][0x18c]
+    param_magic_k        : c[0x0][0x190]
+    param_shift_k        : c[0x0][0x194]
+    param_C_1152         : c[0x0][0x198]
+    param_GXS_C_1152     : c[0x0][0x19c]
+    param_GYS_GXS_C_1152 : c[0x0][0x1a0]
+    param_P              : c[0x0][0x1a4]
+    param_Q              : c[0x0][0x1a8]
+    param_QN             : c[0x0][0x1ac]
+    param_PQN            : c[0x0][0x1b0]
+    param_PQN15          : c[0x0][0x1b4]
+    param_maskN          : c[0x0][0x1b8]
+    param_shiftX         : c[0x0][0x1bc]
+    param_shiftY         : c[0x0][0x1c0]
+    param_superX         : c[0x0][0x1c4]
+    param_superY         : c[0x0][0x1c8]
+    param_gridN          : c[0x0][0x1cc]
+    param_gridQN         : c[0x0][0x1d0]
+    param_gridPQN        : c[0x0][0x1d4]
+
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+       0-63 : czero<00-63>
+
+     3, 2,11,10 : clx<0-3>y0
+     7, 6,15,14 : clx<0-3>y1
+     1, 0, 9, 8 : clx<0-3>y2
+     5, 4,13,12 : clx<0-3>y3
+    19,18,27,26 : clx<0-3>y4
+    23,22,31,30 : clx<0-3>y5
+    17,16,25,24 : clx<0-3>y6
+    21,20,29,28 : clx<0-3>y7
+
+      32-43 : jl0Ix<0-3>, jl0Fy<0-7>
+      44-51 : jl1Ix<0-3>, jl1Fy<4-7>
+      36-39 : jl1Fy<0-3>
+
+      52-87 : T0<0-3>, T1<0-3>, T2<0-3>, T3<0-3>, T4<0-3>, T5<0-3>, T6<0-3>, T7<0-3>, T8<0-3>
+      88-89 : track<0-1>
+      90-91 ~ writeS
+
+      32-39 ~ partialC, idx_K, idx_Y, idx_X
+      40-86 ~ idx_KYXk, idx_YXk, idx_Xk, idx_k, div<1-3>, magic_YXk, negYXk, magic_Xk, negXk, tid32_2, tid1, tid31, c, offset, idx_N
+
+      32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1
+      48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16
+
+
+     3, 2,11,10,19,18,27,26 : ccx<0-7>y0
+     7, 6,15,14,23,22,31,30 : ccx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2
+     5, 4,13,12,21,20,29,28 : ccx<0-7>y3
+    35,34,43,42,51,50,59,58 : ccx<0-7>y4
+    39,38,47,46,55,54,63,62 : ccx<0-7>y5
+    33,32,41,40,49,48,57,56 : ccx<0-7>y6
+    37,36,45,44,53,52,61,60 : ccx<0-7>y7
+
+      64-79 : jc0Ix<0-7>, jc0Fy<0-7>
+      80-91 : jc1Ix<4-7>, jc1Fy<0-7>
+      64-67 : jc1Ix<0-3>
+
+      64-86 ~ tid16, tid_1, tid128
+
+         87 = tid
+      92-95 ~ C, swapBuf, readFs, readIs
+
+      64-85 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxN, idxX, idxY, idxK, readFs2, readIs2, p, q, n, z<1-3>, mask_q
+      86-95 ~ alpha, one, writeCs, readCs, k, preds, offsetO, bias, bsum_offset
+
+      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1
+
+      // t00 80      r00 78
+      // t10 m10     r01 w01
+      // t20 m20     r02 w02
+      // t30 m30     r03 w03
+      // w00 m00     s00 w00
+      // w30 m40     s01 w01
+      // w10 m10     s02 w02
+      // w20 m20     s03 w04
+
+      78 = t0<0-5>, r<0-3>0
+      79 = temp
+
+       3, 2,11,10,19,18 : m<0-5>0
+       1, 9, 0, 8,17,16 : m<0-5>1
+      27,26,25,24,64,65 : m<0-5>2
+                2,11,10 : t10, t20, t30
+                9, 0, 8 : t11, t21, t31
+               26,25,24 : t12, t22, t32
+             3, 2,11,19 : w00, w10, w20, w30
+             1, 9, 0,17 : w01, w11, w21, w31
+            27,26,25,64 : w02, w12, w22, w32
+
+      66,67,68,69,70,71 : m<0-5>3
+      72,73,74,75,76,77 : m<0-5>4
+       8,24,10,65,16,18 : m<0-5>5
+               67,68,69 : t13, t23, t33
+               73,74,75 : t14, t24, t34
+               24,10,65 : t15, t25, t35
+            66,67,68,70 : w03, w13, w23, w33
+            72,73,74,76 : w04, w14, w24, w34
+             8,24,10,16 : w05, w15, w25, w35
+
+                1,27,66 : r01, r02, r03
+                9,26,67 : r11, r12, r13
+                0,25,68 : r21, r22, r23
+               17,64,70 : r31, r32, r33
+             3, 1,27,72 : s00, s01, s02, s03
+             2, 9,26,73 : s10, s11, s12, s13
+            11, 0,25,74 : s20, s21, s22, s23
+            19,17,64,76 : s30, s31, s32, s33
+
+                  80-83 ~ xx<0-3>
+                  78-81 ~ sum<0-3>
+                  82-83 : Sum<0-1>
+                  84-85 : Out<0-1>
+
+             8,10,16,18 ~ b0<0-3>
+            24,65,66,67 ~ b1<0-3>
+            68,69,70,71 ~ b2<0-3>
+            75,77,78,79 ~ b3<0-3>
+
+</REGISTER_MAPPING>
+
+--:-:-:-:0      MOV C,   param_C; // kernel entry: C starts as param_C
+--:-:1:-:1      S2R tid, SR_TID.X; // thread index, result signalled on barrier 1
+--:-:-:-:1      MOV swapBuf, 4x<32*36*2*2>;
+01:-:-:-:0      ISETP.GE.AND P0, PT, tid, 128, PT; // P0 = (tid >= 128), after waiting for tid
+--:-:-:-:1      STS.128 [addr_zero], RZ; // write zeros at addr_zero; later LDS from addr_zero read them back to clear registers
+--:-:-:Y:c      LOP.AND partialC, C, 1; // partialC = C & 1
+--:-:-:-:0      IADD C, C, partialC; // round C up to an even value
+--:-:-:-:5  @P0 BRA.U COMPUTE_SETUP; // tid >= 128 goes to COMPUTE_SETUP, the rest fall through to LOAD_SETUP
+
+##############################################################
+LOAD_SETUP:
+// Setup for the loader threads (tid below 128, which did not take the branch to COMPUTE_SETUP).
+--:-:1:-:1      S2R idx_YXk, SR_CTAID.X;
+--:-:2:-:1      S2R idx_K,   SR_CTAID.Y;
+
+<SCHEDULE_BLOCK>
+// Clear czero00..czero31 with eight 128-bit loads from addr_zero.
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +]
+// P0 = (tid == 0), P1 = (tid >= 64)
+--:-:-:-:1      ISETP.EQ.AND P0, PT, tid, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND P1, PT, tid, 64, PT;
+
+// idx_Y   = idx_YXk / blk_Xk
+--:-:-:-:1      MOV  magic_Xk, param_magic_Xk;
+--:-:-:-:1      IADD negXk, RZ, -param_Xk;
+--:-:-:-:1      ISETP.NE.AND P3, PT, magic_Xk, 1, PT;
+01:-:-:-:1  @P3 XMAD     div1, idx_YXk,    magic_Xk,    RZ;
+--:-:-:-:1  @P3 XMAD     div2, idx_YXk,    magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD     div3, idx_YXk.H1, magic_Xk.H1, RZ;
+--:-:-:-:1  @P3 XMAD.CHI div1, idx_YXk.H1, magic_Xk,    div1;
+--:-:-:-:1  @P3 IADD3.RS idx_Y, div1, div2, div3;
+--:-:-:-:1  @P3 SHR.U32  idx_Y, idx_Y,   param_shift_Xk;
+--:-:-:-:1 @!P3 SHR.U32  idx_Y, idx_YXk, param_shift_Xk;
+
+// idx_Xk  = idx_YXk % blk_Xk
+--:-:-:-:1      XMAD.LO2 idx_Xk, negXk, idx_Y, idx_YXk;
+
+// idx_X   = idx_Xk / blk_k
+// idx_k   = idx_Xk % blk_k
+--:-:-:-:1      XMAD    idx_X,  idx_Xk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32 idx_X,  idx_X,  param_shift_k;
+--:-:-:-:1      XMAD    idx_k,  idx_X,  param_k, RZ;
+--:-:-:-:1      IADD    idx_k, -idx_k,  idx_Xk;
+
+// idx_K = idx_K * blk_k + idx_k
+02:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+// Only thread 0 (P0) writes idx_Y, idx_X and idx_K to addr_idx_Y, addr_idx_X and addr_idx_K.
+--:-:-:-:1  @P0 STS [addr_idx_Y], idx_Y;
+--:-:-:-:1  @P0 STS [addr_idx_X], idx_X;
+--:-:-:-:1  @P0 STS [addr_idx_K], idx_K;
+
+// tid32_2 = (tid & -32) >> 2
+--:-:-:-:1      LOP.AND  tid32_2,  tid,    -32;
+--:-:-:-:1      SHR.U32  tid32_2,  tid32_2, 2;
+
+// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7)
+--:-:-:-:1      BFE.U32 readIs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, tid32_2;
+--:-:-:-:1      SHL     readIs, readIs, 4;
+
+// readFs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readFs, tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      IADD3   readFs, readFs, tid1, tid32_2;
+--:-:-:-:1      ISCADD  readFs, readFs, 4x<32*36*2>, 4;
+
+// c = (tid & 63) >> 5
+--:-:-:-:1      BFE.U32 c, tid, 0x105; // 1 bit at position 5
+
+// partialC  = (2 - partialC)
+// P6        = c < partialC
+// partialC *= 32*36 * itemsize
+--:-:-:-:1      IADD partialC, -partialC, 2;
+--:-:-:-:1      ISETP.LT.AND P6, PT, c, partialC, PT;
+--:-:-:-:1      XMAD partialC,  partialC, 1x<32*36 * $dsize>, RZ;
+
+// writeS = (c*32*36 + (tid & 31)*4)*4
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      SHL writeS, tid31, 4;
+--:-:-:-:1      XMAD writeS, c, 4x<32*36>, writeS;
+
+// offset = c*32*36 + tid31*4
+--:-:-:-:1      SHL tid31, tid31, 2;
+--:-:-:-:1      XMAD offset, c, 1x<32*36>, tid31;
+
+
+// P5 = C > 2
+--:-:-:-:1      ISETP.GT.AND P5, PT, C, 2, PT;
+</SCHEDULE_BLOCK>
+// Threads with tid >= 64 (P1) continue at FILTER_SETUP; the rest fall through to IMAGE_SETUP.
+--:-:-:-:5  @P1 BRA.U FILTER_SETUP;
+
+##############################################################
+IMAGE_SETUP:
+// Image path (tid below 64): extend offset by the idx_X, idx_Y and idx_N terms and form the 64-bit pointer track0/track1 = param_I + (offset shifted left by dshift).
+--:-:1:-:1      S2R idx_N, SR_CTAID.Z;
+<SCHEDULE_BLOCK>
+// (GN,GYS,GXS,C,6,6,32)
+// offset += (idx_N*GYS*GXS*C*32*36 + idx_Y*GXS*C*32*36 + idx_X*C*32*36) * itemsize;
+--:-:-:-:1      XMAD.LO2C offset, idx_X, param_C_1152, offset;
+--:-:-:-:1      XMAD.LO2C offset, idx_Y, param_GXS_C_1152, offset;
+01:-:-:-:1      XMAD.LO2C offset, idx_N, param_GYS_GXS_C_1152, offset;
+--:-:-:-:1      LEA      track0.CC, offset, param_I[0],     [+ dshift() +];
+--:-:-:-:0      LEA.HI.X track1,    offset, param_I[1], RZ, [+ dshift() +];
+</SCHEDULE_BLOCK>
+// Skip FILTER_SETUP and continue with the shared LOAD code.
+--:-:-:-:5      BRA.U LOAD;
+
+##############################################################
+FILTER_SETUP:
+// Filter path (tid 64..127): writeS is advanced by 4*(32*36*2) and track0/track1 is formed from param_F instead of param_I; execution then falls through to LOAD.
+<SCHEDULE_BLOCK>
+// writeS += 32*36*2*4
+--:-:-:-:1      IADD writeS, writeS, 4x<32*36*2>;
+
+// (kBlks,C,6,6,32)
+// offset += (idx_K*C*32*36) * itemsize;
+--:-:-:-:1      XMAD.LO2C offset, idx_K, param_C_1152, offset;
+--:-:-:-:1      LEA      track0.CC, offset, param_F[0],     [+ dshift() +];
+--:-:-:-:2      LEA.HI.X track1,    offset, param_F[1], RZ, [+ dshift() +];
+</SCHEDULE_BLOCK>
+
+##############################################################
+LOAD:
+// First fetch for the loader threads: nine vectors T0..T8 come from global memory through track when P6 is set (P6 = c below partialC, from LOAD_SETUP), otherwise zeros are read from addr_zero.
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T0, [track + 4x<0*32 * $dsize>];
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T1, [track + 4x<1*32 * $dsize>];
+--:-:2:-:1  @P6 LDG.E.[+ vsize() +] T2, [track + 4x<2*32 * $dsize>];
+
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T0, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T1, [addr_zero];
+--:-:2:-:1 @!P6 LDS.U.[+ vsize() +] T2, [addr_zero];
+
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T3, [track + 4x<3*32 * $dsize>];
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T4, [track + 4x<4*32 * $dsize>];
+--:-:3:-:1  @P6 LDG.E.[+ vsize() +] T5, [track + 4x<5*32 * $dsize>];
+
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T3, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T4, [addr_zero];
+--:-:3:-:1 @!P6 LDS.U.[+ vsize() +] T5, [addr_zero];
+
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T6, [track + 4x<6*32 * $dsize>];
+--:-:-:-:1  @P6 LDG.E.[+ vsize() +] T7, [track + 4x<7*32 * $dsize>];
+--:-:4:-:1  @P6 LDG.E.[+ vsize() +] T8, [track + 4x<8*32 * $dsize>];
+
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T6, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.[+ vsize() +] T7, [addr_zero];
+--:-:4:-:1 @!P6 LDS.U.[+ vsize() +] T8, [addr_zero];
+// Store T0..T8 to shared memory at writeS; when convert_in is set, the halves of each vector are first converted from F16 to F32.
+[+
+    our $convert_in;
+    return $convert_in ? q{
+
+02:-:-:-:1      F2F.F32.F16 T03, T01.H1;
+--:-:-:-:1      F2F.F32.F16 T02, T01.H0;
+--:-:-:-:1      F2F.F32.F16 T01, T00.H1;
+--:-:2:-:1      F2F.F32.F16 T00, T00.H0;
+
+--:-:-:-:1      F2F.F32.F16 T13, T11.H1;
+--:-:-:-:1      F2F.F32.F16 T12, T11.H0;
+--:-:-:-:1      F2F.F32.F16 T11, T10.H1;
+--:-:5:-:1      F2F.F32.F16 T10, T10.H0;
+
+--:-:-:-:1      F2F.F32.F16 T23, T21.H1;
+--:-:-:-:1      F2F.F32.F16 T22, T21.H0;
+--:-:-:-:1      F2F.F32.F16 T21, T20.H1;
+--:-:6:-:1      F2F.F32.F16 T20, T20.H0;
+
+02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;
+
+04:-:-:-:1      F2F.F32.F16 T33, T31.H1;
+--:-:-:-:1      F2F.F32.F16 T32, T31.H0;
+--:-:-:-:1      F2F.F32.F16 T31, T30.H1;
+--:-:3:-:1      F2F.F32.F16 T30, T30.H0;
+
+10:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;
+
+--:-:-:-:1      F2F.F32.F16 T43, T41.H1;
+--:-:-:-:1      F2F.F32.F16 T42, T41.H0;
+--:-:-:-:1      F2F.F32.F16 T41, T40.H1;
+--:-:5:-:1      F2F.F32.F16 T40, T40.H0;
+
+20:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;
+
+--:-:-:-:1      F2F.F32.F16 T53, T51.H1;
+--:-:-:-:1      F2F.F32.F16 T52, T51.H0;
+--:-:-:-:1      F2F.F32.F16 T51, T50.H1;
+--:-:6:-:1      F2F.F32.F16 T50, T50.H0;
+
+04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;
+
+08:-:-:-:1      F2F.F32.F16 T63, T61.H1;
+--:-:-:-:1      F2F.F32.F16 T62, T61.H0;
+--:-:-:-:1      F2F.F32.F16 T61, T60.H1;
+--:-:4:-:1      F2F.F32.F16 T60, T60.H0;
+
+10:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;
+
+--:-:-:-:1      F2F.F32.F16 T73, T71.H1;
+--:-:-:-:1      F2F.F32.F16 T72, T71.H0;
+--:-:-:-:1      F2F.F32.F16 T71, T70.H1;
+--:-:5:-:1      F2F.F32.F16 T70, T70.H0;
+
+20:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;
+
+--:-:-:-:1      F2F.F32.F16 T83, T81.H1;
+--:-:-:-:1      F2F.F32.F16 T82, T81.H0;
+--:-:-:-:1      F2F.F32.F16 T81, T80.H1;
+--:-:6:-:1      F2F.F32.F16 T80, T80.H0;
+
+08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
+10:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
+20:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;
+
+    } : q{
+02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;
+--:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;
+--:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;
+04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;
+--:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;
+--:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;
+08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
+--:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
+--:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;
+    };
++]
+// Advance the 64-bit track pointer by partialC; the carry is added to track1 by the IADD.X further down.
+--:-:-:-:0      IADD   track0.CC, track0, partialC;
+// Barrier, then move writeS by swapBuf and negate swapBuf.
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeS, writeS, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD.X track1,    track1, RZ;
+// Load the jl0 operands from shared memory (the last load sets barrier 1).
+--:-:-:-:1      LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:1:-:1      LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];
+// Fetch the next nine vectors when P5 (C greater than 2, from LOAD_SETUP) is set.
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T0, [track + 4x<0*32 * $dsize>];
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T1, [track + 4x<1*32 * $dsize>];
+--:-:2:-:1  @P5 LDG.E.[+ vsize() +] T2, [track + 4x<2*32 * $dsize>];
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T3, [track + 4x<3*32 * $dsize>];
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T4, [track + 4x<4*32 * $dsize>];
+--:-:3:-:1  @P5 LDG.E.[+ vsize() +] T5, [track + 4x<5*32 * $dsize>];
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T6, [track + 4x<6*32 * $dsize>];
+--:-:-:-:1  @P5 LDG.E.[+ vsize() +] T7, [track + 4x<7*32 * $dsize>];
+--:6:4:-:1  @P5 LDG.E.[+ vsize() +] T8, [track + 4x<8*32 * $dsize>];
+// Continue in the loader main loop.
+--:-:-:-:5      BRA.U LOAD_LOOP;
+
+##############################################################
+
+COMPUTE_SETUP:
+// Setup for the compute threads (tid >= 128): clear czero00..czero63 with sixteen 128-bit loads from addr_zero and derive the shared-memory read addresses.
+<SCHEDULE_BLOCK>
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+--:-:-:-:1      IADD tid128, tid, -128;
+// The instructions below apply these formulas to tid128 = tid - 128, not to the raw tid.
+// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
+// readFs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
+--:-:-:-:1      LOP.AND  tid16,  tid128, -16;
+--:-:-:-:1      SHR.U32  tid16,  tid16,   1;
+
+--:-:-:-:1      BFE.U32  readIs, tid128, 0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs, readIs, tid16;
+--:-:-:-:1      ISCADD   readIs, readIs, 4x<32*4>, 4;
+
+--:-:-:-:1      LOP.AND  tid_1,  tid128, 1;
+--:-:-:-:1      LOP.AND  readFs, tid128, 8;
+--:-:-:-:1      SHR.U32  readFs, readFs, 2;
+--:-:-:-:1      IADD3    readFs, readFs, tid16, tid_1;
+--:-:-:-:0      ISCADD   readFs, readFs, 4x<32*4 + 32*36*2>, 4;
+</SCHEDULE_BLOCK>
+// Barrier 0 before the first shared-memory reads (presumably pairs with the BAR.SYNC in LOAD - confirm).
+--:-:-:-:5      BAR.SYNC 0;
+// Load the jc0 operands used by the first pass of COMPUTE_LOOP (the last load sets barrier 1).
+--:-:-:-:1      LDS.U.128 jc0Ix0, [readIs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Fy0, [readFs + 4x<0*32*36 + 00>];
+--:-:-:-:1      LDS.U.128 jc0Ix4, [readIs + 4x<0*32*36 + 16>];
+--:-:1:-:2      LDS.U.128 jc0Fy4, [readFs + 4x<0*32*36 + 16>];
+
+COMPUTE_LOOP:
+[+
+    # One loop iteration for the compute threads: two passes (j = 0, 1) of 64
+    # FFMAs, ccx{x}y{y} += jc{j}Ix{x} * jc{j}Fy{y}.  %after holds text that is
+    # emitted right after a given FFMA, keyed "j{pass}c{index}".
+    my %after = (
+
+        j0c33 => "--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;\n" .
+                 "--:-:-:-:1      IADD C, C, -2;\n",
+
+        j0c62 => "02:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1      IADD readFs, readFs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD readIs, readIs, swapBuf;\n" .
+                 "--:-:-:-:1      IADD swapBuf, RZ,   -swapBuf;\n",
+
+        j1c63 => "--:-:-:Y:5  \@P0 BRA.U COMPUTE_LOOP;\n" .
+                 "--:-:-:Y:5      BRA.U COMPUTE_FINISH;\n",
+    );
+
+    # Each pass also loads the operands of the other buffer (1 - pass); in
+    # pass 1 those loads are predicated on P0.
+    for my $pass (0, 1)
+    {
+        my $other = 1 - $pass;
+        my $pred  = $pass == 1 ? '@P0' : '   ';
+        my $rbar  = $pass == 0 ? '2' : '-';
+        $after{"j${pass}c0"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy4, [readFs + 4x<%d*32*36 + 16>];\n", $pred, $other, $other;
+        $after{"j${pass}c2"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dIx4, [readIs + 4x<%d*32*36 + 16>];\n", $pred, $other, $other;
+        $after{"j${pass}c4"}  = sprintf "--:-:-:-:1  %s LDS.U.128 jc%dFy0, [readFs + 4x<%d*32*36 + 00>];\n", $pred, $other, $other;
+        $after{"j${pass}c31"} = sprintf "--:%s:1:-:1  %s LDS.U.128 jc%dIx0, [readIs + 4x<%d*32*36 + 00>];\n", $rbar, $pred, $other, $other;
+    }
+
+    # 64 (x, y) coordinates: x origins 0,2,4,6; the y-origin order (0,1,4,5)
+    # is reversed after each x origin.
+    my @offsets = ([0,2],[1,2],[1,0],[0,0]);
+    my @rows    = (0,1,4,5);
+    my @coords;
+    for my $col (0,2,4,6)
+    {
+        push @coords, map { my $row = $_; map { [$col + $_->[0], $row + $_->[1]] } @offsets } @rows;
+        @rows = reverse @rows;
+    }
+
+    my $text = '';
+    for my $pass (0, 1)
+    {
+        for my $idx (0 .. 63)
+        {
+            my ($x, $y) = @{ $coords[$idx] };
+            my $extra   = $after{"j${pass}c$idx"} || '';
+            # Stall count 0 when the first line of the spliced text holds one of these opcodes, else 1.
+            my $stall   = $extra =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+            my $yield   = ($stall && $idx % 10 == 0) ? 'Y' : '-';
+            my $wait    = $idx == 0 ? '01' : '--';
+            $text .= sprintf "%s:-:-:%s:%s      FFMA ccx%dy%d, jc%dIx%d, jc%dFy%d, ccx%dy%d;\n%s",
+                $wait, $yield, $stall, $x, $y, $pass, $x, $pass, $y, $x, $y, $extra;
+        }
+    }
+    return $text;
++]
+
+LOAD_LOOP:
+--:-:-:-:1      ISETP.GT.AND P0, PT, C, 2, PT;
+20:-:-:-:1      IADD track0.CC, track0, 1x<32*36*2 * $dsize>;
+--:-:-:-:1      ISETP.GT.AND P1, PT, C, 4, PT;
+--:-:-:-:1      IADD C, C, -2;
+[+
+    our ($vsize, $dsize, $convert_in);
+    my %insert = (
+
+        j0c3 => "--:-:-:-:1      IADD.X track1, track1, RZ;\n",
+
+        j0c0  => "--:-:-:-:1      LDS.U.128 jl1Fy4, [readFs + 4x<1*32*36 + 16>];\n",
+        j0c2  => "--:-:-:-:1      LDS.U.128 jl1Ix0, [readIs + 4x<1*32*36 + 00>];\n",
+        j0c18 => "--:-:1:-:1      LDS.U.128 jl1Fy0, [readFs + 4x<1*32*36 + 00>];\n",
+
+        j1c12 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Fy4, [readFs + 4x<0*32*36 + 16>];\n",
+        j1c14 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Ix0, [readIs + 4x<0*32*36 + 00>];\n",
+        j1c16 => "--:-:1:-:1  \@P0 LDS.U.128 jl0Fy0, [readFs + 4x<0*32*36 + 00>];\n",
+
+        $convert_in ? (
+
+            j0c1  => "02:-:-:-:1      F2F.F32.F16 T03, T01.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T02, T01.H0;\n",
+            j0c4  => "--:-:-:-:1      F2F.F32.F16 T01, T00.H1;\n" .
+                     "--:-:2:-:1      F2F.F32.F16 T00, T00.H0;\n",
+
+            j0c5  => "--:-:-:-:1      F2F.F32.F16 T13, T11.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T12, T11.H0;\n",
+            j0c6  => "--:-:-:-:1      F2F.F32.F16 T11, T10.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T10, T10.H0;\n",
+
+            j0c7  => "--:-:-:-:1      F2F.F32.F16 T23, T21.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T22, T21.H0;\n",
+            j0c8  => "--:-:-:-:1      F2F.F32.F16 T21, T20.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T20, T20.H0;\n",
+
+            j0c9  => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n",
+            j0c10 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
+            j0c11 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",
+
+            j0c13 => "02:-:-:-:1  \@P1 LDG.E.$vsize T0, [track + 4x<0*32 * $dsize>];\n",
+            j0c14 => "10:-:-:-:1  \@P1 LDG.E.$vsize T1, [track + 4x<1*32 * $dsize>];\n",
+            j0c15 => "20:-:2:-:1  \@P1 LDG.E.$vsize T2, [track + 4x<2*32 * $dsize>];\n",
+
+            j0c16 => "04:-:-:-:1      F2F.F32.F16 T33, T31.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T32, T31.H0;\n",
+            j0c17 => "--:-:-:-:1      F2F.F32.F16 T31, T30.H1;\n" .
+                     "--:-:3:-:1      F2F.F32.F16 T30, T30.H0;\n",
+
+            j0c19 => "--:-:-:-:1      F2F.F32.F16 T43, T41.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T42, T41.H0;\n",
+            j0c20 => "--:-:-:-:1      F2F.F32.F16 T41, T40.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T40, T40.H0;\n",
+
+            j0c21 => "--:-:-:-:1      F2F.F32.F16 T53, T51.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T52, T51.H0;\n",
+            j0c22 => "--:-:-:-:1      F2F.F32.F16 T51, T50.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T50, T50.H0;\n",
+
+            j0c23 => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
+            j0c24 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
+            j0c25 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",
+
+            j0c27 => "04:-:-:-:1  \@P1 LDG.E.$vsize T3, [track + 4x<3*32 * $dsize>];\n",
+            j0c28 => "10:-:-:-:1  \@P1 LDG.E.$vsize T4, [track + 4x<4*32 * $dsize>];\n",
+            j0c29 => "20:-:3:-:1  \@P1 LDG.E.$vsize T5, [track + 4x<5*32 * $dsize>];\n",
+
+            j0c30 => "08:-:-:-:1      F2F.F32.F16 T63, T61.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T62, T61.H0;\n",
+            j0c31 => "--:-:-:-:1      F2F.F32.F16 T61, T60.H1;\n" .
+                     "--:-:4:-:1      F2F.F32.F16 T60, T60.H0;\n",
+
+            j1c0  => "--:-:-:-:1      F2F.F32.F16 T73, T71.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T72, T71.H0;\n",
+            j1c1  => "--:-:-:-:1      F2F.F32.F16 T71, T70.H1;\n" .
+                     "--:-:5:-:1      F2F.F32.F16 T70, T70.H0;\n",
+
+            j1c2  => "--:-:-:-:1      F2F.F32.F16 T83, T81.H1;\n" .
+                     "--:-:-:-:1      F2F.F32.F16 T82, T81.H0;\n",
+            j1c3  => "--:-:-:-:1      F2F.F32.F16 T81, T80.H1;\n" .
+                     "--:-:6:-:1      F2F.F32.F16 T80, T80.H0;\n",
+
+            j1c4  => "08:4:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
+            j1c5  => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
+            j1c6  => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",
+
+            j1c8  => "08:-:-:-:1  \@P1 LDG.E.$vsize T6, [track + 4x<6*32 * $dsize>];\n",
+            j1c9  => "10:-:-:-:1  \@P1 LDG.E.$vsize T7, [track + 4x<7*32 * $dsize>];\n",
+            j1c10 => "20:6:4:-:1  \@P1 LDG.E.$vsize T8, [track + 4x<8*32 * $dsize>];\n",
+
+        ) : (
+
+            j0c6  => "02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;\n",
+            j0c8  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
+            j0c10 => "--:2:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",
+
+            j0c12 => "02:-:-:-:1  \@P1 LDG.E.$vsize T0, [track + 4x<0*32 * $dsize>];\n",
+            j0c14 => "--:-:-:-:1  \@P1 LDG.E.$vsize T1, [track + 4x<1*32 * $dsize>];\n",
+            j0c16 => "--:-:2:-:1  \@P1 LDG.E.$vsize T2, [track + 4x<2*32 * $dsize>];\n",
+
+            j0c20 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
+            j0c22 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
+            j0c24 => "--:3:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",
+
+            j0c26 => "04:-:-:-:1  \@P1 LDG.E.$vsize T3, [track + 4x<3*32 * $dsize>];\n",
+            j0c28 => "--:-:-:-:1  \@P1 LDG.E.$vsize T4, [track + 4x<4*32 * $dsize>];\n",
+            j0c30 => "--:-:3:-:1  \@P1 LDG.E.$vsize T5, [track + 4x<5*32 * $dsize>];\n",
+
+            j1c0  => "08:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
+            j1c2  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
+            j1c4  => "--:4:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",
+
+            j1c6  => "08:-:-:-:1  \@P1 LDG.E.$vsize T6, [track + 4x<6*32 * $dsize>];\n",
+            j1c8  => "--:-:-:-:1  \@P1 LDG.E.$vsize T7, [track + 4x<7*32 * $dsize>];\n",
+            j1c10 => "--:6:4:-:1  \@P1 LDG.E.$vsize T8, [track + 4x<8*32 * $dsize>];\n",
+        ),
+
+        j1c11 => "--:-:-:Y:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readFs, readFs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readIs, readIs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeS, writeS,  swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,    -swapBuf;\n",
+
+        j1c31 => "--:-:-:Y:5  \@P0 BRA.U LOAD_LOOP;\n",
+    );
+
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
+    {
+        my ($x, $y) = @$xy;
+        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+    }
+    my $out;
+    foreach my $j (0 .. 1)
+    {
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $wait   = $c == 0 ? "01" : '--';
+
+            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:-:$stall";
+
+            $out .= sprintf "%s      FFMA clx%dy%d, jl%dIx%d, jl%dFy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+[-
+    # Immediates for the FMUL32I steps of the output transform in the file included
+    # below: 0.7**3, 0.7 and 0.7**2, kept as strings so they interpolate verbatim.
+    our ($trans1, $trans2, $trans3) = ("0.343", "0.700", "0.490");
+-]
+
+<INCLUDE file="xconv_winograd_4x4_3x3_32x32_common.sass"/>
diff --git a/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_common.sass b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_common.sass
new file mode 100644
index 0000000..f2a06e6
--- /dev/null
+++ b/Kernel/Convolution/Pascal/xconv_winograd_4x4_3x3_32x32_common.sass
@@ -0,0 +1,807 @@
+
+# Copyright 2016 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--:-:1:-:2      S2R Tid, SR_TID.X; // sets scoreboard barrier 1 (third control field); the 01 wait masks below wait on it
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha16, param_alpha;
+// Tid32_2 = (tid & -32) >> 2
+01:-:-:-:1      LOP.AND  Tid32_2,  Tid,    -32;
+--:-:-:-:1      SHR.U32  Tid32_2,  Tid32_2, 2;
+
+// readFs = ((tid & 16) >> 3) | (tid & 1)
+--:-:-:-:1      LOP.AND Tid1,   Tid,    1;
+01:-:-:-:1      LOP.AND readFs, Tid,    16;
+--:-:-:-:1      SHR.U32 readFs, readFs, 3;
+--:-:-:-:1      IADD    readFs, readFs, Tid1;
+
+// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readFs << 2)
+--:-:-:-:1      BFE.U32 readIs, Tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readIs, readIs, Tid32_2;
+--:-:-:-:1      ISCADD  readIs, readFs, readIs, 2;
+// pre-scale both indices before they are combined: readIs <<= 4, readFs <<= 3
+--:-:-:-:1      SHL readIs, readIs, 4;
+--:-:-:-:1      SHL readFs, readFs, 3;
+
+// write16Cs = readFs * 32*36 + readIs (both operands already shifted above)
+--:-:-:-:1      XMAD write16Cs, readFs, 1x<32*36>, readIs;
+</SCHEDULE_BLOCK>
+// Pass 1 of 4: scale clx[0-3]y0 and clx[0-3]y2 by alpha16, then issue both STS.128 stores.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y0, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y2, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y2, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y2, alpha16;
+--:-:-:-:4      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Pass 2 of 4: clx[0-3]y1 and clx[0-3]y3; from here on a BAR.SYNC also precedes the stores (same addresses are rewritten).
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y1, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y3, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y3, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y3, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Pass 3 of 4: clx[0-3]y4 and clx[0-3]y6.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y4, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y6, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y6, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y6, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+// Pass 4 of 4: clx[0-3]y5 and clx[0-3]y7.
+--:-:-:-:1      FMUL shuffle16_x0y0, clx0y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y0, clx1y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y0, clx2y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x3y0, clx3y5, alpha16;
+--:-:-:-:1      FMUL shuffle16_x0y1, clx0y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x1y1, clx1y7, alpha16;
+--:-:-:-:1      FMUL shuffle16_x2y1, clx2y7, alpha16;
+--:-:-:-:0      FMUL shuffle16_x3y1, clx3y7, alpha16;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
+--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      EXIT;
+
+COMPUTE_FINISH:
+// Per-thread setup: tid_128 = SR_TID.X - 128, P4 = (tid_128 >= 256), and the STS base address writeCs.
+--:-:1:-:2      S2R tid_128, SR_TID.X;
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+
+01:-:-:-:1      IADD tid_128, tid_128, -128;
+
+--:-:-:-:1      ISETP.GE.AND P4, PT, tid_128, 256, PT; // P4 drives the "@P4 BRA.U SKIPn" branches further down
+
+// readFs2 = ((tid_128 &  8) >> 2) | (tid_128 & 1)
+--:-:-:-:1      LOP.AND  Tid_1,   tid_128, 1;
+--:-:-:-:1      LOP.AND  readFs2, tid_128, 8;
+--:-:-:-:1      SHR.U32  readFs2, readFs2, 2;
+--:-:-:-:1      IADD     readFs2, readFs2, Tid_1;
+
+// readIs2 = ((tid_128 & -16) >> 1) | ((tid_128 >> 1) & 3) | (readFs2 << 2)
+--:-:-:-:1      LOP.AND  tid_16,   tid_128, -16;
+--:-:-:-:1      SHR.U32  tid_16,   tid_16,   1;
+--:-:-:-:1      BFE.U32  readIs2,  tid_128,  0x201; // 2 bits at position 1
+--:-:-:-:1      LOP.OR   readIs2,  readIs2,  tid_16;
+--:-:-:-:1      ISCADD   readIs2,  readFs2, readIs2, 2;
+
+--:-:-:-:1      ISCADD   readIs2, readIs2, 4x<32*4>, 4; // readIs2 = (readIs2 << 4) + the scaled 32*4 constant
+--:-:-:-:1      SHL      readFs2, readFs2, 3;
+
+// writeCs = readFs2 * 32*36 + readIs2 (both operands already scaled above)
+--:-:-:-:0      XMAD writeCs, readFs2, 1x<32*36>, readIs2;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P4 BRA.U SKIP0; // threads with P4 set bypass the index/predicate setup below
+
+--:-:2:-:1      LDS idxX, [addr_idx_X];
+--:-:3:-:1      LDS idxY, [addr_idx_Y];
+--:-:1:-:1      S2R idxN,  SR_CTAID.Z;
+--:-:4:-:1      LDS idxK, [addr_idx_K];
+<SCHEDULE_BLOCK>
+// Wait masks in this block pair with the loads above: 01 = idxN, 02 = idxX, 04 = idxY, 08 = idxK.
+--:-:-:-:1      LOP.AND tid_31, tid_128, 31;
+--:-:-:-:1      SHR.U32 tid_32, tid_128,  5;
+--:-:-:-:1      SHR.U32 tid_64, tid_128,  6;
+
+[+
+    our $bsum; return $bsum ? q{
+03:-:-:-:1      XMAD      bsum_offset, idxX, param_gridN,   idxN;
+04:-:-:-:1      XMAD.LO2C bsum_offset, idxY, param_gridQN,  bsum_offset;
+    } : '';
++]
+
+--:-:-:-:1      MOV32I one, 1.0;
+
+// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16
+--:-:-:-:1      XMAD   readCs, tid_32, 1x<32*36>, tid_31;
+--:-:-:-:1      ISCADD readCs, tid_64, readCs, 4;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// n = idxN*32 + tid & maskN
+--:-:-:-:1      LOP.AND n, tid_31, param_maskN;
+01:-:-:-:1      ISCADD  n, idxN, n, 5;
+
+// Superblock offset
+// idxX <<= shiftX
+// idxY <<= shiftY
+02:-:-:-:1      SHL idxX, idxX, param_shiftX;
+04:-:-:-:1      SHL idxY, idxY, param_shiftY;
+
+// Get this thread's offset within the superblock
+--:-:-:-:1      BFE.U32 q, tid_31, param_superX;
+--:-:-:-:1      BFE.U32 p, tid_31, param_superY;
+--:-:-:-:1      ISCADD q, q, idxX, 2;
+--:-:-:-:1      ISCADD p, p, idxY, 2;
+
+// k = idxK*32 + tid_32<<1
+--:-:-:-:1      SHL tid_32, tid_32,   1;
+08:-:-:-:1      ISCADD k, idxK, tid_32, 5;
+
+// Out = k*PQN + p*QN + q*N + n
+--:-:-:-:1      XMAD      offsetO, q, param_N,    n;
+--:-:-:-:1      XMAD.LO2C offsetO, p, param_QN,   offsetO;
+--:-:-:-:1      XMAD.LO2C offsetO, k, param_PQN,  offsetO;
+
+--:-:-:-:1      IADD z1, q, 1;
+--:-:-:-:1      IADD z2, q, 2;
+--:-:-:-:1      IADD z3, q, 3;
+// P0-P3 = (0 <= q+i < Q) for i = 0..3, each ANDed with P5 = (param_flags == 0); packed into mask_q.
+--:-:-:-:1      ISETP.EQ.AND P5, PT, RZ, param_flags, PT; // ! no-op
+--:-:-:-:1      ISETP.LT.AND P6, PT, n, param_N, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, q,  param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_Q, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_Q, P5;
+--:-:-:-:1      ISETP.GE.AND P0, PT, q,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;
+--:-:-:-:1      P2R mask_q, PR, RZ, 0x0f;
+// P0-P3 are now reused for (0 <= p+i < P) for i = 0..3, each ANDed with P6 = (n < N).
+--:-:-:-:1      IADD z1, p, 1;
+--:-:-:-:1      IADD z2, p, 2;
+--:-:-:-:1      IADD z3, p, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, p,  param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_P, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_P, P6;
+--:-:-:-:1      ISETP.GE.AND P0, PT, p,  RZ, P0;
+--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
+--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;
+// preds: 4-bit group i holds mask_q when P<i> passed, else 0 (0x404/0x408/0x40c presumably = 4 bits at position 4/8/12, as for the BFE operands).
+--:-:-:-:1      SEL preds, mask_q, RZ, P0;
+--:-:-:-:1  @P1 BFI preds, mask_q, 0x404, preds;
+--:-:-:-:1  @P2 BFI preds, mask_q, 0x408, preds;
+--:-:-:-:1  @P3 BFI preds, mask_q, 0x40c, preds;
+
+--:-:-:-:1      ISETP.EQ.AND P6, PT, tid_31, RZ, PT; // P6 redefined as (tid_31 == 0); combined with P5 before the bsum store
+</SCHEDULE_BLOCK>
+
+SKIP0:
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      FMUL shuffle_x0y0, ccx0y0, alpha; // pass 1 of 4: ccx[0-7]y0 and ccx[0-7]y2, scaled by alpha
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y0, alpha;
+--:-:-:-:1      FMUL shuffle_x7y0, ccx7y0, alpha;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y2, alpha;
+--:-:-:-:1      FMUL shuffle_x3y1, ccx3y2, alpha;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y2, alpha;
+--:-:-:-:1      FMUL shuffle_x7y1, ccx7y2, alpha;
+
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+</SCHEDULE_BLOCK>
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP1; // P4 threads skip the transform call and the k/offsetO bookkeeping
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 1;
+--:-:-:-:1      IADD offsetO, offsetO, param_PQN; // k advanced by 1, so offsetO advances by PQN (Out = k*PQN + ...)
+
+SKIP1:
+// Pass 2 of 4: ccx[0-7]y1 and ccx[0-7]y3; a BAR.SYNC follows the first FMUL, before any store of this pass.
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y1, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y1, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y1, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP2;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 15;
+--:-:-:-:1      IADD offsetO, offsetO, param_PQN15; // k jumps by 15; param_PQN15 is presumably 15*PQN -- confirm against the host-side setup
+
+SKIP2:
+// Pass 3 of 4: ccx[0-7]y4 and ccx[0-7]y6.
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y4, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y4, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y4, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y6, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP3;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+--:-:-:-:1      IADD k, k, 1;
+--:-:-:-:1      IADD offsetO, offsetO, param_PQN;
+
+SKIP3:
+// Pass 4 of 4: ccx[0-7]y5 and ccx[0-7]y7; no k/offsetO update follows the last transform call.
+--:-:-:-:0      FMUL shuffle_x0y0, ccx0y5, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      FMUL shuffle_x1y0, ccx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, ccx2y5, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, ccx3y5, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, ccx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, ccx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, ccx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, ccx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, ccx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, ccx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, ccx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, ccx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, ccx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, ccx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, ccx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, ccx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
+--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5  @P4 BRA.U SKIP4;
+--:-:-:-:5      CAL OUTPUT_TRANSFORM;
+SKIP4:
+
+--:-:-:-:5      EXIT;
+
+OUTPUT_TRANSFORM:
+// Subroutine reached through CAL; P5 = (k < param_K) selects between preds and RZ in every R2P further down.
+<SCHEDULE_BLOCK>
+11:-:-:-:1      ISETP.LT.AND P5, PT, k, param_K, PT;
+[+
+    our $bias;  # kernel option: load bias via the Sum0/Sum1 pair built from param_S and k, or zero it when P5 is false
+    return $bias ? q{
+--:-:-:-:1      LEA      Sum0.CC, k, param_S[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    k, param_S[1], RZ, 2;
+
+--:-:-:-:1 @!P5 MOV bias, RZ;
+--:-:5:-:1  @P5 LDG.E.CI bias, [Sum];
+    } : '';
++]
+</SCHEDULE_BLOCK>
+
+[+
+    # Emit the first 18 LDS loads: registers m$j$i for $j = 0..5 and $i = 0..2,
+    # each read from readCs plus the scaled offset ($j*6+$i)*32. Loads with
+    # second index $i set scoreboard barrier $i+1 (third control field).
+    my @lds;
+    for my $i (0 .. 2)
+    {
+        my $b = $i + 1;
+        for my $j (0 .. 5) { push @lds, "--:-:$b:-:1      LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n"; }
+    }
+    return join '', @lds;
++]
+// First transform pass, second index 0-2: fold m0..m5 into w0..w3, then emit the LDS loads for second index 3-5.
+<SCHEDULE_BLOCK>
+[+
+    my $out; our ($trans1, $trans2, $trans3);  # multiplier strings, defined by the kernel file that includes this one
+    foreach my $i (0 .. 2)  # t0 = m1+m2, t1 = m1-m2, t2 = m3-m4, t3 = m3+m4
+    {  # w0 = m0+t0+t3, w1 = trans2*t1 + 1.5*t2, w2 = trans3*t0 + 2.25*t3, w3 = trans1*t1 + 3.375*t2 + m5
+        my $w = sprintf "%02x", 1 << $i;  # wait mask for barrier $i+1, the one set by the m?$i loads
+        $out .= qq{
+<ORDERED>
+$w:-:-:-:1      FADD t0$i, m1$i,  m2$i;
+--:-:-:-:1      FADD t1$i, m1$i, -m2$i;
+--:-:-:-:1      FADD t2$i, m3$i, -m4$i;
+--:-:-:-:1      FADD t3$i, m3$i,  m4$i;
+--:-:-:-:1      FADD w0$i, t0$i,  m0$i;
+--:-:-:-:1      FMUL32I w3$i, t1$i, $trans1;
+--:-:-:-:1      FMUL32I w1$i, t1$i, $trans2;
+--:-:-:-:1      FMUL32I temp, t0$i, $trans3;
+--:-:-:-:1      FFMA w3$i, t2$i,  3.375, w3$i;
+--:-:-:-:1      FFMA w1$i, t2$i,  1.500, w1$i;
+--:-:-:-:1      FFMA w2$i, t3$i,  2.250, temp;
+--:-:-:-:1      FADD w0$i, w0$i,  t3$i;
+--:-:-:-:1      FADD w3$i, w3$i,  m5$i;
+</ORDERED>
+        };
+    }
+    foreach my $i (3 .. 5)  # remaining loads: second index 3-5 set barriers 4-6
+    {
+        foreach my $j (0 .. 5)
+        {
+            my $b = $i + 1;
+            $out .= "--:-:$b:-:1      LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n";
+        }
+    }
+    return $out;
++]
+</SCHEDULE_BLOCK>
+// First transform pass, second index 3-5, followed by unpacking the first 4-bit predicate group from preds.
+<SCHEDULE_BLOCK>
+[+
+    my $out; our ($trans1, $trans2, $trans3);
+    # Same arithmetic as for second index 0-2; the wait masks 08/10/20 correspond to barriers 4-6.
+    foreach my $i (3 .. 5)
+    {
+        my $w = sprintf "%02x", 1 << $i;
+        $out .= qq{
+<ORDERED>
+$w:-:-:-:1      FADD t0$i, m1$i,  m2$i;
+--:-:-:-:1      FADD t1$i, m1$i, -m2$i;
+--:-:-:-:1      FADD t2$i, m3$i, -m4$i;
+--:-:-:-:1      FADD t3$i, m3$i,  m4$i;
+--:-:-:-:1      FADD w0$i, t0$i,  m0$i;
+--:-:-:-:1      FMUL32I w3$i, t1$i, $trans1;
+--:-:-:-:1      FMUL32I w1$i, t1$i, $trans2;
+--:-:-:-:1      FMUL32I temp, t0$i, $trans3;
+--:-:-:-:1      FFMA w3$i, t2$i,  3.375, w3$i;
+--:-:-:-:1      FFMA w1$i, t2$i,  1.500, w1$i;
+--:-:-:-:1      FFMA w2$i, t3$i,  2.250, temp;
+--:-:-:-:1      FADD w0$i, w0$i,  t3$i;
+--:-:-:-:1      FADD w3$i, w3$i,  m5$i;
+</ORDERED>
+        };
+    }
+    return $out;
++]
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f; // k < K: P0-P3 come from the low 4 bits of preds
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f; // k >= K: P0-P3 come from RZ, i.e. all false
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds; // move the next 4-bit group into the low bits
+</SCHEDULE_BLOCK>
+// Second transform pass: fold w?0..w?5 into s?0..s?3 for first index 0-3, then the optional bias add and ReLU clamp.
+<SCHEDULE_BLOCK>
+[+
+    my $out;
+    our ($convert_out, $bias, $relu, $trans1, $trans2, $trans3);  # note: $convert_out is declared but not used in this block
+    foreach my $i (0 .. 3)  # r0 = w1+w2, r1 = w1-w2, r2 = w3-w4, r3 = w3+w4 (second index shown)
+    {  # s0 = w0+r0+r3, s1 = trans2*r1 + 1.5*r2, s2 = trans3*r0 + 2.25*r3, s3 = trans1*r1 + 3.375*r2 + w5
+        $out .= qq{
+--:-:-:-:1      FADD r${i}0, w${i}1,  w${i}2;
+--:-:-:-:1      FADD r${i}1, w${i}1, -w${i}2;
+--:-:-:-:1      FADD r${i}2, w${i}3, -w${i}4;
+--:-:-:-:1      FADD r${i}3, w${i}3,  w${i}4;
+--:-:-:-:1      FADD s${i}0, r${i}0,  w${i}0;
+--:-:-:-:1      FMUL32I s${i}3, r${i}1, $trans1;
+--:-:-:-:1      FMUL32I s${i}1, r${i}1, $trans2;
+--:-:-:-:1      FMUL32I temp,   r${i}0, $trans3;
+--:-:-:-:1      FFMA s${i}3, r${i}2,  3.375, s${i}3;
+--:-:-:-:1      FFMA s${i}1, r${i}2,  1.500, s${i}1;
+--:-:-:-:1      FFMA s${i}2, r${i}3,  2.250, temp;
+--:-:-:-:1      FADD s${i}0, s${i}0,  r${i}3;
+--:-:-:-:1      FADD s${i}3, s${i}3,  w${i}5;
+        };
+        if ($bias)  # add the bias register to all four results; mask 10 waits on barrier 5, which the bias LDG sets
+        {
+            $out .= qq{
+10:-:-:-:1      FADD s${i}0, s${i}0, bias;
+--:-:-:-:1      FADD s${i}1, s${i}1, bias;
+--:-:-:-:1      FADD s${i}2, s${i}2, bias;
+--:-:-:-:1      FADD s${i}3, s${i}3, bias;};
+        }
+        if ($relu)  # clamp at zero: FMNMX with !PT is the "maximum" form (see the prelu comment in this file)
+        {
+            $out .= qq{
+--:-:-:-:1      FMNMX s${i}0, s${i}0, RZ, !PT;
+--:-:-:-:1      FMNMX s${i}1, s${i}1, RZ, !PT;
+--:-:-:-:1      FMNMX s${i}2, s${i}2, RZ, !PT;
+--:-:-:-:1      FMNMX s${i}3, s${i}3, RZ, !PT;};
+        }
+    }
+    return $out;
++]
+</SCHEDULE_BLOCK>
+<SCHEDULE_BLOCK>
+[+
+    our $prelu; my $out;  # $out stays undef (block returns undef) when $prelu is off
+    if ($prelu)  # s = max(s, 0) + param_beta * min(s, 0) for each of the 16 results
+    {
+        foreach my $i (0 .. 3)  # b00-b03 and b10-b13 are scratch registers reused for every $i
+        {
+            $out .= qq{
+// maximum(x, 0) + beta * minimum(0, x)
+--:-:-:-:1      FMNMX b00, s${i}0, RZ, !PT;
+--:-:-:-:1      FMNMX b01, s${i}1, RZ, !PT;
+--:-:-:-:1      FMNMX b02, s${i}2, RZ, !PT;
+--:-:-:-:1      FMNMX b03, s${i}3, RZ, !PT;
+
+--:-:-:-:1      FMNMX b10, s${i}0, RZ, PT;
+--:-:-:-:1      FMNMX b11, s${i}1, RZ, PT;
+--:-:-:-:1      FMNMX b12, s${i}2, RZ, PT;
+--:-:-:-:1      FMNMX b13, s${i}3, RZ, PT;
+
+--:-:-:-:1      FFMA s${i}0, b10, param_beta, b00;
+--:-:-:-:1      FFMA s${i}1, b11, param_beta, b01;
+--:-:-:-:1      FFMA s${i}2, b12, param_beta, b02;
+--:-:-:-:1      FFMA s${i}3, b13, param_beta, b03;
+            };
+        }
+    }
+    return $out;
++]
+</SCHEDULE_BLOCK>
+[+
+    our ($beta, $brelu, $bprelu, $dtype, $dsize, $dshift, $convert_out, $Q, $N);
+    my $out;
+    if ($beta || $brelu || $bprelu)  # each of these options needs the values b00..b33 loaded from param_X
+    {
+        my $preds = $beta ? q{
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+        } : '';  # only the beta path restores P0-P3 here; the brelu/bprelu path does it after its FSETPs
+        # Four load groups step through preds 4 bits at a time; the SHF.L by 12 in the third group undoes the 4-bit right shifts issued so far.
+        $out .= qq{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LEA      Out0.CC, offsetO, param_X[0],     $dshift;
+--:-:-:-:1      LEA.HI.X Out1,    offsetO, param_X[1], RZ, $dshift;
+
+--:-:-:-:1  \@P0 LDG.E.CG.$dtype b00, [Out + ${dsize}x<0*$Q*$N + 0*$N>];
+--:-:-:-:1  \@P1 LDG.E.CG.$dtype b01, [Out + ${dsize}x<0*$Q*$N + 1*$N>];
+--:-:-:-:1  \@P2 LDG.E.CG.$dtype b02, [Out + ${dsize}x<0*$Q*$N + 2*$N>];
+--:-:1:-:1  \@P3 LDG.E.CG.$dtype b03, [Out + ${dsize}x<0*$Q*$N + 3*$N>];
+--:-:-:-:1 \@!P0 MOV b00, RZ;
+--:-:-:-:1 \@!P1 MOV b01, RZ;
+--:-:-:-:1 \@!P2 MOV b02, RZ;
+--:-:-:-:1 \@!P3 MOV b03, RZ;
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+
+--:-:-:-:1  \@P0 LDG.E.CG.$dtype b10, [Out + ${dsize}x<1*$Q*$N + 0*$N>];
+--:-:-:-:1  \@P1 LDG.E.CG.$dtype b11, [Out + ${dsize}x<1*$Q*$N + 1*$N>];
+--:-:-:-:1  \@P2 LDG.E.CG.$dtype b12, [Out + ${dsize}x<1*$Q*$N + 2*$N>];
+--:-:2:-:1  \@P3 LDG.E.CG.$dtype b13, [Out + ${dsize}x<1*$Q*$N + 3*$N>];
+--:-:-:-:1 \@!P0 MOV b10, RZ;
+--:-:-:-:1 \@!P1 MOV b11, RZ;
+--:-:-:-:1 \@!P2 MOV b12, RZ;
+--:-:-:-:1 \@!P3 MOV b13, RZ;
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+
+--:-:-:-:1  \@P0 LDG.E.CG.$dtype b20, [Out + ${dsize}x<2*$Q*$N + 0*$N>];
+--:-:-:-:1  \@P1 LDG.E.CG.$dtype b21, [Out + ${dsize}x<2*$Q*$N + 1*$N>];
+--:-:-:-:1  \@P2 LDG.E.CG.$dtype b22, [Out + ${dsize}x<2*$Q*$N + 2*$N>];
+--:-:3:-:1  \@P3 LDG.E.CG.$dtype b23, [Out + ${dsize}x<2*$Q*$N + 3*$N>];
+--:-:-:-:1 \@!P0 MOV b20, RZ;
+--:-:-:-:1 \@!P1 MOV b21, RZ;
+--:-:-:-:1 \@!P2 MOV b22, RZ;
+--:-:-:-:1 \@!P3 MOV b23, RZ;
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.L.U64 preds, preds, 12, preds;
+
+--:-:-:-:1  \@P0 LDG.E.CG.$dtype b30, [Out + ${dsize}x<3*$Q*$N + 0*$N>];
+--:-:-:-:1  \@P1 LDG.E.CG.$dtype b31, [Out + ${dsize}x<3*$Q*$N + 1*$N>];
+--:-:-:-:1  \@P2 LDG.E.CG.$dtype b32, [Out + ${dsize}x<3*$Q*$N + 2*$N>];
+--:-:4:-:1  \@P3 LDG.E.CG.$dtype b33, [Out + ${dsize}x<3*$Q*$N + 3*$N>];
+--:-:-:-:1 \@!P0 MOV b30, RZ;
+--:-:-:-:1 \@!P1 MOV b31, RZ;
+--:-:-:-:1 \@!P2 MOV b32, RZ;
+--:-:-:-:1 \@!P3 MOV b33, RZ;$preds
+</SCHEDULE_BLOCK>};
+        # With $convert_out the loaded values are converted in place by F2F.F32.F16; each group waits on its LDG barrier and sets it again.
+        if ($convert_out)
+        {
+            $out .= q{
+01:-:-:-:1      F2F.F32.F16 b00, b00;
+--:-:-:-:1      F2F.F32.F16 b01, b01;
+--:-:-:-:1      F2F.F32.F16 b02, b02;
+--:-:1:-:1      F2F.F32.F16 b03, b03;
+02:-:-:-:1      F2F.F32.F16 b10, b10;
+--:-:-:-:1      F2F.F32.F16 b11, b11;
+--:-:-:-:1      F2F.F32.F16 b12, b12;
+--:-:2:-:1      F2F.F32.F16 b13, b13;
+04:-:-:-:1      F2F.F32.F16 b20, b20;
+--:-:-:-:1      F2F.F32.F16 b21, b21;
+--:-:-:-:1      F2F.F32.F16 b22, b22;
+--:-:3:-:1      F2F.F32.F16 b23, b23;
+08:-:-:-:1      F2F.F32.F16 b30, b30;
+--:-:-:-:1      F2F.F32.F16 b31, b31;
+--:-:-:-:1      F2F.F32.F16 b32, b32;
+--:-:4:-:1      F2F.F32.F16 b33, b33;};
+        }
+    }
+    return $out;
++]
+// Optional epilogue on s00..s33: beta blend with the loaded b?? values, or brelu/bprelu scaling driven by the sign of b??.
+<SCHEDULE_BLOCK>
+[+
+    our $beta; return $beta ? q{
+01:-:-:-:1      FFMA s00, b00, param_beta, s00;
+--:-:-:-:1      FFMA s01, b01, param_beta, s01;
+--:-:-:-:1      FFMA s02, b02, param_beta, s02;
+--:-:-:-:1      FFMA s03, b03, param_beta, s03;
+02:-:-:-:1      FFMA s10, b10, param_beta, s10;
+--:-:-:-:1      FFMA s11, b11, param_beta, s11;
+--:-:-:-:1      FFMA s12, b12, param_beta, s12;
+--:-:-:-:1      FFMA s13, b13, param_beta, s13;
+04:-:-:-:1      FFMA s20, b20, param_beta, s20;
+--:-:-:-:1      FFMA s21, b21, param_beta, s21;
+--:-:-:-:1      FFMA s22, b22, param_beta, s22;
+--:-:-:-:1      FFMA s23, b23, param_beta, s23;
+08:-:-:-:1      FFMA s30, b30, param_beta, s30;
+--:-:-:-:1      FFMA s31, b31, param_beta, s31;
+--:-:-:-:1      FFMA s32, b32, param_beta, s32;
+--:-:-:-:1      FFMA s33, b33, param_beta, s33;} : '';  # beta path: s = b * param_beta + s, each group of four waiting on barrier 1-4
++]
+[+
+    our ($brelu, $bprelu); my $out;  # $out stays undef (block returns undef) when neither option is set
+    if ($brelu || $bprelu)
+    {
+        foreach my $i (0 .. 3)
+        {
+            my $w = sprintf "%02x", 1 << $i;  # wait mask for barrier $i+1, set by the loads/conversions of the matching b registers
+            $out .= $brelu ? qq{
+//delta *= (x > 0)
+$w:-:-:-:1      FSETP.GT.AND P0, PT, b${i}0, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b${i}1, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b${i}2, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b${i}3, RZ, PT;
+--:-:-:-:1 \@!P0 MOV s${i}0, RZ;
+--:-:-:-:1 \@!P1 MOV s${i}1, RZ;
+--:-:-:-:1 \@!P2 MOV s${i}2, RZ;
+--:-:-:-:1 \@!P3 MOV s${i}3, RZ;
+            } : qq{
+//delta *= ((x > 0) + slope * (x < 0))
+$w:-:-:-:1      FSETP.GT.AND P0, PT, b${i}0, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b${i}1, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b${i}2, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b${i}3, RZ, PT;
+--:-:-:-:1      SEL xx0, one, RZ, P0;
+--:-:-:-:1      SEL xx1, one, RZ, P1;
+--:-:-:-:1      SEL xx2, one, RZ, P2;
+--:-:-:-:1      SEL xx3, one, RZ, P3;
+--:-:-:-:1      FSETP.LT.AND P0, PT, b${i}0, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P1, PT, b${i}1, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, b${i}2, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P3, PT, b${i}3, RZ, PT;
+--:-:-:-:1      SEL b${i}0, one, RZ, P0;
+--:-:-:-:1      SEL b${i}1, one, RZ, P1;
+--:-:-:-:1      SEL b${i}2, one, RZ, P2;
+--:-:-:-:1      SEL b${i}3, one, RZ, P3;
+--:-:-:-:1      FFMA b${i}0, b${i}0, param_beta, xx0;
+--:-:-:-:1      FFMA b${i}1, b${i}1, param_beta, xx1;
+--:-:-:-:1      FFMA b${i}2, b${i}2, param_beta, xx2;
+--:-:-:-:1      FFMA b${i}3, b${i}3, param_beta, xx3;
+--:-:-:-:1      FMUL s${i}0, s${i}0, b${i}0;
+--:-:-:-:1      FMUL s${i}1, s${i}1, b${i}1;
+--:-:-:-:1      FMUL s${i}2, s${i}2, b${i}2;
+--:-:-:-:1      FMUL s${i}3, s${i}3, b${i}3;
+            };
+        }  # the FSETPs above overwrote P0-P3, so they are reloaded from preds next
+        $out .= q{
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:5 @!P5 R2P PR, RZ, 0x0f;
+--:-:-:-:5      SHF.R.U64 preds, preds, 4, preds;
+};
+    }
+    return $out;
++]
+</SCHEDULE_BLOCK>
+[+
+    our $bsum; my $out;  # $out stays undef (block returns undef) when $bsum is off
+    if ($bsum)  # build the Sum0/Sum1 pair from param_S and zero four partial sums, then add up the predicated s?? values
+    {
+        $out = q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      XMAD.LO2C bias, k, param_gridPQN, bsum_offset;
+--:-:-:-:1      LEA      Sum0.CC, bias, param_S[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    bias, param_S[1], RZ, 2;
+--:-:-:-:1      MOV  sum0, RZ;
+--:-:-:-:1      MOV  sum1, RZ;
+--:-:-:-:1      MOV  sum2, RZ;
+--:-:-:-:1      MOV  sum3, RZ;};
+        foreach my $i (0 .. 3)  # add group $i under the current P0-P3, then load P0-P3 for the next group
+        {
+            my ($dir, $amt) = $i == 2 ? ('L','12') : ('R','4');  # the third step shifts left by 12 instead of right by 4
+            $out .= qq{
+--:-:-:-:1  \@P0 FADD sum0, sum0, s${i}0;
+--:-:-:-:1  \@P1 FADD sum1, sum1, s${i}1;
+--:-:-:-:1  \@P2 FADD sum2, sum2, s${i}2;
+--:-:-:-:1  \@P3 FADD sum3, sum3, s${i}3;
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.$dir.U64 preds, preds, $amt, preds;};
+        }
+        $out .= q{
+--:-:-:-:1      FADD sum0, sum0, sum1;
+--:-:-:-:1      FADD sum2, sum2, sum3;
+--:-:-:-:1      FADD sum0, sum0, sum2;
+</SCHEDULE_BLOCK>};
+    }
+    return $out;
++]
+[+
+    our $convert_out; return $convert_out ? q{
+--:-:-:-:1      F2F.F16.F32 s00, s00;
+--:-:-:-:1      F2F.F16.F32 s01, s01;
+--:-:-:-:1      F2F.F16.F32 s02, s02;
+--:-:1:-:1      F2F.F16.F32 s03, s03;
+--:-:-:-:1      F2F.F16.F32 s10, s10;
+--:-:-:-:1      F2F.F16.F32 s11, s11;
+--:-:-:-:1      F2F.F16.F32 s12, s12;
+--:-:2:-:1      F2F.F16.F32 s13, s13;
+--:-:-:-:1      F2F.F16.F32 s20, s20;
+--:-:-:-:1      F2F.F16.F32 s21, s21;
+--:-:-:-:1      F2F.F16.F32 s22, s22;
+--:-:3:-:1      F2F.F16.F32 s23, s23;
+--:-:-:-:1      F2F.F16.F32 s30, s30;
+--:-:-:-:1      F2F.F16.F32 s31, s31;
+--:-:-:-:1      F2F.F16.F32 s32, s32;
+--:-:4:-:1      F2F.F16.F32 s33, s33;} : '';  # optional in-place F2F.F16.F32 of s00..s33; the last conversion of each group of four sets barrier 1-4
++]
+// Store s00..s33 to param_O under P0-P3, one group of four per 4-bit predicate group; the bsum variant interleaves a SHFL.BFLY reduction of sum0.
+[+
+    our ($bsum, $dtype, $dsize, $dshift, $Q, $N);  # the first branch is hand-scheduled; the second leaves ordering to an ORDERED schedule block
+    return $bsum ? qq{
+--:-:-:Y:6      LEA      Out0.CC, offsetO, param_O[0],     $dshift;
+--:-:-:-:0      LEA.HI.X Out1,    offsetO, param_O[1], RZ, $dshift;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  1, 0x1f;
+01:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 0*$N>], s00;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 1*$N>], s01;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 2*$N>], s02;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 3*$N>], s03;
+--:-:-:-:2  \@P5 R2P PR, preds, 0x0f;
+--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f;
+
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:-:-:0      SHF.R.U64 preds, preds, 4, preds;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 2, 0x1f;
+
+02:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 0*$N>], s10;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 1*$N>], s11;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 2*$N>], s12;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 3*$N>], s13;
+--:-:-:-:2  \@P5 R2P PR, preds, 0x0f;
+--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f;
+
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:-:-:0      SHF.R.U64 preds, preds, 4, preds;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 4, 0x1f;
+
+04:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 0*$N>], s20;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 1*$N>], s21;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 2*$N>], s22;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 3*$N>], s23;
+--:-:-:-:2  \@P5 R2P PR, preds, 0x0f;
+--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f;
+
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:-:-:0      SHF.L.U64 preds, preds, 12, preds;
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 8, 0x1f;
+
+08:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 0*$N>], s30;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 1*$N>], s31;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 2*$N>], s32;
+--:1:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 3*$N>], s33;
+
+10:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:-:-:0      PSETP.AND.AND P5, PT, P5, P6, PT; // k < K && tid31 == 0
+--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 16, 0x1f;
+10:-:-:-:2      FADD sum0, sum1, sum0;
+--:5:-:-:1  \@P5 STG.E.CG [Sum], sum0;
+    } : qq{
+<SCHEDULE_BLOCK>
+<ORDERED>
+--:-:-:-:1      LEA      Out0.CC, offsetO, param_O[0],     $dshift;
+--:-:-:-:1      LEA.HI.X Out1,    offsetO, param_O[1], RZ, $dshift;
+
+01:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 0*$N>], s00;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 1*$N>], s01;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 2*$N>], s02;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 3*$N>], s03;
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+
+02:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 0*$N>], s10;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 1*$N>], s11;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 2*$N>], s12;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 3*$N>], s13;
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
+
+04:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 0*$N>], s20;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 1*$N>], s21;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 2*$N>], s22;
+--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 3*$N>], s23;
+
+--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
+--:-:-:-:1      SHF.L.U64 preds, preds, 12, preds;
+
+08:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 0*$N>], s30;
+--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 1*$N>], s31;
+--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 2*$N>], s32;
+--:1:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 3*$N>], s33;
+</ORDERED>
+</SCHEDULE_BLOCK>
+    };
++]
+
+--:-:-:-:5      RET;
diff --git a/Kernel/Convolution/Pascal/xconv_xprop_common.sass b/Kernel/Convolution/Pascal/xconv_xprop_common.sass
new file mode 100644
index 0000000..110dc4d
--- /dev/null
+++ b/Kernel/Convolution/Pascal/xconv_xprop_common.sass
@@ -0,0 +1,841 @@
+
+
+[-
+    # Kernel Options:
+    # Switches tested by output_setup() and output() to decide which optional
+    # epilogue stages are emitted.
+    our ($beta, $bias, $relu, $prelu, $brelu, $bprelu, $bsum);
+
+    # set externally
+    # $prefix drives $addr_shift and $half below; $prop picks the LUT builder
+    # in load_lut(); $shareI/$shareF/$stepI/$stepF are spliced into the
+    # shared-memory read offsets by loop_setup() and main_loop(); $remapI and
+    # $remapF are consulted by output().
+    our ($prefix, $prop, $shareI, $shareF, $stepI, $stepF, $remapI, $remapF);
+
+    # Shift operand handed to the LEA address computations: 2 for the 's'
+    # prefix, 1 otherwise (presumably log2 of the element size in bytes --
+    # TODO confirm against the kernels that include this file).
+    our $addr_shift = $prefix eq 's' ? 2 : 1;
+    # True only for the 'h' prefix; selects the F16 conversion paths in output().
+    our $half = $prefix eq 'h';
+
+    # Returns the kernel parameter table as literal text: one
+    # "name : c[0x0][offset]" line per parameter, with offsets running from
+    # 0x140 to 0x240 in 4-byte steps. The heredoc is single-quoted, so nothing
+    # inside it is interpolated. Takes no arguments.
+    sub params
+    {
+        return <<'EOF';
+    param_Sum[0]       : c[0x0][0x140]
+    param_Sum[1]       : c[0x0][0x144]
+    param_X[0]         : c[0x0][0x148]
+    param_X[1]         : c[0x0][0x14c]
+    param_O[0]         : c[0x0][0x150]
+    param_O[1]         : c[0x0][0x154]
+    param_I[0]         : c[0x0][0x158]
+    param_I[1]         : c[0x0][0x15c]
+    param_F[0]         : c[0x0][0x160]
+    param_F[1]         : c[0x0][0x164]
+    param_alpha        : c[0x0][0x168]
+    param_beta         : c[0x0][0x16c]
+    param_flags        : c[0x0][0x170]
+    param_N            : c[0x0][0x174]
+    param_K            : c[0x0][0x178]
+    param_D            : c[0x0][0x17c]
+    param_H            : c[0x0][0x180]
+    param_W            : c[0x0][0x184]
+    param_WN           : c[0x0][0x188]
+    param_HWN          : c[0x0][0x18c]
+    param_DHWN         : c[0x0][0x190]
+    param_C            : c[0x0][0x194]
+    param_KRST         : c[0x0][0x198]
+    param_RST          : c[0x0][0x19c]
+    param_RS           : c[0x0][0x1a0]
+    param_T            : c[0x0][0x1a4]
+    param_R            : c[0x0][0x1a8]
+    param_S            : c[0x0][0x1ac]
+    param_magic_RS     : c[0x0][0x1b0]
+    param_shift_RS     : c[0x0][0x1b4]
+    param_magic_S      : c[0x0][0x1b8]
+    param_shift_S      : c[0x0][0x1bc]
+    param_pad_d        : c[0x0][0x1c0]
+    param_pad_h        : c[0x0][0x1c4]
+    param_pad_w        : c[0x0][0x1c8]
+    param_str_d        : c[0x0][0x1cc]
+    param_str_h        : c[0x0][0x1d0]
+    param_str_w        : c[0x0][0x1d4]
+    param_dil_d        : c[0x0][0x1d8]
+    param_dil_h        : c[0x0][0x1dc]
+    param_dil_w        : c[0x0][0x1e0]
+    param_P2           : c[0x0][0x1e4]
+    param_Q            : c[0x0][0x1e8]
+    param_PQk          : c[0x0][0x1ec]
+    param_Qk           : c[0x0][0x1f0]
+    param_k            : c[0x0][0x1f4]
+    param_magic_PQk    : c[0x0][0x1f8]
+    param_shift_PQk    : c[0x0][0x1fc]
+    param_magic_Qk     : c[0x0][0x200]
+    param_shift_Qk     : c[0x0][0x204]
+    param_magic_k      : c[0x0][0x208]
+    param_shift_k      : c[0x0][0x20c]
+    param_QN           : c[0x0][0x210]
+    param_PQN          : c[0x0][0x214]
+    param_MPQN         : c[0x0][0x218]
+    param_gridN        : c[0x0][0x21c]
+    param_gridQN       : c[0x0][0x220]
+    param_gridPQN      : c[0x0][0x224]
+    param_gridMPQN     : c[0x0][0x228]
+    param_magic_str_d  : c[0x0][0x22c]
+    param_shift_str_d  : c[0x0][0x230]
+    param_magic_str_h  : c[0x0][0x234]
+    param_shift_str_h  : c[0x0][0x238]
+    param_magic_str_w  : c[0x0][0x23c]
+    param_shift_str_w  : c[0x0][0x240]
+EOF
+    }
+
+    # Returns the instruction text that splits the flat index idx_MPQk into
+    # idx_M, idx_P2/idx_P, idx_Q2/idx_Q and idx_k (which is folded into idx_K),
+    # using the magic/shift parameters for each division. When a magic value
+    # equals 1 the predicated multiply is skipped and only the shift is applied.
+    # The tail remaps (idx_P2, idx_Q2) to (idx_P, idx_Q) and reverses idx_Q on
+    # odd idx_P2; it also loads negOne with -1.
+    # The heredoc is single-quoted, so nothing inside it is interpolated.
+    #
+    # NOTE(review): the emitted comment on the first remainder reads
+    # "idx_PQk = idx_PQk % blk_Qk", but the instructions below it compute
+    # idx_MPQk - idx_M * param_PQk, i.e. the remainder of the idx_M division.
+    sub get_mpqk
+    {
+        return <<'EOF';
+// idx_M = idx_MPQk / blk_PQk
+--:-:-:-:1      MOV  magic_PQk, param_magic_PQk;
+--:-:-:-:1      ISETP.NE.AND P1, PT,   magic_PQk, 1, PT;
+02:-:-:-:1  @P1 XMAD     div1, idx_MPQk,    magic_PQk,    RZ;
+--:-:-:-:1  @P1 XMAD     div2, idx_MPQk,    magic_PQk.H1, RZ;
+--:-:-:-:1  @P1 XMAD     div3, idx_MPQk.H1, magic_PQk.H1, RZ;
+--:-:-:-:1  @P1 XMAD.CHI div1, idx_MPQk.H1, magic_PQk,    div1;
+--:-:-:-:1  @P1 IADD3.RS idx_M, div1, div2, div3;
+--:-:-:-:1  @P1 SHR.U32  idx_M, idx_M,    param_shift_PQk;
+--:-:-:-:1 @!P1 SHR.U32  idx_M, idx_MPQk, param_shift_PQk;
+
+// idx_PQk = idx_PQk % blk_Qk
+--:-:-:-:1      IADD neg_PQk, RZ, -param_PQk;
+--:-:-:-:1      XMAD.LO2 idx_PQk, neg_PQk, idx_M, idx_MPQk;
+
+// idx_P2 = idx_PQk / blk_Qk
+--:-:-:-:1      MOV  magic_Qk, param_magic_Qk;
+--:-:-:-:1      ISETP.NE.AND P2, PT,  magic_Qk, 1, PT;
+--:-:-:-:1  @P2 XMAD     div1, idx_PQk,    magic_Qk,    RZ;
+--:-:-:-:1  @P2 XMAD     div2, idx_PQk,    magic_Qk.H1, RZ;
+--:-:-:-:1  @P2 XMAD     div3, idx_PQk.H1, magic_Qk.H1, RZ;
+--:-:-:-:1  @P2 XMAD.CHI div1, idx_PQk.H1, magic_Qk,    div1;
+--:-:-:-:1  @P2 IADD3.RS idx_P2, div1, div2, div3;
+--:-:-:-:1  @P2 SHR.U32  idx_P2, idx_P2,  param_shift_Qk;
+--:-:-:-:1 @!P2 SHR.U32  idx_P2, idx_PQk, param_shift_Qk;
+
+// idx_Qk = idx_PQk % blk_Qk
+--:-:-:-:1      IADD neg_Qk, RZ, -param_Qk;
+--:-:-:-:1      XMAD.LO2 idx_Qk, neg_Qk, idx_P2, idx_PQk;
+
+// idx_Q2  = idx_Qk / k
+--:-:-:-:1      XMAD.LO2C idx_Q2, idx_Qk, param_magic_k, RZ;
+--:-:-:-:1      SHR.U32   idx_Q2, idx_Q2, param_shift_k;
+// idx_k = idx_Qk % k
+--:-:-:-:1      IADD neg_k, RZ, -param_k;
+--:-:-:-:1      XMAD.S16.U16  idx_k, neg_k, idx_Q2, idx_Qk;
+
+// idx_K = idx_K * blk_k + idx_k
+04:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
+
+// Implement a square wave block id remapping (for all but last row (if odd number of rows))
+// idx_P = idx_P2 * 2
+// idx_Q = idx_Q2
+// if idx_P2 != gridP2:
+//     idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1)
+//     idx_Q  = idx_Q2 >> 1
+--:-:-:-:1      ISETP.NE.AND P1, PT, idx_P2, param_P2, PT;
+--:-:-:-:1      SHL idx_P, idx_P2, 1;
+--:-:-:-:1  @P1 LOP.AND q1, idx_Q2, 1;
+--:-:-:-:1  @P1 BFE.U32 q2, idx_Q2, 0x101; // 1 bit at position 1
+--:-:-:-:1  @P1 LOP.XOR q1, q1, q2;
+--:-:-:-:1  @P1 IADD idx_P, idx_P, q1;
+--:-:-:-:1  @P1 SHR.U32 idx_Q, idx_Q2, 1;
+--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2;
+
+// Scan backwards on odd rows
+// if idx_P2 & 1:
+//     idx_Q = Q - idx_Q - 1
+--:-:-:-:1      LOP.AND.NZ P2, RZ, idx_P2, 1;
+--:-:-:-:1      MOV negOne, -1;
+--:-:-:-:1  @P2 IADD3 idx_Q, -idx_Q, param_Q, negOne;
+
+EOF
+    }
+
+    # Returns the text for one STS.128 of RZ to [addr_zero] followed by sixteen
+    # LDS.U.128 loads from the same address into czero00, czero04, ... czero60.
+    sub load_zeros
+    {
+        my @lines = ("--:-:-:-:1      STS.128 [addr_zero], RZ;\n");
+        foreach my $quad (0 .. 15)
+        {
+            # Each 128-bit load is named after its first register, hence the step of 4.
+            push @lines, sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", 4 * $quad);
+        }
+        return join '', @lines;
+    }
+
+    # Returns the prologue text shared by fprop_lut() and bprop_lut(): a branch
+    # to END_SETUP when P0 is set, then the start of a SCHEDULE_BLOCK that
+    # stores mpqk and initialises rst, lutStore2, lutSize, warp_count and
+    # dep_thd_mask. The SCHEDULE_BLOCK is deliberately left open; the caller's
+    # own text closes it. negOne is read here but not set -- get_mpqk()'s text
+    # loads it with -1, so presumably that text is emitted first (verify in
+    # the including kernel).
+    sub begin_lut
+    {
+        return <<'EOF';
+--:-:-:-:5  @P0 BRA.U END_SETUP;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      STS.128 [addr_mpqk], mpqk;
+
+--:-:-:-:1      MOV rst,        tid;
+--:-:-:-:1      MOV lutStore2,  RZ;
+--:-:-:-:1      MOV lutSize,    RZ;
+--:-:-:-:1      MOV warp_count, 32;
+
+--:-:-:-:1      IADD    mask_shr, -tid, 32;
+--:-:-:-:1      SHR.U32 dep_thd_mask, negOne, mask_shr;
+EOF
+    }
+
+    # Returns the epilogue text shared by fprop_lut() and bprop_lut(). The
+    # heredoc is a sprintf template with a single argument: every %1$s is
+    # replaced by $addr_shift (the shift operand of the trailing LEA pairs).
+    #
+    # The text, in order:
+    #   * finishes the LUT_LOOP body opened by the caller: ballots P1 across
+    #     the warp, derives each thread's slot from the valid slices below it,
+    #     stores sliceIF into the lut and accumulates lutSize;
+    #   * loops back to LUT_LOOP while P0 is set, then publishes lutSize;
+    #   * after END_SETUP / BAR.SYNC, reloads lutSize, computes its reciprocal,
+    #     and derives the first posCRST, channel, lutOffset and track addresses.
+    # It closes the SCHEDULE_BLOCK that the caller's text left open.
+    #
+    # NOTE(review): the emitted comment says "offsetFC = channel * K" but the
+    # instruction multiplies channel by param_KRST.
+    sub end_lut
+    {
+        return sprintf <<'EOF', $addr_shift;
+<ORDERED>
+// Get a mask of all valid slices in the warp
+--:-:-:-:1      VOTE.ANY ballot, PT, P1;
+// Count the total valid slices
+--:-:2:-:1      POPC warp_slices, ballot;
+// Prepare lutStore for this and next loop
+--:-:-:-:1  @P1 MOV    lutStore, lutStore2;
+02:-:-:-:1      ISCADD lutStore2, warp_slices, lutStore2, 3;
+// Count all the valid slices below this threadid
+--:-:-:-:1  @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
+--:-:3:-:1  @P1 POPC dep_thd_cnt, dep_thd_bits;
+// use the rst increment to space the barrier sync
+--:-:-:-:1      IADD rst, rst, 32;
+// Update the lutStore address from this count
+04:-:-:-:1  @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
+// Store both slice offsets in the lut
+--:1:-:-:1  @P1 STS.64 [lutStore + addr_lut], sliceIF;
+</ORDERED>
+// Keep track of the total size of the lut
+--:-:-:-:1      IADD lutSize, lutSize, warp_slices;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5  @P0 BRA.U LUT_LOOP;
+
+// Share the lut size with the other warp
+--:1:-:-:2      STS [addr_szLut], lutSize;
+
+END_SETUP:
+
+01:-:-:-:5      BAR.SYNC 0;
+
+// Grab the caclulated lut size and get it's reciprical
+// Get the total reduction depth
+--:-:1:-:2      LDS lutSize, [addr_szLut];
+01:-:-:-:0      XMAD endCRST, lutSize, param_C, RZ;
+--:-:1:-:2      I2F.F32.S32 lutSizeRcp, lutSize;
+01:-:1:-:1      MUFU.RCP lutSizeRcp, lutSizeRcp;
+
+<SCHEDULE_BLOCK>
+// lutSize != 0
+--:-:-:-:1      LOP.AND.NZ P0, RZ, lutSize, -1;
+// posCRST = endCRST - tidY - 1
+--:-:-:-:1      IADD3 posCRST, endCRST, -1, -tidY;
+// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 8 then make a full 8 line fetch.
+--:-:-:-:1      LOP.AND.Z P1, partial, endCRST, 7;
+--:-:-:-:1  @P1 MOV partial, 8;
+// channel = posCRST / lutSize
+// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it
+--:-:2:-:1      I2F.F32.S32 posCRSTf, posCRST;
+03:-:-:-:1      FMUL channel, posCRSTf, lutSizeRcp;
+--:-:-:-:1      FFMA channel, channel, 5.9604644775390625e-08, channel;
+--:-:2:-:1      F2I.S32.F32.TRUNC channel, channel;
+// lutOffset = (posCRST % lutSize) * 8
+02:-:-:-:1      VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;
+--:-:-:-:1      SHL lutOffset, lutOffset, 3;
+// P1 = tidY < partial &&
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY, partial, P0;
+// offsetIC = channel * DHWN
+// offsetFC = channel * K
+--:-:-:-:1      XMAD.LO2C offsetIc, channel, param_DHWN, RZ;
+--:-:-:-:1      XMAD      offsetFc, channel, param_KRST, RZ;
+// posCRST -= partial
+--:-:-:-:1      IADD posCRST, posCRST, -partial;
+--:-:1:-:2  @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];
+</SCHEDULE_BLOCK>
+
+// trackI = offsetIN + offsetIC + sliceI + param_I
+// trackF = offsetFK + offsetFC + sliceF + param_F
+01:-:-:-:1  @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;
+--:-:-:-:5  @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;
+--:-:-:-:6  @P1 LEA      trackF0.CC, offsetF, param_F[0],     %1$s;
+--:-:-:-:1  @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, %1$s;
+--:-:-:-:6  @P1 LEA      trackI0.CC, offsetI, param_I[0],     %1$s;
+--:-:-:-:0  @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, %1$s;
+EOF
+    }
+
+    # Returns the forward-prop LUT construction text, assembled as
+    # begin_lut() . <text below> . end_lut().
+    #
+    # The middle text first scales q/p/m by the strides and subtracts the
+    # pads (qs/pr/mt), closing the SCHEDULE_BLOCK that begin_lut() opened. It
+    # then defines the LUT_LOOP label and, per iteration, splits rst into
+    # t/r/s via magic/shift division, forms x/y/z with the dilations, sets P1
+    # when all three lie in [0, W/H/D) and rst < RST, and computes sliceI and
+    # sliceF. The SCHEDULE_BLOCK opened after LUT_LOOP is closed by end_lut().
+    sub fprop_lut
+    {
+        return begin_lut() . <<'EOF' . end_lut();
+// mt = m * w - pad_d
+// pr = p * u - pad_h
+// qs = q * v - pad_w
+--:-:-:-:1      XMAD qs, q,   param_str_w, RZ;
+--:-:-:-:1      XMAD pr, p,   param_str_h, RZ;
+--:-:-:-:1      XMAD mt, m,   param_str_d, RZ;
+--:-:-:-:1      IADD qs, qs, -param_pad_w;
+--:-:-:-:1      IADD pr, pr, -param_pad_h;
+--:-:-:-:1      IADD mt, mt, -param_pad_d;
+</SCHEDULE_BLOCK>
+
+LUT_LOOP:
+
+<SCHEDULE_BLOCK>
+// warp synchronous loop while warp_count < RST
+--:-:-:-:1      ISETP.LT.AND P0, PT, warp_count, param_RST, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, rst, param_RST, PT;
+
+--:-:-:-:1      IADD warp_count, warp_count, 32;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// x = qs + (s * dil_w)
+// y = pr + (r * dil_h)
+// z = mt + (t * dil_d)
+--:-:-:-:1      XMAD x, s, param_dil_w, qs;
+--:-:-:-:1      XMAD y, r, param_dil_h, pr;
+--:-:-:-:1      XMAD z, t, param_dil_d, mt;
+--:-:-:-:1      ISETP.GE.AND  P4, PT, x, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P5, PT, y, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P6, PT, z, RZ, P6;
+--:-:-:-:1      ISETP.LT.AND  P4, PT, x, param_W, P4;
+--:-:-:-:1      ISETP.LT.AND  P5, PT, y, param_H, P5;
+--:-:-:-:1      ISETP.LT.AND  P6, PT, z, param_D, P6;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P4, P5, P6;
+
+<ORDERED>
+// sliceI = z*HWN + y*WN + x*N
+01:-:-:-:1      XMAD      sliceI, x, param_N,   RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y, param_WN,  sliceI;
+--:-:-:-:1      XMAD.LO2C sliceI, z, param_HWN, sliceI;
+// sliceF = rst * K
+--:-:-:-:1      XMAD sliceF, rst, param_K, RZ;
+</ORDERED>
+EOF
+    }
+
+    # Returns the backward-prop LUT construction text, assembled as
+    # begin_lut() . <text below> . end_lut().
+    #
+    # Differences from fprop_lut(): q/p/m are only offset by the pads (no
+    # stride multiply), and after x/y/z are formed each is divided by its
+    # stride via magic/shift. P1 is set only when the original coordinate is
+    # non-negative, the remainder is zero, the quotient is below W/H/D and
+    # rst < RST. sliceI is built from the quotients (x_prime/y_prime/z_prime).
+    # The text closes begin_lut()'s SCHEDULE_BLOCK and leaves the one opened
+    # after LUT_LOOP for end_lut() to close.
+    #
+    # NOTE(review): the emitted comment says "sliceF = rst_prime * K" but the
+    # instruction multiplies the plain rst register by param_K.
+    sub bprop_lut
+    {
+        return begin_lut() . <<'EOF' . end_lut();
+--:-:-:-:1      MOV str_d, param_str_d;
+--:-:-:-:1      MOV str_h, param_str_h;
+--:-:-:-:1      MOV str_w, param_str_w;
+// qs = q - pad_w
+// pr = p - pad_h
+// mt = m - pad_d
+--:-:-:-:1      IADD qs, q, -param_pad_w;
+--:-:-:-:1      IADD pr, p, -param_pad_h;
+--:-:-:-:1      IADD mt, m, -param_pad_d;
+</SCHEDULE_BLOCK>
+
+LUT_LOOP:
+
+<SCHEDULE_BLOCK>
+// warp synchronous loop while warp_count < RST
+--:-:-:-:1      ISETP.LT.AND P0, PT, warp_count, param_RST, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, rst, param_RST, PT;
+--:-:-:-:1      IADD warp_count, warp_count, 32;
+// t =  rst / RS
+// rs = rst % RS
+--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
+--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
+--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
+--:-:-:-:1      IADD  rs, -rs, rst;
+// r = rs / S
+// s = rs % S
+--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
+--:-:-:-:1      SHR.U32   r, r, param_shift_S;
+--:-:-:-:1      XMAD   s, r, param_S, RZ;
+--:-:-:-:1      IADD   s, -s, rs;
+// x = qs + (s * dil_w)
+// y = pr + (r * dil_h)
+// z = mt + (t * dil_d)
+--:-:-:-:1      XMAD x, s, param_dil_w, qs;
+--:-:-:-:1      XMAD y, r, param_dil_h, pr;
+--:-:-:-:1      XMAD z, t, param_dil_d, mt;
+--:-:-:-:1      ISETP.GE.AND  P4, PT, x, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P5, PT, y, RZ, PT;
+--:-:-:-:1      ISETP.GE.AND  P6, PT, z, RZ, P6;
+// x_prime = x / str_w
+// x       = x % str_w
+--:-:-:-:1      XMAD    x_prime, x, param_magic_str_w, RZ;
+--:-:-:-:1      SHR.U32 x_prime, x_prime, param_shift_str_w;
+--:-:-:-:1      VMAD.U16.U16 x, -x_prime, str_w, x;
+// y_prime = y / str_h
+// y       = y % str_h
+--:-:-:-:1      XMAD    y_prime, y, param_magic_str_h, RZ;
+--:-:-:-:1      SHR.U32 y_prime, y_prime, param_shift_str_h;
+--:-:-:-:1      VMAD.U16.U16 y, -y_prime, str_h, y;
+// z_prime = z / str_d
+// z       = z % str_d
+--:-:-:-:1      XMAD    z_prime, z, param_magic_str_d, RZ;
+--:-:-:-:1      SHR.U32 z_prime, z_prime, param_shift_str_d;
+--:-:-:-:1      VMAD.U16.U16 z, -z_prime, str_d, z;
+
+--:-:-:-:1      ISETP.EQ.AND  P4, PT, x, RZ, P4;
+--:-:-:-:1      ISETP.EQ.AND  P5, PT, y, RZ, P5;
+--:-:-:-:1      ISETP.EQ.AND  P6, PT, z, RZ, P6;
+--:-:-:-:1      ISETP.LT.AND  P4, PT, x_prime, param_W, P4;
+--:-:-:-:1      ISETP.LT.AND  P5, PT, y_prime, param_H, P5;
+--:-:-:-:1      ISETP.LT.AND  P6, PT, z_prime, param_D, P6;
+--:-:-:-:1      PSETP.AND.AND P1, PT, P4, P5, P6;
+
+// sliceI = z_prime*HWN + y_prime*WN + x_prime*N
+01:-:-:-:1      XMAD      sliceI, x_prime, param_N,   RZ;
+--:-:-:-:1      XMAD.LO2C sliceI, y_prime, param_WN,  sliceI;
+--:-:-:-:1      XMAD.LO2C sliceI, z_prime, param_HWN, sliceI;
+// sliceF = rst_prime * K
+01:-:-:-:1      XMAD sliceF, rst, param_K, RZ;
+EOF
+    }
+
+    # Dispatches on $prop: 'f' yields the forward-prop LUT text, any other
+    # value yields the backward-prop LUT text.
+    sub load_lut
+    {
+        if ($prop eq 'f')
+        {
+            return fprop_lut();
+        }
+        return bprop_lut();
+    }
+
+    # Returns the text emitted ahead of the main loop body: a BAR.SYNC, the
+    # write-buffer swap, the first four LDS.U.128 loads for j0, and the
+    # predicated (P1 = posCRST >= 0) recomputation of channel, lutOffset and
+    # the trackF/trackI addresses for the next fetch; posCRST is stepped by -8.
+    #
+    # The main heredoc is a sprintf template with positional arguments:
+    #   %1 = $shareI, %2 = $shareF  (left-justified to width 3 via %N$-3s)
+    #   %3 = $stepI,  %4 = $stepF
+    #   %5 = $addr_shift            (shift operand of the LEA pairs)
+    #   %6 = $swap                  (the buffer-swap snippet chosen below)
+    sub loop_setup
+    {
+        my $swap;
+        # Equal share sizes: one XOR on writeS flips between the two buffers.
+        if ($shareI == $shareF)
+        {
+            $swap = <<'EOF';
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<szShareF + szShareI>;
+EOF
+        }
+        # Otherwise advance both write pointers by swapBuf and negate swapBuf
+        # so the next call moves them back.
+        else
+        {
+            $swap = <<'EOF';
+--:-:-:-:1      IADD writeIs, writeIs, swapBuf;
+--:-:-:-:1      IADD writeFs, writeFs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ,     -swapBuf;
+EOF
+        }
+        return sprintf <<'EOF', $shareI, $shareF, $stepI, $stepF, $addr_shift, $swap;
+
+--:-:-:-:0      ISETP.GE.AND P1, PT, posCRST, RZ, PT;
+--:-:2:-:1      I2F.F32.S32 posCRSTf, posCRST;
+
+01:-:-:-:5      BAR.SYNC 0;
+%6$s
+
+--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*%1$-3s + 00>];
+--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*%2$-3s + 00>];
+--:-:-:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*%1$-3s + %3$s>];
+--:-:1:-:2      LDS.U.128 j0Fy4, [readFs + 4x<0*%2$-3s + %4$s>];
+
+<SCHEDULE_BLOCK>
+// channel = posCRST / lutSize
+02:-:-:-:1  @P1 FMUL channel, posCRSTf, lutSizeRcp;
+--:-:-:-:1  @P1 FFMA channel, channel, 5.9604644775390625e-08, channel;
+--:-:2:-:1  @P1 F2I.S32.F32.TRUNC channel, channel;
+// lutOffset = (posCRST % lutSize) * 8
+02:-:-:-:1  @P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;
+--:-:-:-:1  @P1 SHL lutOffset, lutOffset, 3;
+// offsetIC = channel * DHWN
+// offsetFC = channel * K
+--:-:-:-:1  @P1 XMAD.LO2C offsetIc, channel, param_DHWN, RZ;
+--:-:-:-:1  @P1 XMAD      offsetFc, channel, param_KRST, RZ;
+
+--:-:-:-:1      IADD posCRST, posCRST, -8;
+--:-:2:-:2  @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];
+</SCHEDULE_BLOCK>
+
+// trackI = offsetIN + offsetIC + sliceI + param_I
+// trackF = offsetFK + offsetFC + sliceF + param_F
+02:-:-:-:1  @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;
+--:-:-:-:5  @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;
+--:-:-:-:6  @P1 LEA      trackF0.CC, offsetF, param_F[0],     %5$s;
+--:-:-:-:1  @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, %5$s;
+--:-:-:-:6  @P1 LEA      trackI0.CC, offsetI, param_I[0],     %5$s;
+--:-:-:-:0  @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, %5$s;
+EOF
+    }
+
+    # Returns the unrolled main loop text: 8 steps (j), each made of 64 FFMA
+    # lines that accumulate into cx<x>y<y>. After any FFMA, the text stored in
+    # %insert under "j<j>c<c>" is appended. This sub itself fills slots c0,
+    # c2, c4 and c6 of every step with the LDS.U.128 loads for the next step.
+    sub main_loop
+    {
+        our %insert;
+
+        # Visiting order over the 8x8 accumulators: column bases 0,2,4,6; for
+        # each, row bases 0,1,4,5 (direction flipped after every column base);
+        # around each (column, row) base a fixed four-offset swirl.
+        my @order;
+        my @rows = (0, 1, 4, 5);
+        for my $col (0, 2, 4, 6)
+        {
+            for my $row (@rows)
+            {
+                for my $step ([0,2], [1,2], [1,0], [0,0])
+                {
+                    push @order, [$col + $step->[0], $row + $step->[1]];
+                }
+            }
+            @rows = reverse @rows;
+        }
+
+        my @lines;
+        for my $j (0 .. 7)
+        {
+            my $bank  = $j & 1;                     # j-register set the FFMAs of this step read
+            my $other = $bank ? 0 : 1;              # j-register set the inserted loads fill
+            my $line  = ($j + 1) % 8;               # shared line index for those loads
+            my $guard = $j == 7 ? '@P0' : '   ';    # only the last step's loads are predicated
+
+            $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*%-3s + 00>];\n", $guard, $other, $line, $shareI;
+            $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*%-3s + 00>];\n", $guard, $other, $line, $shareF;
+            $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*%-3s + %s>];\n", $guard, $other, $line, $shareI, $stepI;
+            $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*%-3s + %s>];\n", $guard, $other, $line, $shareF, $stepF;
+
+            for my $c (0 .. $#order)
+            {
+                my ($x, $y) = @{ $order[$c] };
+                my $extra   = $insert{"j${j}c$c"} || '';
+
+                # Stall count drops to 0 when one of these opcodes follows the FFMA.
+                my $stall = 1;
+                $stall = 0 if $extra =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/;
+
+                # Yield once per step, at the midpoint, unless the stall was cleared.
+                my $yield = ($c == 32 && $stall) ? 'Y' : '-';
+
+                # The first FFMA of each step waits on barrier 1.
+                my $wait = $c ? '--' : '01';
+
+                push @lines, sprintf("%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s",
+                    "$wait:-:-:$yield:$stall",  $x,$y,  $bank,$x,  $bank,$y,  $x,$y,  $extra);
+            }
+        }
+        return join '', @lines;
+    }
+
+
+    # Returns the text that prepares registers and predicates for output().
+    #
+    # Arguments:
+    #   $tidOX      - mask ANDed with the tidOX register to set P5
+    #   $warp_shift - shift operand of the ISCADD that folds idx_N into bsum_offset
+    #   $bsum_shift - right-shift applied to tidOX to start bsum_offset
+    # All three are used only when $bsum is set. Inside the qq{} text a bare
+    # "tidOX" is the register name while "$tidOX" is the interpolated argument.
+    #
+    # Always emitted: out_offset = m*PQN + p*QN + q*N + n, alpha, one (1.0),
+    # P2 = (param_flags == 0), and P0/P1 = n < param_N gated by P2. Without
+    # $half, n is advanced by $stepI before P1 is evaluated; with $half, P1
+    # repeats P0's test on the same n.
+    sub output_setup
+    {
+        my ($tidOX, $warp_shift, $bsum_shift) = @_;
+        my $out;
+
+        $out .= qq{
+02:-:-:-:1      SHR.U32   bsum_offset, tidOX, $bsum_shift;
+04:-:-:-:1      ISCADD    bsum_offset, idx_N, bsum_offset,   $warp_shift;
+01:-:-:-:1      XMAD      bsum_offset, idx_Q, param_gridN,   bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum_offset, idx_P, param_gridQN,  bsum_offset;
+--:-:-:-:1      XMAD.LO2C bsum_offset, idx_M, param_gridPQN, bsum_offset;
+
+--:-:-:-:1      LOP.AND.Z P5, RZ, tidOX, $tidOX;
+        } if $bsum;
+
+        $out .= qq{
+// out_offset = m*PQN + p*QN + q*N + n
+01:-:-:-:1      XMAD      out_offset, q, param_N,    n;
+--:-:-:-:1      XMAD.LO2C out_offset, p, param_QN,   out_offset;
+--:-:-:-:1      XMAD.LO2C out_offset, m, param_PQN,  out_offset;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV32I one, 1.0;
+
+--:-:-:-:1      ISETP.EQ.AND P2, PT, RZ, param_flags, PT; // no output
+--:-:-:-:1      ISETP.LT.AND P0, PT, n, param_N, P2;
+        };
+
+        # q{} (no interpolation) for the half path, qq{} for the path that needs $stepI.
+        $out .=  $half ? q{
+--:-:-:-:1      ISETP.LT.AND P1, PT, n, param_N, P2;
+        } : qq{
+--:-:-:-:1      IADD n, n, $stepI;
+--:-:-:-:1      ISETP.LT.AND P1, PT, n, param_N, P2;
+        };
+        return $out;
+    }
+
+    # Returns the epilogue text: eight "scale by alpha, CAL STORE_O" passes (one
+    # per accumulator row y), an EXIT, and the STORE_O subroutine itself.
+    #
+    # STORE_O is assembled from optional stages selected by the kernel options:
+    #   $beta/$brelu/$bprelu - load existing values from param_X into b0..b7
+    #   $bias                - load one value from param_Sum and add it
+    #   $relu / $prelu       - activation on out0..out7
+    #   $half                - F16<->F32 conversions and a packed 128-bit store
+    #   $bsum                - reduce out0..out7 and store to param_Sum
+    # Takes no arguments; reads $remapF, $stepF, $addr_shift, $stepI and the
+    # option flags from the enclosing scope. Text inside q{} is not
+    # interpolated by Perl, so any "$name" there is emitted verbatim.
+    sub output
+    {
+        my $out = q{
+--:-:-:-:5      BAR.SYNC 0;
+        };
+
+        # One pass per row y: cs0..cs7 = cx0..cx7 of that row times alpha.
+        foreach my $y (0..7)
+        {
+            # k steps by 1 between rows, except on entering row 4 without
+            # $remapF, where it jumps by $stepF-3.
+            my $incK  = $y == 4 && !$remapF ? $stepF-3 : 1;
+            my $stepK = $y ? "\n--:-:-:-:1      IADD k, k, $incK;" : "";
+
+            $out .= qq{$stepK
+--:-:-:-:1      FMUL cs0, cx0y$y, alpha;
+--:-:-:-:1      FMUL cs1, cx1y$y, alpha;
+--:-:-:-:1      FMUL cs2, cx2y$y, alpha;
+--:-:-:-:1      FMUL cs3, cx3y$y, alpha;
+--:-:-:-:1      FMUL cs4, cx4y$y, alpha;
+--:-:-:-:1      FMUL cs5, cx5y$y, alpha;
+--:-:-:-:1      FMUL cs6, cx6y$y, alpha;
+--:-:-:-:0      FMUL cs7, cx7y$y, alpha;
+--:-:-:-:5      CAL STORE_O;
+            };
+        }
+
+        # STORE_O entry: offset from k, param_MPQN and out_offset;
+        # P2 = k < K && P0, P3 = k < K && P1.
+        $out .= q{
+
+--:-:-:-:5      EXIT;
+
+STORE_O:
+
+<SCHEDULE_BLOCK>
+30:-:-:-:1      XMAD offset, k, param_MPQN, out_offset;
+--:-:-:-:1      XMAD.PSL offset, k, param_MPQN.H1, offset;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n < N
+--:-:-:-:1      ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n < N
+        };
+
+        # Load the existing values at param_X + offset into b0.. (one 128-bit
+        # load for half, two for the other path).
+        if ($beta || $brelu || $bprelu)
+        {
+            $out .= qq{
+--:-:-:-:1      LEA      Out0.CC, offset, param_X[0],     $addr_shift;
+--:-:-:-:1      LEA.HI.X Out1,    offset, param_X[1], RZ, $addr_shift;
+            };
+            $out .= $half ? q{
+--:-:5:-:2  @P2 LDG.E.128 b0, [Out];
+            } : q{
+--:-:5:-:1  @P2 LDG.E.128 b0, [Out + 4x<00>];
+--:-:6:-:1  @P3 LDG.E.128 b4, [Out + 4x<$stepI>];
+            };
+        }
+
+        # Bias: load one value from param_Sum indexed by k into b0 (zero when !P2).
+        $out .= q{
+--:-:-:-:1      LEA      Sum0.CC, k, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    k, param_Sum[1], RZ, 2;
+
+--:-:6:-:1  @P2 LDG.E.CI b0, [Sum];
+--:-:-:-:1 @!P2 MOV b0, RZ;
+        } if $bias;
+
+        # Round-trip cs0..cs7 through shared memory (writeCs -> readCs) into out0..out7.
+        $out .= q{
+<ORDERED>
+--:-:-:-:1      STS.128 [writeCs + 4x<00>], cs0;
+--:-:-:-:1      STS.128 [writeCs + 4x<$remapI ? 4 : $stepI>], cs4;
+--:-:1:-:1  @P2 LDS.U.128 out0, [readCs + 4x<00>];
+--:-:2:-:1  @P3 LDS.U.128 out4, [readCs + 4x<$half ? 4 : $stepI>];
+</ORDERED>
+</SCHEDULE_BLOCK>
+<SCHEDULE_BLOCK>
+        };
+
+        # Add the single bias value b0 to all eight outputs.
+        $out .= q{
+21:-:-:-:1      FADD out0, out0, b0;
+--:-:-:-:1      FADD out1, out1, b0;
+--:-:-:-:1      FADD out2, out2, b0;
+--:-:-:-:1      FADD out3, out3, b0;
+02:-:-:-:1      FADD out4, out4, b0;
+--:-:-:-:1      FADD out5, out5, b0;
+--:-:-:-:1      FADD out6, out6, b0;
+--:-:-:-:1      FADD out7, out7, b0;
+        } if $bias;
+
+        # relu: out = max(out, 0).
+        $out .= q{
+01:-:-:-:1      FMNMX out0, out0, RZ, !PT;
+--:-:-:-:1      FMNMX out1, out1, RZ, !PT;
+--:-:-:-:1      FMNMX out2, out2, RZ, !PT;
+--:-:-:-:1      FMNMX out3, out3, RZ, !PT;
+02:-:-:-:1      FMNMX out4, out4, RZ, !PT;
+--:-:-:-:1      FMNMX out5, out5, RZ, !PT;
+--:-:-:-:1      FMNMX out6, out6, RZ, !PT;
+--:-:-:-:1      FMNMX out7, out7, RZ, !PT;
+        } if $relu;
+
+        # prelu: out = max(out, 0) + param_beta * min(out, 0).
+        $out .= q{
+// maximum(x, 0) + slope * minimum(0, x)
+01:-:-:-:1      FMNMX b0, out0, RZ, !PT;
+--:-:-:-:1      FMNMX b1, out1, RZ, !PT;
+--:-:-:-:1      FMNMX b2, out2, RZ, !PT;
+--:-:-:-:1      FMNMX b3, out3, RZ, !PT;
+02:-:-:-:1      FMNMX b4, out4, RZ, !PT;
+--:-:-:-:1      FMNMX b5, out5, RZ, !PT;
+--:-:-:-:1      FMNMX b6, out6, RZ, !PT;
+--:-:-:-:1      FMNMX b7, out7, RZ, !PT;
+
+--:-:-:-:1      FMNMX x0, out0, RZ, PT;
+--:-:-:-:1      FMNMX x1, out1, RZ, PT;
+--:-:-:-:1      FMNMX x2, out2, RZ, PT;
+--:-:-:-:1      FMNMX x3, out3, RZ, PT;
+--:-:-:-:1      FMNMX x4, out4, RZ, PT;
+--:-:-:-:1      FMNMX x5, out5, RZ, PT;
+--:-:-:-:1      FMNMX x6, out6, RZ, PT;
+--:-:-:-:1      FMNMX x7, out7, RZ, PT;
+
+--:-:-:-:1      FFMA out0, x0, param_beta, b0;
+--:-:-:-:1      FFMA out1, x1, param_beta, b1;
+--:-:-:-:1      FFMA out2, x2, param_beta, b2;
+--:-:-:-:1      FFMA out3, x3, param_beta, b3;
+--:-:-:-:1      FFMA out4, x4, param_beta, b4;
+--:-:-:-:1      FFMA out5, x5, param_beta, b5;
+--:-:-:-:1      FFMA out6, x6, param_beta, b6;
+--:-:-:-:1      FFMA out7, x7, param_beta, b7;
+        } if $prelu;
+
+        $out .= q{
+</SCHEDULE_BLOCK>
+        };
+
+        # Half path: widen the eight packed F16 values loaded into b0..b3 to
+        # F32 in b0..b7. Emitted from high to low so sources are read before
+        # they are overwritten.
+        $out .= q{
+13:-:-:-:1  @P2 F2F.F32.F16 b7, b3.H1;
+--:-:-:-:1  @P2 F2F.F32.F16 b6, b3.H0;
+--:-:-:-:1  @P2 F2F.F32.F16 b5, b2.H1;
+--:-:-:-:1  @P2 F2F.F32.F16 b4, b2.H0;
+--:-:-:-:1  @P2 F2F.F32.F16 b3, b1.H1;
+--:-:-:-:1  @P2 F2F.F32.F16 b2, b1.H0;
+--:-:-:-:1  @P2 F2F.F32.F16 b1, b0.H1;
+--:-:5:-:2  @P2 F2F.F32.F16 b0, b0.H0;
+        } if $half && ($beta || $brelu || $bprelu);
+
+        $out .= q{
+<SCHEDULE_BLOCK>
+        };
+
+        # beta: out += b * param_beta.
+        $out .= q{
+11:-:-:-:1  @P2 FFMA out0, b0, param_beta, out0;
+--:-:-:-:1  @P2 FFMA out1, b1, param_beta, out1;
+--:-:-:-:1  @P2 FFMA out2, b2, param_beta, out2;
+--:-:-:-:1  @P2 FFMA out3, b3, param_beta, out3;
+22:-:-:-:1  @P3 FFMA out4, b4, param_beta, out4;
+--:-:-:-:1  @P3 FFMA out5, b5, param_beta, out5;
+--:-:-:-:1  @P3 FFMA out6, b6, param_beta, out6;
+--:-:-:-:1  @P3 FFMA out7, b7, param_beta, out7;
+        } if $beta;
+
+        # brelu: zero each out where the matching b is not > 0. P0..P3 are
+        # saved to preds first and restored afterwards because they are
+        # reused as scratch predicates.
+        $out .= q{
+//delta *= (x > 0)
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+11:-:-:-:1      FSETP.GT.AND P0, PT, b0, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b1, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b2, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b3, RZ, PT;
+--:-:-:-:1 @!P0 MOV out0, RZ;
+--:-:-:-:1 @!P1 MOV out1, RZ;
+--:-:-:-:1 @!P2 MOV out2, RZ;
+--:-:-:-:1 @!P3 MOV out3, RZ;
+22:-:-:-:1      FSETP.GT.AND P0, PT, b4, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b5, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b6, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b7, RZ, PT;
+--:-:-:-:1 @!P0 MOV out4, RZ;
+--:-:-:-:1 @!P1 MOV out5, RZ;
+--:-:-:-:1 @!P2 MOV out6, RZ;
+--:-:-:-:1 @!P3 MOV out7, RZ;
+--:-:-:-:5      R2P PR, preds, 0x0f;
+        } if $brelu;
+
+        # bprelu: scale each out by (b > 0) + param_beta * (b < 0), with the
+        # same save/restore of P0..P3 through preds.
+        $out .= q{
+//delta *= ((x > 0) + slope * (x < 0))
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+11:-:-:-:1      FSETP.GT.AND P0, PT, b0, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b1, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b2, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b3, RZ, PT;
+--:-:-:-:1      SEL x0, one, RZ, P0;
+--:-:-:-:1      SEL x1, one, RZ, P1;
+--:-:-:-:1      SEL x2, one, RZ, P2;
+--:-:-:-:1      SEL x3, one, RZ, P3;
+--:-:-:-:1      FSETP.LT.AND P0, PT, b0, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P1, PT, b1, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, b2, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P3, PT, b3, RZ, PT;
+--:-:-:-:1      SEL b0, one, RZ, P0;
+--:-:-:-:1      SEL b1, one, RZ, P1;
+--:-:-:-:1      SEL b2, one, RZ, P2;
+--:-:-:-:1      SEL b3, one, RZ, P3;
+--:-:-:-:1      FFMA b0, b0, param_beta, x0;
+--:-:-:-:1      FFMA b1, b1, param_beta, x1;
+--:-:-:-:1      FFMA b2, b2, param_beta, x2;
+--:-:-:-:1      FFMA b3, b3, param_beta, x3;
+--:-:-:-:1      FMUL out0, out0, b0;
+--:-:-:-:1      FMUL out1, out1, b1;
+--:-:-:-:1      FMUL out2, out2, b2;
+--:-:-:-:1      FMUL out3, out3, b3;
+22:-:-:-:1      FSETP.GT.AND P0, PT, b4, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P1, PT, b5, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P2, PT, b6, RZ, PT;
+--:-:-:-:1      FSETP.GT.AND P3, PT, b7, RZ, PT;
+--:-:-:-:1      SEL x4, one, RZ, P0;
+--:-:-:-:1      SEL x5, one, RZ, P1;
+--:-:-:-:1      SEL x6, one, RZ, P2;
+--:-:-:-:1      SEL x7, one, RZ, P3;
+--:-:-:-:1      FSETP.LT.AND P0, PT, b4, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P1, PT, b5, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P2, PT, b6, RZ, PT;
+--:-:-:-:1      FSETP.LT.AND P3, PT, b7, RZ, PT;
+--:-:-:-:1      SEL b4, one, RZ, P0;
+--:-:-:-:1      SEL b5, one, RZ, P1;
+--:-:-:-:1      SEL b6, one, RZ, P2;
+--:-:-:-:1      SEL b7, one, RZ, P3;
+--:-:-:-:1      R2P PR, preds, 0x0f;
+--:-:-:-:1      FFMA b4, b4, param_beta, x4;
+--:-:-:-:1      FFMA b5, b5, param_beta, x5;
+--:-:-:-:1      FFMA b6, b6, param_beta, x6;
+--:-:-:-:1      FFMA b7, b7, param_beta, x7;
+--:-:-:-:1      FMUL out4, out4, b4;
+--:-:-:-:1      FMUL out5, out5, b5;
+--:-:-:-:1      FMUL out6, out6, b6;
+--:-:-:-:1      FMUL out7, out7, b7;
+        } if $bprelu;
+
+        # bsum: per-thread sum of out0..out7 into sum0 (each half is zeroed
+        # when its predicate is off).
+        $out .= q{
+--:-:-:-:1 @!P2 MOV  sum0, RZ;
+--:-:-:-:1 @!P3 MOV  sum2, RZ;
+01:-:-:-:1  @P2 FADD sum0, out0, out1;
+--:-:-:-:1  @P2 FADD sum1, out2, out3;
+02:-:-:-:1  @P3 FADD sum2, out4, out5;
+--:-:-:-:1  @P3 FADD sum3, out6, out7;
+--:-:-:-:1  @P2 FADD sum0, sum0, sum1;
+--:-:-:-:1  @P3 FADD sum2, sum2, sum3;
+--:-:-:-:1      FADD sum0, sum0, sum2;
+        } if $bsum;
+
+        # Half path: narrow out0..out7 back to F16 before packing.
+        $out .= q{
+<ORDERED>
+01:-:-:-:1  @P2 F2F.F16.F32 out0, out0;
+--:-:-:-:1  @P2 F2F.F16.F32 out1, out1;
+--:-:-:-:1  @P2 F2F.F16.F32 out2, out2;
+--:-:1:-:1  @P2 F2F.F16.F32 out3, out3;
+02:-:-:-:1  @P2 F2F.F16.F32 out4, out4;
+--:-:-:-:1  @P2 F2F.F16.F32 out5, out5;
+--:-:-:-:1  @P2 F2F.F16.F32 out6, out6;
+--:-:2:-:1  @P2 F2F.F16.F32 out7, out7;
+</ORDERED>
+        } if $half;
+
+        $out .= q{
+</SCHEDULE_BLOCK>
+        };
+
+        # Store to param_O + offset: half packs pairs with BFI into c0..c3 and
+        # issues one 128-bit store; otherwise two 128-bit stores under P2/P3.
+        $out .= $half ? qq{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LEA      Out0.CC, offset, param_O[0],     $addr_shift;
+--:-:-:-:1      LEA.HI.X Out1,    offset, param_O[1], RZ, $addr_shift;
+
+01:-:-:-:1  \@P2 BFI c0, out1, 0x1010, out0;
+--:-:-:-:1  \@P2 BFI c1, out3, 0x1010, out2;
+02:-:-:-:1  \@P2 BFI c2, out5, 0x1010, out4;
+--:-:-:-:1  \@P2 BFI c3, out7, 0x1010, out6;
+
+--:5:-:-:1  \@P2 STG.E.CG.128 [Out], c0;
+</SCHEDULE_BLOCK>
+        } : qq{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LEA      Out0.CC, offset, param_O[0],     $addr_shift;
+--:-:-:-:1      LEA.HI.X Out1,    offset, param_O[1], RZ, $addr_shift;
+
+01:-:-:-:1  \@P2 STG.E.CG.128 [Out + 4x<00>], out0;
+02:5:-:-:1  \@P3 STG.E.CG.128 [Out + 4x<$stepI>], out4;
+</SCHEDULE_BLOCK>
+        };
+
+        # bsum: butterfly-shuffle sum0 with offsets 1, 2, 4, then store it to
+        # param_Sum under P6 (k < K && P5).
+        $out .= q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      XMAD.LO2C offset, k, param_gridMPQN, bsum_offset;
+--:-:-:-:1      LEA      Sum0.CC, offset, param_Sum[0],     2;
+--:-:-:-:1      LEA.HI.X Sum1,    offset, param_Sum[1], RZ, 2;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, k, param_K, P5; // k < K && tid31 == 0
+
+--:-:2:-:2      SHFL.BFLY PT, sum1, sum0, 1,  0x1f;
+02:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:2:-:2      SHFL.BFLY PT, sum1, sum0, 2,  0x1f;
+02:-:-:-:4      FADD sum0, sum1, sum0;
+--:-:2:-:2      SHFL.BFLY PT, sum1, sum0, 4,  0x1f;
+02:-:-:-:2      FADD sum0, sum1, sum0;
+
+--:6:-:-:1  @P6 STG.E.CG [Sum], sum0;
+</SCHEDULE_BLOCK>
+        } if $bsum;
+
+        $out .= q{
+--:-:-:-:5      RET;
+        };
+
+        # Explicit return: previously the result was only returned because the
+        # append above happened to be the last statement in the sub.
+        return $out;
+    }
+
+-]
diff --git a/Kernel/SGEMM/Kepler/Makefile b/Kernel/SGEMM/Kepler/Makefile
new file mode 100644
index 0000000..9df39ec
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/Makefile
@@ -0,0 +1,19 @@
+# Kernels to build; the recipe below reads <name>.cu and <name>.sass for each.
+BINS := sgemm_nn_128x128 sgemm_nt_128x128 sgemm_tn_128x128 \
+  sgemm_nn_128x128_vec sgemm_tn_128x128_vec sgemm_nt_128x128_vec
+TARGETS := $(addsuffix .cubin, $(BINS))
+TEMPLATES := $(addsuffix _template.cubin, $(BINS))
+
+all: $(BINS)
+
+# Compile <name>.cu to <name>_template.cubin, then run KeplerAs.pl on the
+# .sass source and that template with <name>.cubin as the last argument.
+$(BINS):
+	nvcc -arch sm_35 -m 64 $@.cu -cubin -O3 -o $@_template.cubin
+	KeplerAs.pl -i $@.sass $@_template.cubin $@.cubin
+
+# -f so that cleaning a tree with nothing built is not an error.
+clean:
+	rm -f $(TARGETS) $(TEMPLATES)
+
+# Phony targets must be listed as prerequisites on the .PHONY line itself;
+# a tab-indented line underneath is parsed as a recipe and marks nothing phony.
+.PHONY: all clean
+
+#utils
+# "make print-VAR" reports the flavor and value of VAR.
+print-% : ; $(info $* is $(flavor $*) variable set to [$($*)]) @true
diff --git a/Kernel/SGEMM/Kepler/README.md b/Kernel/SGEMM/Kepler/README.md
new file mode 100644
index 0000000..82a5a4f
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/README.md
@@ -0,0 +1,3 @@
+# KeplerGEMM
+
+Faster GEMM
diff --git a/Kernel/SGEMM/Kepler/sgemm_common_128x128.sass b/Kernel/SGEMM/Kepler/sgemm_common_128x128.sass
new file mode 100644
index 0000000..a334224
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_common_128x128.sass
@@ -0,0 +1,378 @@
+# sgemm_common_128x128
+
+////////////////////////////////////////////
+// debug
+//-:-:-:-:00 MOV tmp_param0, param_C[0];
+//-:-:-:-:00 MOV tmp_param1, param_C[1];
+//
+//-:-:-:-:00 MOV32I k, 0x3f8ccccd;
+//-:-:-:-:00 ST.E [tmp_param0], k;
+//-:-:-:-:00 EXIT;
+/////////////////////////////////////////
+
+-:-:-:-:00 LDS.128 j0Ay0, [readAs + 4x<0*128 + 00>];
+-:-:-:-:00 LDS.128 j0Bx0, [readBs + 4x<0*128 + 00>];
+-:-:-:-:00 LDS.128 j0Ay4, [readAs + 4x<0*128 + 64>];
+-:-:-:-:00 LDS.128 j0Bx4, [readBs + 4x<0*128 + 64>];
+
+LOOP:
+
+<CODE>
+
+    our @top;
+    our %insert;
+
+    my @cOrder;
+    # Each entry is an [x, y] pair naming the accumulator cx<x>y<y> written
+    # by the FFMA issued in that slot; every $j iteration walks all 64 slots.
+
+    # cOrder
+    # register reuse
+
+    push  @cOrder, [0,0];
+    push  @cOrder, [0,1];
+    push  @cOrder, [1,1];
+    push  @cOrder, [2,0];
+    push  @cOrder, [1,0];
+    push  @cOrder, [2,1];
+    push  @cOrder, [2,3];
+    push  @cOrder, [2,2];
+    push  @cOrder, [1,2];
+    push  @cOrder, [0,3];
+    push  @cOrder, [1,3];
+    push  @cOrder, [0,2];
+    push  @cOrder, [0,4];
+    push  @cOrder, [0,5];
+    push  @cOrder, [1,5];
+    push  @cOrder, [2,4];
+    push  @cOrder, [1,4];
+    push  @cOrder, [2,5];
+    push  @cOrder, [2,7];
+    push  @cOrder, [2,6];
+    push  @cOrder, [1,6];
+    push  @cOrder, [0,7];
+    push  @cOrder, [1,7];
+    push  @cOrder, [0,6];
+    push  @cOrder, [3,6];
+    push  @cOrder, [3,7];
+    push  @cOrder, [4,7];
+    push  @cOrder, [5,6];
+    push  @cOrder, [4,6];
+    push  @cOrder, [5,7];
+    push  @cOrder, [5,5];
+    push  @cOrder, [5,4];
+    push  @cOrder, [4,4];
+    push  @cOrder, [3,5];
+    push  @cOrder, [4,5];
+    push  @cOrder, [3,4];
+    push  @cOrder, [3,2];
+    push  @cOrder, [3,3];
+    push  @cOrder, [4,3];
+    push  @cOrder, [5,2];
+    push  @cOrder, [4,2];
+    push  @cOrder, [5,3];
+    push  @cOrder, [5,1];
+    push  @cOrder, [5,0];
+    push  @cOrder, [4,0];
+    push  @cOrder, [3,1];
+    push  @cOrder, [4,1];
+    push  @cOrder, [3,0];
+    push  @cOrder, [6,0];
+    push  @cOrder, [7,0];
+    push  @cOrder, [7,1];
+    push  @cOrder, [6,2];
+    push  @cOrder, [6,1];
+    push  @cOrder, [7,2];
+    push  @cOrder, [7,5];
+    push  @cOrder, [6,5];
+    push  @cOrder, [6,4];
+    push  @cOrder, [7,3];
+    push  @cOrder, [7,4];
+    push  @cOrder, [6,3];
+    push  @cOrder, [6,6];
+    push  @cOrder, [6,7];
+    push  @cOrder, [7,7];
+    push  @cOrder, [7,6];
+
+    # Start from the contents of the package array @top (empty if nothing set it).
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 7)
+    {
+        # $odd = 0, 1, 0, 1, 0, 1, 0, 1
+        # $nOdd = 1, 0, 1, 0, 1, 0, 1, 0
+        # $rsOffset = 1, 2, 3, 4, 5, 6, 7, 0
+        # $rsPred = ' ', ' ', ' ', ' ', ' ', ' ', ' ', @P0
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        #$insert{"j${j}c5"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        #$insert{"j${j}c11"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy2, [readAs + 4x<%d*128 + 2>];\n", $rsPred, $nOdd, $rsOffset;
+        #$insert{"j${j}c17"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+        #$insert{"j${j}c23"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy6, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
+        #$insert{"j${j}c35"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx0, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        #$insert{"j${j}c41"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx2, [readBs + 4x<%d*128 + 2>];\n", $rsPred, $nOdd, $rsOffset;
+        #$insert{"j${j}c47"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx4, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+        #$insert{"j${j}c59"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx6, [readBs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
+
+        # to avoid conflict with the second FFMA
+        # 5 11 17 59 are bank 2 friendly, two empty and two reuse
+        # 23 29 35 41 are bank 3 friendly, two empty and two reuse
+        # LDS.64 throughput is higher
+        $insert{"j${j}c5"}  = sprintf "-:G:D:-:01 %s LDS.64 j%dBx0, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c11"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx2, [readBs + 4x<%d*128 + 2>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c17"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx4, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c59"} = sprintf "-:G:D:-:01 %s LDS.64 j%dBx6, [readBs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c23"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c29"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy2, [readAs + 4x<%d*128 + 2>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c35"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c41"} = sprintf "-:G:D:-:01 %s LDS.64 j%dAy6, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            # Accumulator coordinates for this slot.
+            my ($x,$y) = @{$cOrder[$c]};
+
+            # Instruction text to emit right after this FFMA, if any.
+            my $ins = $insert{"j${j}c$c"} || '';
+
+            # Slots c5, c11, ..., c59 and c63 always carry a companion
+            # instruction; fall back to a NOP when none was registered.
+            if ((($c - 5) % 6 == 0 || $c == 63) && !$ins) {
+              $ins = "-:G:D:-:00 NOP;\n";
+            }
+
+            # c61 and c62 get a NOP as well when nothing was registered.
+            if ($c > 60 && !$ins){
+              $ins = "-:-:D:-:07 NOP;\n";
+            }
+
+            # 04 and 05 are dual issued
+            # 04 is used when something follows this FFMA and for slots whose
+            # index is 1 or 3 modulo 6; every other FFMA uses 05.
+            my $ctrl = "-:-:D:-:05";
+            if ($ins || ($c - 1) % 6 == 0 || ($c - 3) % 6 == 0) {
+              $ctrl = "-:-:D:-:04";
+            }
+
+            $out .= sprintf
+                "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s",
+                $ctrl, $x, $y, $odd, $x, $odd, $y, $x, $y, $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+
+-:-:-:-:00 S2R blockA, SR_CTAID.Y;
+-:-:-:-:00 S2R blockB, SR_CTAID.Z;
+-:-:-:-:00 S2R blockZ, SR_CTAID.X;
+
+-:-:-:-:00 LOP.AND tid_31,  tid, 31;
+-:-:-:-:00 LOP.AND tid_96,  tid, 96;
+-:-:-:-:00 LOP.AND tid_128, tid, 128;
+
+// writeCs = readAs * 32 + readBs;
+-:-:-:-:00 LOP.AND readAs, readAs, 0xfff;
+-:-:-:-:00 LOP.AND readBs, readBs, 0xfff;
+-:-:-:-:00 ISCADD  writeCs, readAs, readBs, 5;
+
+// cx = tid_31 | (tid_128 >> 2);
+-:-:-:-:00 SHR.U32 cx00, tid_128, 2;
+-:-:-:-:00 LOP.OR  cx00, tid_31, cx00;
+
+// readCs = ((tid_96 << 4) | cx) << 2;
+-:-:-:-:00 SHL    readCs, tid_96,  4;
+-:-:-:-:00 LOP.OR readCs, readCs, cx00;
+-:-:-:-:00 SHL    readCs, readCs, 2;
+
+// cx += blockB*128;
+-:-:-:-:00 ISCADD cx00, blockB, cx00, 7;
+-:-:-:-:00 IADD   cx64, cx00, 64;
+
+// cy = blockA*128 + (tid_96 >> 1)
+-:-:-:-:00 SHR.U32 cy00, tid_96, 1;
+-:-:-:-:00 ISCADD  cy00, blockA, cy00, 7;
+
+// C += (ldcz*blockZ + ldc*cy + cx00) * 4;
+-:-:-:-:00 MOV  ldcz, RZ;
+-:-:-:-:00 MOV  ldc, param_ldc;
+-:-:-:-:00 IMAD ci, ldc, cy00, cx00;
+-:-:-:-:00 IMAD ci, ldcz, blockZ, ci;
+-:-:-:-:00 MOV tmp_param0, param_C[0];
+-:-:-:-:00 MOV tmp_param1, param_C[1];
+-:-:-:-:00 SHL  tmp_shl, ci, 2;
+-:-:-:-:00 IADD C00y0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X C00y1, RZ, tmp_param1;
+
+// ldc1  = ldc * 4 bytes   (one step of cy)
+// ldc4  = ldc * 16 bytes  (four steps of cy)
+// ldc60 = ldc * 240 bytes (sixty steps of cy: (ldc << 8) - ldc4)
+-:-:-:-:00 SHL    ldc1, ldc, 2;
+-:-:-:-:00 SHL    ldc4, ldc, 4;
+-:-:-:-:00 ISCADD ldc60, ldc, -ldc4, 8;
+
+-:-:-:-:00 MOV alpha, param_alpha;
+-:-:-:-:00 MOV beta,  param_beta;
+
+// Apply beta
+-:-:-:-:00 ISETP.NE.AND P6, PT, beta, RZ, PT;
+
+// interleave for high throughput
+-:-:-:-:00 IADD   C04y0.CC, C00y0, ldc4;
+-:-:-:-:00 IADD   cy04, cy00,  4;
+-:-:-:-:00 IADD.X C04y1,    C00y1, RZ;
+-:-:-:-:00 IADD   C08y0.CC, C04y0, ldc4;
+-:-:-:-:00 IADD   cy08, cy00,  8;
+-:-:-:-:00 IADD.X C08y1,    C04y1, RZ;
+-:-:-:-:00 IADD   C12y0.CC, C08y0, ldc4;
+-:-:-:-:00 IADD   cy12, cy00,  12;
+-:-:-:-:00 IADD.X C12y1,    C08y1, RZ;
+
+-:-:-:-:00 BAR.SYNC 0;
+
+<CODE>
+
+    my $out;    # assembly text accumulated over the eight rows and returned below
+    foreach my $y (0..7)    # one STORE_C call per accumulator row y0 .. y7
+    {
+        $out .=    # emitted only for $y == 4: bump all four C row pointers by ldc60 and all four cy counters by 60
+        "-:-:-:-:00 IADD   C00y0.CC, C00y0, ldc60;\n" .
+        "-:-:-:-:00 IADD   cy00,     cy00,  60;\n" .
+        "-:-:-:-:00 IADD.X C00y1,    C00y1, RZ;\n" .
+        "-:-:-:-:00 IADD   C04y0.CC, C04y0, ldc60;\n" .
+        "-:-:-:-:00 IADD   cy04,     cy04,  60;\n" .
+        "-:-:-:-:00 IADD.X C04y1,    C04y1, RZ;\n" .
+        "-:-:-:-:00 IADD   C08y0.CC, C08y0, ldc60;\n" .
+        "-:-:-:-:00 IADD   cy08,     cy08,  60;\n" .
+        "-:-:-:-:00 IADD.X C08y1,    C08y1, RZ;\n" .
+        "-:-:-:-:00 IADD   C12y0.CC, C12y0, ldc60;\n" .
+        "-:-:-:-:00 IADD   cy12,     cy12,  60;\n" .
+        "-:-:-:-:00 IADD.X C12y1,    C12y1, RZ;\n\n" if $y == 4;
+        # Scale the eight accumulators cx0y<y> .. cx7y<y> by alpha into c0 .. c7.
+        $out .= sprintf(
+        "-:-:-:-:00 FMUL c0, cx0y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL c1, cx1y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL c2, cx2y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL c3, cx3y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL c4, cx4y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL c5, cx5y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL c6, cx6y%d, alpha;\n" .
+        "-:-:-:-:00 FMUL c7, cx7y%d, alpha;\n",
+        ($y) x 8);
+        # STORE_C (defined after this block) writes c0 .. c7 out and advances the row pointers by one row.
+        $out .= "-:-:-:-:00 CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+-:-:-:-:00      EXIT;
+
+STORE_C:
+// Writes c0..c7 to C at columns cx00/cx64 of rows cy00, cy04, cy08, cy12, blending with beta*C when P6 (beta != 0) is set.
+-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, P6;
+-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, P6;
+// Load predicates for the cy00 / cy04 half: column and row in range, and P6.
+-:-:-:-:00 ISETP.LT.AND P0, PT, cy00, param_m, P4;
+-:-:-:-:00 ISETP.LT.AND P1, PT, cy00, param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P2, PT, cy04, param_m, P4;
+-:-:-:-:00 ISETP.LT.AND P3, PT, cy04, param_m, P5;
+// NOTE(review): these loads address C00y / C04y while every other access below uses C00y0 / C04y0 -- confirm both spellings name the same register pair.
+-:-:-:-:00 @P0 LD.E d0, [C00y + 4x<00>];
+-:-:-:-:00 @P1 LD.E d1, [C00y + 4x<64>];
+-:-:-:-:00 @P2 LD.E d2, [C04y + 4x<00>];
+-:-:-:-:00 @P3 LD.E d3, [C04y + 4x<64>];
+// Elements that were not loaded get d = 0.
+-:-:-:-:00 @!P0 MOV d0, RZ;
+-:-:-:-:00 @!P1 MOV d1, RZ;
+-:-:-:-:00 @!P2 MOV d2, RZ;
+-:-:-:-:00 @!P3 MOV d3, RZ;
+// Store predicates for the cy00 / cy04 half: range checks only (PT instead of P6).
+-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, PT;
+-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, PT;
+
+-:-:-:-:00 ISETP.LT.AND P0, PT, cy00, param_m, P4;
+-:-:-:-:00 ISETP.LT.AND P1, PT, cy00, param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P2, PT, cy04, param_m, P4;
+-:-:-:-:00 ISETP.LT.AND P3, PT, cy04, param_m, P5;
+-:-:-:-:00 IADD cy00, cy00, 1;
+-:-:-:-:00 IADD cy04, cy04, 1;
+
+// beta != 0
+-:-:-:-:00 ISETP.NE.AND P6, PT, beta, RZ, PT;
+// Exchange through shared memory: write c0..c7 as two 128-bit stores, read back four scalars.
+-:-:-:-:00 STS.128 [writeCs+4x<00>], c0;
+-:-:-:-:00 STS.128 [writeCs+4x<64>], c4;
+-:-:-:-:00 LDS c0, [readCs + 4x<0*128 + 00>];
+-:-:-:-:00 LDS c1, [readCs + 4x<0*128 + 64>];
+-:-:-:-:00 LDS c2, [readCs + 4x<1*128 + 00>];
+-:-:-:-:00 LDS c3, [readCs + 4x<1*128 + 64>];
+// c = d * beta + c, skipped when beta == 0.
+-:-:-:-:00 @P6 FFMA c0, d0, beta, c0;
+-:-:-:-:00 @P6 FFMA c1, d1, beta, c1;
+-:-:-:-:00 @P6 FFMA c2, d2, beta, c2;
+-:-:-:-:00 @P6 FFMA c3, d3, beta, c3;
+// P4/P5 are re-armed with P6 for the cy08 / cy12 loads; P0..P3 still hold the store predicates.
+-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, P6;
+-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, P6;
+// Write the cy00 / cy04 half.
+-:-:-:-:00 @P0 ST.E.CG [C00y0 + 4x<00>], c0;
+-:-:-:-:00 @P1 ST.E.CG [C00y0 + 4x<64>], c1;
+-:-:-:-:00 @P2 ST.E.CG [C04y0 + 4x<00>], c2;
+-:-:-:-:00 @P3 ST.E.CG [C04y0 + 4x<64>], c3;
+// Load predicates for the cy08 / cy12 half.
+-:-:-:-:00 ISETP.LT.AND P0, PT, cy08, param_m, P4;
+-:-:-:-:00 ISETP.LT.AND P1, PT, cy08, param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P2, PT, cy12, param_m, P4;
+-:-:-:-:00 ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+-:-:-:-:00 @P0 LD.E d0, [C08y0 + 4x<00>];
+-:-:-:-:00 @P1 LD.E d1, [C08y0 + 4x<64>];
+-:-:-:-:00 @P2 LD.E d2, [C12y0 + 4x<00>];
+-:-:-:-:00 @P3 LD.E d3, [C12y0 + 4x<64>];
+-:-:-:-:00 @!P0 MOV d0, RZ;
+-:-:-:-:00 @!P1 MOV d1, RZ;
+-:-:-:-:00 @!P2 MOV d2, RZ;
+-:-:-:-:00 @!P3 MOV d3, RZ;
+// Store predicates for the cy08 / cy12 half: range checks only.
+-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, PT;
+-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, PT;
+
+-:-:-:-:00 ISETP.LT.AND P0, PT, cy08, param_m, P4;
+-:-:-:-:00 ISETP.LT.AND P1, PT, cy08, param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P2, PT, cy12, param_m, P4;
+-:-:-:-:00 ISETP.LT.AND P3, PT, cy12, param_m, P5;
+// Advance the C00y / C04y pointers by ldc1 (64-bit add with carry) and the cy08 / cy12 counters by one.
+-:-:-:-:00 IADD   C00y0.CC, C00y0, ldc1;
+-:-:-:-:00 IADD   cy08, cy08, 1;
+-:-:-:-:00 IADD   cy12, cy12, 1;
+-:-:-:-:00 IADD.X C00y1,    C00y1, RZ;
+-:-:-:-:00 IADD   C04y0.CC, C04y0, ldc1;
+-:-:-:-:00 IADD.X C04y1,    C04y1, RZ;
+// Read back the remaining four values from shared memory.
+-:-:-:-:00 LDS c0, [readCs + 4x<2*128 + 00>];
+-:-:-:-:00 LDS c1, [readCs + 4x<2*128 + 64>];
+-:-:-:-:00 LDS c2, [readCs + 4x<3*128 + 00>];
+-:-:-:-:00 LDS c3, [readCs + 4x<3*128 + 64>];
+
+-:-:-:-:00 @P6 FFMA c0, d0, beta, c0;
+-:-:-:-:00 @P6 FFMA c1, d1, beta, c1;
+-:-:-:-:00 @P6 FFMA c2, d2, beta, c2;
+-:-:-:-:00 @P6 FFMA c3, d3, beta, c3;
+// Write the cy08 / cy12 half.
+-:-:-:-:00 @P0 ST.E.CG [C08y0 + 4x<00>], c0;
+-:-:-:-:00 @P1 ST.E.CG [C08y0 + 4x<64>], c1;
+-:-:-:-:00 @P2 ST.E.CG [C12y0 + 4x<00>], c2;
+-:-:-:-:00 @P3 ST.E.CG [C12y0 + 4x<64>], c3;
+// Advance the C08y / C12y pointers by ldc1 for the next call.
+-:-:-:-:00 IADD   C08y0.CC, C08y0, ldc1;
+-:-:-:-:00 IADD.X C08y1,    C08y1, RZ;
+-:-:-:-:00 IADD   C12y0.CC, C12y0, ldc1;
+-:-:-:-:00 IADD.X C12y1,    C12y1, RZ;
+
+-:-:-:-:00 RET;
+
diff --git a/Kernel/SGEMM/Kepler/sgemm_common_128x32.sass b/Kernel/SGEMM/Kepler/sgemm_common_128x32.sass
new file mode 100644
index 0000000..6af763c
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_common_128x32.sass
@@ -0,0 +1,220 @@
+# sgemm_common_128x32
+
+-:-:-:-:00 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>];
+-:-:-:-:00 LDS.U.128 j0Bx0, [readBs + 4x<0*32  + 00 + 0*8>];
+-:-:-:-:00 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>];
+-:-:-:-:00 LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>];
+-:-:-:-:00 LDS.U.128 j1Bx0, [readBs + 4x<1*32  + 00 + 0*8>];
+-:-:-:-:00 LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>];
+
+LOOP:
+
+<CODE>
+
+    our @top;
+    our %insert;
+    our $shiftAX;
+    our $shiftBX;
+    # The four names above are package variables, so their values come from outside this block.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    # @cOrder now holds 2 x-bases * 4 y-bases * 4 swirl offsets = 32 [x, y] accumulator pairs.
+    my $out = join '', @top;
+    # Sixteen unrolled steps; each emits 32 FFMAs plus three shared-memory loads for step j + 2.
+    foreach my $j (0 .. 15)
+    {
+        my $barrier   = $j & 1 ? 2 : 1;
+        my $rsPred    = $j >= 14 ? '@P0' : '   ';
+        my $loadReg   = ($j + 2) & 3;
+        my $shareLine = ($j + 2) & 15;
+        my $shiftA    = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB    = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute   = $j & 3;
+
+        # Queue the LDS.U.128 loads after FFMA slots c0, c2 and c4; the last two steps are predicated on P0.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            # Instruction text to emit right after this FFMA, if any.
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Only the first FFMA of a step waits, on the barrier number chosen for this step.
+            my $wait   = $c == 0 ? "0$barrier" : '--';
+            # Stall is 0 when the first inserted line contains one of the listed opcodes, otherwise 1.
+            my $stall  = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+            # Yield only on slot 16, and only when the stall value is 1.
+            my $yield  = $c == 16 && $stall ? 'Y' : '-';
+            # NOTE(review): this wait:-:-:yield:stall layout differs from the "-:-:-:-:00" control codes used elsewhere in this file -- confirm the assembler accepts it.
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+
+-:-:-:-:00 MOV alpha, param_alpha;
+-:-:-:-:00 MOV beta,  param_beta;
+-:-:-:-:00 MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 32 + readBs;
+-:-:-:-:00 ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+-:-:-:-:00 IADD readBs,  readBs, -4x<szShareA>;
+-:-:-:-:00 @P0 IADD readAs,  readAs, -swapBuf;
+-:-:-:-:00 @P0 IADD readBs,  readBs, -swapBuf;
+-:-:-:-:00 ISCADD  writeCs, readAs, readBs, 3;
+
+// readCs = (((tid & 96) << 2) + (tid & 31)) << 2;
+-:-:-:-:00 LOP.AND tid31, tid, 31;
+-:-:-:-:00 LOP.AND tid96, tid, 96;
+-:-:-:-:00 ISCADD readCs, tid96, tid31, 2;
+-:-:-:-:00 SHL    readCs, readCs, 2;
+
+// cx = blkB*32 + tid31;
+-:-:-:-:00 ISCADD cx, blkB, tid31, 5;
+
+// cy = blkA*128 + (tid96 >> 1)
+-:-:-:-:00 SHR.U32 cy00, tid96, 1;
+-:-:-:-:00 ISCADD  cy00, blkA, cy00, 7;
+
+// C += (cy*ldc + cx) * 4;
+// C += (ldcz*blockZ + ldc*cy + cx00) * 4;
+-:-:-:-:00 MOV  ldc,  param_ldc;
+-:-:-:-:00 MOV  ldcz, param_ldcz;
+-:-:-:-:00 XMAD.LO  ci, ldc,  cy00, cx, xmad_c;
+-:-:-:-:00 XMAD.LO2 ci, ldcz, blkZ, ci;
+-:-:-:-:00 LEA      C00y0.CC, ci, param_C[0],     2;
+-:-:-:-:00 LEA.HI.X C00y1,    ci, param_C[1], RZ, 2;
+
+// Apply relu
+-:-:-:-:00 LOP.AND.NZ   P4, RZ, flags, 2;
+// cx < n
+-:-:-:-:00 ISETP.LT.AND P6, PT, cx, param_n, PT;
+// beta != 0
+-:-:-:-:00 ISETP.NE.AND P5, PT, beta, RZ, P6; 
+
+-:-:-:-:00 SHL ldc1, ldc, 2;
+-:-:-:-:00 SHL ldc4, ldc, 4;
+-:-:-:-:00 ISCADD ldc60, ldc, -ldc4, 8;
+
+-:-:-:-:00 IADD   C04y0.CC, C00y0, ldc4;
+-:-:-:-:00 MOV d0, RZ;
+-:-:-:-:00 IADD   cy04, cy00,  4;
+-:-:-:-:00 IADD.X C04y1,    C00y1, RZ;
+-:-:-:-:00 IADD   C08y0.CC, C04y0, ldc4;
+-:-:-:-:00 MOV d1, RZ;
+-:-:-:-:00 IADD   cy08, cy00,  8;
+-:-:-:-:00 IADD.X C08y1,    C04y1, RZ;
+-:-:-:-:00 IADD   C12y0.CC, C08y0, ldc4;
+-:-:-:-:00 MOV d2, RZ;
+-:-:-:-:00 MOV d3, RZ;
+-:-:-:-:00 IADD   cy12, cy00,  12;
+-:-:-:-:00 IADD.X C12y1,    C08y1, RZ;
+
+-:-:-:-:00 BAR.SYNC 0;
+
+<CODE>
+
+    my $out;    # assembly text accumulated over the eight rows and returned below
+    foreach my $y (0..7)    # one STORE_C call per accumulator row y0 .. y7
+    {
+        $out .=    # emitted only for $y == 4: bump all four C row pointers by ldc60 and all four cy counters by 60
+            "-:-:-:-:00 IADD   C00y0.CC, C00y0, ldc60;\n" .
+            "-:-:-:-:00 IADD   cy00,     cy00,  60;\n" .
+            "-:-:-:-:00 IADD.X C00y1,    C00y1, RZ;\n" .
+            "-:-:-:-:00 IADD   C04y0.CC, C04y0, ldc60;\n" .
+            "-:-:-:-:00 IADD   cy04,     cy04,  60;\n" .
+            "-:-:-:-:00 IADD.X C04y1,    C04y1, RZ;\n" .
+            "-:-:-:-:00 IADD   C08y0.CC, C08y0, ldc60;\n" .
+            "-:-:-:-:00 IADD   cy08,     cy08,  60;\n" .
+            "-:-:-:-:00 IADD.X C08y1,    C08y1, RZ;\n" .
+            "-:-:-:-:00 IADD   C12y0.CC, C12y0, ldc60;\n" .
+            "-:-:-:-:00 IADD   cy12,     cy12,  60;\n" .
+            "-:-:-:-:00 IADD.X C12y1,    C12y1, RZ;\n\n"  if $y == 4;
+        # Scale the four accumulators cx0y<y> .. cx3y<y> by alpha into c0 .. c3.
+        $out .= sprintf(
+            "-:-:-:-:00 FMUL c0, cx0y%d, alpha;\n" .
+            "-:-:-:-:00 FMUL c1, cx1y%d, alpha;\n" .
+            "-:-:-:-:00 FMUL c2, cx2y%d, alpha;\n" .
+            "-:-:-:-:00 FMUL c3, cx3y%d, alpha;\n",
+            ($y) x 4);
+        # STORE_C (defined after this block) writes c0 .. c3 out and advances the row pointers by one row.
+        $out .= "-:-:-:-:00 CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+-:-:-:-:00 EXIT;
+
+STORE_C:
+// Writes c0..c3 to column cx of rows cy00, cy04, cy08, cy12. Per the setup code: P4 = flags & 2 (relu), P6 = cx < n, P5 = P6 && beta != 0.
+-:-:-:-:00 ISETP.LT.AND P0, PT, cy00, param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P1, PT, cy04, param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P2, PT, cy08, param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P3, PT, cy12, param_m, P5;
+// Read the existing C values only where they will be blended in; everything else gets d = 0.
+-:-:-:-:00 @P0 LDG.E d0, [C00y];
+-:-:-:-:00 @P1 LDG.E d1, [C04y];
+-:-:-:-:00 @P2 LDG.E d2, [C08y];
+-:-:-:-:00 @P3 LDG.E d3, [C12y];
+-:-:-:-:00 @!P0 MOV d0, RZ;
+-:-:-:-:00 @!P1 MOV d1, RZ;
+-:-:-:-:00 @!P2 MOV d2, RZ;
+-:-:-:-:00 @!P3 MOV d3, RZ;
+// Store predicates: the same row checks, gated by P6 instead of P5.
+-:-:-:-:00 ISETP.LT.AND P0, PT, cy00, param_m, P6;
+-:-:-:-:00 ISETP.LT.AND P1, PT, cy04, param_m, P6;
+-:-:-:-:00 ISETP.LT.AND P2, PT, cy08, param_m, P6;
+-:-:-:-:00 ISETP.LT.AND P3, PT, cy12, param_m, P6;
+// Row counters move on by one for the next call.
+-:-:-:-:00 IADD cy00, cy00, 1;
+-:-:-:-:00 IADD cy04, cy04, 1;
+-:-:-:-:00 IADD cy08, cy08, 1;
+-:-:-:-:00 IADD cy12, cy12, 1;
+// When P4 is set, FMNMX is applied against RZ (presumably max(c, 0), matching the "Apply relu" comment in the setup code).
+-:-:-:-:00 @P4 FMNMX c0, c0, RZ, !PT;
+-:-:-:-:00 @P4 FMNMX c1, c1, RZ, !PT;
+-:-:-:-:00 @P4 FMNMX c2, c2, RZ, !PT;
+-:-:-:-:00 @P4 FMNMX c3, c3, RZ, !PT;
+// Exchange through shared memory: one 128-bit store of c0..c3, four scalar reads 32 floats apart.
+-:-:-:-:00 STS.128 [writeCs], c0;
+-:-:-:-:00 LDS c0, [readCs + 4x<0*32>];
+-:-:-:-:00 LDS c1, [readCs + 4x<1*32>];
+-:-:-:-:00 LDS c2, [readCs + 4x<2*32>];
+-:-:-:-:00 LDS c3, [readCs + 4x<3*32>];
+// c = d * beta + c, skipped when P5 is clear.
+-:-:-:-:00 @P5 FFMA c0, d0, beta, c0;
+-:-:-:-:00 @P5 FFMA c1, d1, beta, c1;
+-:-:-:-:00 @P5 FFMA c2, d2, beta, c2;
+-:-:-:-:00 @P5 FFMA c3, d3, beta, c3;
+// Write the four results.
+-:-:-:-:00 @P0 STG.E.CG [C00y], c0;
+-:-:-:-:00 @P1 STG.E.CG [C04y], c1;
+-:-:-:-:00 @P2 STG.E.CG [C08y], c2;
+-:-:-:-:00 @P3 STG.E.CG [C12y], c3;
+// Advance all four row pointers by ldc1 (64-bit add with carry).
+-:-:-:-:00 IADD   C00y0.CC, C00y0, ldc1;
+-:-:-:-:00 IADD.X C00y1,    C00y1, RZ;
+-:-:-:-:00 IADD   C04y0.CC, C04y0, ldc1;
+-:-:-:-:00 IADD.X C04y1,    C04y1, RZ;
+-:-:-:-:00 IADD   C08y0.CC, C08y0, ldc1;
+-:-:-:-:00 IADD.X C08y1,    C08y1, RZ;
+-:-:-:-:00 IADD   C12y0.CC, C12y0, ldc1;
+-:-:-:-:00 IADD.X C12y1,    C12y1, RZ;
+-:-:-:-:00 RET;
diff --git a/Kernel/SGEMM/Kepler/sgemm_nn_128x128.cu b/Kernel/SGEMM/Kepler/sgemm_nn_128x128.cu
new file mode 100644
index 0000000..7a630c8
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_nn_128x128.cu
@@ -0,0 +1,25 @@
+extern "C"
+__global__ void __launch_bounds__(256) sgemm_nn_128x128
+(  // parameter order lines up with the param_* offsets 0x140 .. 0x174 in the CONSTANT_MAPPING of sgemm_nn_128x128.sass
+ const float* param_A,
+ const float* param_B,
+ float*       param_C,
+ float        param_alpha,
+ float        param_beta,
+ int          param_lda,
+ int          param_ldb8,  // the .sass shifts this right by 5 to get ldb and adds it unshifted to the B pointer
+ int          param_ldc,
+ int          param_m,
+ int          param_n,
+ int          param_k
+ ) {
+  __shared__ float share[128 * 8 * 4 + 32];
+  // The .sass maps addr_zero to 4x<128*8*4>, presumably the byte offset of the first of the 32 extra floats.
+  int tid = threadIdx.x;
+  // NOTE(review): the statements below look like placeholder work; the Makefile compiles this file to sgemm_nn_128x128_template.cubin and passes it to KeplerAs.pl with the .sass -- confirm.
+  share[tid] = 1;
+
+  __syncthreads();
+
+  param_C[tid] = share[255 - tid];
+}
diff --git a/Kernel/SGEMM/Kepler/sgemm_nn_128x128.sass b/Kernel/SGEMM/Kepler/sgemm_nn_128x128.sass
new file mode 100644
index 0000000..a18ae65
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_nn_128x128.sass
@@ -0,0 +1,311 @@
+# Kernel: sgemm_nn_128x128
+
+<CONSTANT_MAPPING>
+  addr_zero  : 4x<128*8*4>
+
+  gridDimA : c[0x0][0x14]
+  gridDimB : c[0x0][0x18]
+
+  param_A[0]  : c[0x0][0x140]
+  param_A[1]  : c[0x0][0x144]
+  param_B[0]  : c[0x0][0x148]
+  param_B[1]  : c[0x0][0x14c]
+  param_C[0]  : c[0x0][0x150]
+  param_C[1]  : c[0x0][0x154]
+  param_alpha : c[0x0][0x158]
+  param_beta  : c[0x0][0x15c]
+  param_lda   : c[0x0][0x160]
+  param_ldb8  : c[0x0][0x164]
+  param_ldc   : c[0x0][0x168]
+  param_m     : c[0x0][0x16c]
+  param_n     : c[0x0][0x170]
+  param_k     : c[0x0][0x174]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+  64-95 ~ blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, txa, xmad_ta, xmad_tb, tid31, tid128
+
+  0-63 : czero<00-63>
+
+   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
+   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
+   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
+   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
+   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
+  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
+  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
+  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7
+
+
+  64-67 : j0Ay<0-3>
+  68-71 : j0Bx<0-3>
+  72-75 : j0Ay<4-7>
+  76-79 : j0Bx<4-7>
+  80-83 : j1Ay<0-3>
+  84-87 : j1Bx<0-3>
+  88-91 : j1Ay<4-7>
+  92-95 : j1Bx<4-7>
+
+  96-103 : loadA<0-3>, loadB<0-3>
+
+  104-107 : trackA<0-1>, trackB<0-1>
+
+  112-121 ~ writeAs, writeBs, k, txb, tidAY, tidBY, ta, tb, loop
+  122-127 ~ readAs, readBs, tid
+  128-135 ~ tmp_data, tmp_shl, tmp_param0, tmp_param1
+  144-159 ~ k<1-3>, x<1-3>
+
+  64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+  64-75 : c<0-7>, d3, d2, d1, d0
+  76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+  86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+// special to register
+// tid = 0 : 255
+// blkA = 0 : M / 128
+// blkB = 0 : N / 128
+// blkZ = 0
+-:-:-:-:00 S2R tid,  SR_TID.X;
+-:-:-:-:00 S2R blkA, SR_CTAID.Y;
+-:-:-:-:00 S2R blkB, SR_CTAID.Z;
+-:-:-:-:00 S2R blkZ, SR_CTAID.X;// blkZ=1
+
+-:-:-:-:00 MOV k,    param_k;
+-:-:-:-:00 MOV ldaz, RZ;
+-:-:-:-:00 MOV ldbz, RZ;
+-:-:-:-:00 MOV ldcz, RZ;
+-:-:-:-:00 MOV lda, param_lda;
+-:-:-:-:00 MOV ldb, param_ldb8;
+-:-:-:-:00 SHR.U32 ldb, ldb, 5;// ldb is not byte
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+<CODE>
+  join('', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+
+// tidAY = (tid & 1) << 2
+// tidAY = 0, 4
+-:-:-:-:00 LOP.AND tid1,  tid,  1;
+-:-:-:-:00 SHL     tidAY, tid1, 2;
+
+// tidAX = tid >> 1
+// tidAX = 0 : 1 : 128
+-:-:-:-:00 SHR.U32 tidAX, tid, 1;
+
+// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY)
+//       -
+//       -
+// blkA  -
+//       -
+//       -
+// tidAX ---- trackA
+//       tidAY
+-:-:-:-:00 ISCADD txa, blkA, tidAX, 7;
+-:-:-:-:00 IMAD   ta, lda, txa, tidAY;
+-:-:-:-:00 IMAD   ta, ldaz, blkZ, ta;
+// TODO(keren): 0x2?
+-:-:-:-:00 MOV tmp_param0, param_A[0];
+-:-:-:-:00 MOV tmp_param1, param_A[1];
+-:-:-:-:00 SHL tmp_shl, ta, 0x2;
+-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1;
+
+-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// tidBX = (tid & 31) << 2
+// tidBY = (tid >> 5) & 7
+// tidBX = 0 : 4 : 128
+// tidBY = 0 : 1 : 8
+-:-:-:-:00 LOP.AND tid31, tid, 31;
+-:-:-:-:00 SHL     tidBX, tid31, 2;
+-:-:-:-:00 BFE.U32 tidBY, tid, 0x305; // 3 bits at position 5
+
+// trackB += (blkB*128 + ldb*tidBY + tidBX) * 4
+//       -
+//       -
+//       -
+//       -
+// tidBY --------------- trackB
+//            blkB    tidBX
+-:-:-:-:00 ISCADD txb, blkB, tidBX, 7;
+-:-:-:-:00 IMAD tb, ldb, tidBY, txb;
+-:-:-:-:00 IMAD tb, ldbz, blkZ, tb;
+// TODO(keren): 0x2?
+-:-:-:-:00 MOV tmp_param0, param_B[0];
+-:-:-:-:00 MOV tmp_param1, param_B[1];
+-:-:-:-:00 SHL tmp_shl, tb, 0x2;
+-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1;
+
+// TODO(keren): blkB * 128 + tidBX < param_n
+-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeAs = 4 * (128 * tidAY + tidAX + 128 * 8 * 2)
+// tidAX = 0 : 1 : 128
+// tidAY = 0, 4
+// ----------------
+// ---------------- tidAY 0, 4
+// ----------------
+// ----------------
+// ---- writeAs
+// tidAX
+-:-:-:-:00 ISCADD writeAs, tidAY, tidAX, 7;
+-:-:-:-:00 ISCADD writeAs, writeAs, 4x<128*8*2>, 2;
+
+// writeBs = (128*tidBY + tidBX + 128 * 8 * 3) * 4
+// tidBX = 0 : 4 : 128
+// tidBY = 0 : 1 : 8
+// ----------------
+// ----------------
+// ---------------- 
+// ---------------- tidBY
+// ---- writeBs
+// tidBX 
+-:-:-:-:00 ISCADD writeBs, tidBY, tidBX, 7;
+-:-:-:-:00 ISCADD writeBs, writeBs, 4x<128*8*3>, 2;
+
+// (keren): A allocate 128 * 8 elements
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+// [6][5][4][0] * 4 * 4
+// readAs = 0 : 1 : 64
+-:-:-:-:00 LOP.AND readAs, tid,    0x70;
+-:-:-:-:00 SHR.U32 readAs, readAs, 3;
+-:-:-:-:00 LOP.OR  readAs, readAs, tid1;
+-:-:-:-:00 SHL     readAs, readAs, 4;
+
+// readBs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4096;
+// [7][3][2][1] * 4 * 4 + 4 * 128 * 8
+// readBs = 0 : 1 : 64
+-:-:-:-:00 LOP.AND tid128, tid,    128;
+-:-:-:-:00 BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+-:-:-:-:00 SHR.U32 readBs, tid128, 4;
+-:-:-:-:00 LOP.OR  readBs, readBs, tid7;
+-:-:-:-:00 ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+
+  REMAINDER:
+
+<CODE>
+    return q{
+      // doLoad0 = tidBY < k
+      -:-:-:-:00 IADD x1, txb, 1;
+      -:-:-:-:00 IADD x2, txb, 2;
+      -:-:-:-:00 IADD x3, txb, 3;
+
+      -:-:-:-:00 ISETP.LT.AND P0, PT, tidBY, k, P6;
+      -:-:-:-:00 ISETP.LT.AND P1, PT, x1, param_n, P0;
+      -:-:-:-:00 ISETP.LT.AND P2, PT, x2, param_n, P0;
+      -:-:-:-:00 ISETP.LT.AND P3, PT, x3, param_n, P0;
+
+      -:-:-:-:00 @P0 LD.E.CI loadB0, [trackB + 4x<0>];
+      -:-:-:-:00 @P1 LD.E.CI loadB1, [trackB + 4x<1>];
+      -:-:-:-:00 @P2 LD.E.CI loadB2, [trackB + 4x<2>];
+      -:-:-:-:00 @P3 LD.E.CI loadB3, [trackB + 4x<3>];
+
+      -:-:-:-:00 @!P0 MOV loadB0, RZ;
+      -:-:-:-:00 @!P1 MOV loadB1, RZ;
+      -:-:-:-:00 @!P2 MOV loadB2, RZ;
+      -:-:-:-:00 @!P3 MOV loadB3, RZ;
+
+      -:-:-:-:00 IADD k1, tidAY, 1;
+      -:-:-:-:00 IADD k2, tidAY, 2;
+      -:-:-:-:00 IADD k3, tidAY, 3;
+
+      -:-:-:-:00 ISETP.LT.AND P0, PT, tidAY, k, P5;
+      -:-:-:-:00 ISETP.LT.AND P1, PT, k1, k, P5;
+      -:-:-:-:00 ISETP.LT.AND P2, PT, k2, k, P5;
+      -:-:-:-:00 ISETP.LT.AND P3, PT, k3, k, P5;
+
+      -:-:-:-:00 @P0 LD.E.CI loadA0, [trackA + 4x<0>];
+      -:-:-:-:00 @P1 LD.E.CI loadA1, [trackA + 4x<1>];
+      -:-:-:-:00 @P2 LD.E.CI loadA2, [trackA + 4x<2>];
+      -:-:-:-:00 @P3 LD.E.CI loadA3, [trackA + 4x<3>];
+
+      -:-:-:-:00 @!P0 MOV loadA0, RZ;
+      -:-:-:-:00 @!P1 MOV loadA1, RZ;
+      -:-:-:-:00 @!P2 MOV loadA2, RZ;
+      -:-:-:-:00 @!P3 MOV loadA3, RZ;
+
+      // bDoRemainder = k > 8
+      -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, PT;
+
+      -:G:-:-:15 STS.128 [writeBs], loadB0;
+
+      -:G:-:-:15 STS [writeAs + 4x<0*128>], loadA0;
+      -:G:-:-:15 STS [writeAs + 4x<1*128>], loadA1;
+      -:G:-:-:15 STS [writeAs + 4x<2*128>], loadA2;
+      -:G:-:-:15 STS [writeAs + 4x<3*128>], loadA3;
+
+      -:-:-:-:00 IADD   trackB0.CC, trackB0, param_ldb8;
+      -:-:-:-:00 IADD.X trackB1, trackB1, RZ;
+
+      -:-:-:-:00 IADD   trackA0.CC, trackA0, 4x<8>;
+      -:-:-:-:00 IADD.X trackA1, trackA1, RZ;
+    };
+</CODE>
+
+
+// double buffer
+// readAs = readAs + 128 * 8 * 2
+// readBs = readBs + 128 * 8 * 2
+-:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>;
+-:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>;
+-:-:-:-:00 BAR.SYNC 0;
+// writeAs = writeAs - 128 * 8 * 2
+// writeBs = writeBs - 128 * 8 * 2
+-:-:-:-:00 LOP.XOR writeAs, writeAs, 4x<128*8*2>;
+-:-:-:-:00 LOP.XOR writeBs, writeBs, 4x<128*8*2>;
+
+<CODE>
+    my $k_end = 24;    # threshold that k is compared against in the three ISETP.GE lines below
+    our %insert =
+    (   # key "j<J>c<C>": text emitted after FFMA slot C of unrolled step J by the included common file
+        j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, $k_end, P5;\n",
+        j0c53 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        j0c63 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, $k_end, P6;\n",
+        # j1: global loads of the next four B values, predicated on P3 (set at j0c63).
+        # cannot use LDG because of the offset
+        j1c47 => "-:-:-:-:00 \@P3 LD.E.CI loadB0, [trackB + 4x<0>];\n",
+        j1c53 => "-:-:-:-:00 \@P3 LD.E.CI loadB1, [trackB + 4x<1>];\n",
+        j1c61 => "-:-:-:-:00 \@P3 LD.E.CI loadB2, [trackB + 4x<2>];\n",
+        j1c62 => "-:-:-:-:00 \@P3 LD.E.CI loadB3, [trackB + 4x<3>];\n",
+        # j2: global loads of the next four A values, predicated on P2 (set at j0c47).
+        j2c47 => "-:-:-:-:00 \@P2 LD.E.CI loadA0, [trackA + 4x<0>];\n",
+        j2c53 => "-:-:-:-:00 \@P2 LD.E.CI loadA1, [trackA + 4x<1>];\n",
+        j2c61 => "-:-:-:-:00 \@P2 LD.E.CI loadA2, [trackA + 4x<2>];\n",
+        j2c62 => "-:-:-:-:00 \@P2 LD.E.CI loadA3, [trackA + 4x<3>];\n",
+        # j3: 128-bit shared-memory store starting at loadB0, then k -= 8.
+        j3c47 => "-:-:D:S:02 \@P0 STS.128 [writeBs], loadB0;\n",
+        j3c53 => "-:-:-:-:00 IADD32I k, k, -8;\n",
+        # j4: advance the B pointer by param_ldb8 (64-bit add with carry).
+        j4c47 => "-:-:-:-:00 \@P3 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j4c53 => "-:-:-:-:00 \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+        # j5: store loadA0..loadA3 to shared memory, 128 floats apart.
+        j5c47 => "-:-:D:S:02 \@P0 STS [writeAs + 4x<0*128>], loadA0;\n",
+        j5c53 => "-:-:D:S:02 \@P0 STS [writeAs + 4x<1*128>], loadA1;\n",
+        j5c61 => "-:-:D:-:07 \@P0 STS [writeAs + 4x<2*128>], loadA2;\n",
+        j5c62 => "-:-:D:-:07 \@P0 STS [writeAs + 4x<3*128>], loadA3;\n",
+        # j6: advance the A pointer by 4x<8> (64-bit add with carry).
+        j6c47 => "-:-:-:-:00 \@P2 IADD   trackA0.CC, trackA0, 4x<8>;\n",
+        j6c53 => "-:-:-:-:00 \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+        # j6: toggle the read addresses to the other buffer, then barrier.
+        j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n",
+        j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n",
+        j6c63 => "T:-:D:S:00 BAR.SYNC 0;\n",
+        # j7: toggle the write addresses, then branch to LOOP when P0, otherwise to REMAINDER when P1.
+        j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n",
+        j7c53 => "-:-:-:-:00 \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n",
+        j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n" .
+                 "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n",
+    );
+    return;
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
+
diff --git a/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.cu b/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.cu
new file mode 100644
index 0000000..3e262ba
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.cu
@@ -0,0 +1,25 @@
+extern "C"
+__global__ void __launch_bounds__(256) sgemm_nn_128x128_vec
+(  // parameter order lines up with the param_* offsets 0x140 .. 0x174 in the CONSTANT_MAPPING of sgemm_nn_128x128_vec.sass
+ const float* param_A,
+ const float* param_B,
+ float*       param_C,
+ float        param_alpha,
+ float        param_beta,
+ int          param_lda,
+ int          param_ldb8,  
+ int          param_ldc,
+ int          param_m,
+ int          param_n,
+ int          param_k
+ ) {
+  __shared__ float share[128 * 8 * 4 + 32];
+  // The .sass maps addr_zero to 4x<128*8*4>, presumably the byte offset of the first of the 32 extra floats.
+  int tid = threadIdx.x;
+  // NOTE(review): the statements below look like placeholder work; the Makefile compiles this file to sgemm_nn_128x128_vec_template.cubin and passes it to KeplerAs.pl with the .sass -- confirm.
+  share[tid] = 1;
+
+  __syncthreads();
+
+  param_C[tid] = share[255 - tid];
+}
diff --git a/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.sass b/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.sass
new file mode 100644
index 0000000..723dadd
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.sass
@@ -0,0 +1,260 @@
+# Kernel: sgemm_nn_128x128_vec
+
+// Symbolic names for the shared-memory offset and constant-bank slots used below.
+// addr_zero is 4x<128*8*4>; the "4x<...>" form is used elsewhere in this file with
+// 4x<128*8> described as 4096, i.e. the expression times 4.
+// param_* are consecutive 4-byte slots from c[0x0][0x140]; each pointer takes two
+// slots, [0] being added with carry-out and [1] with carry-in by the code below.
+<CONSTANT_MAPPING>
+  addr_zero  : 4x<128*8*4>
+
+  gridDimA : c[0x0][0x14]
+  gridDimB : c[0x0][0x18]
+
+  param_A[0]  : c[0x0][0x140]
+  param_A[1]  : c[0x0][0x144]
+  param_B[0]  : c[0x0][0x148]
+  param_B[1]  : c[0x0][0x14c]
+  param_C[0]  : c[0x0][0x150]
+  param_C[1]  : c[0x0][0x154]
+  param_alpha : c[0x0][0x158]
+  param_beta  : c[0x0][0x15c]
+  param_lda   : c[0x0][0x160]
+  param_ldb8  : c[0x0][0x164]
+  param_ldc   : c[0x0][0x168]
+  param_m     : c[0x0][0x16c]
+  param_n     : c[0x0][0x170]
+  param_k     : c[0x0][0x174]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+  // Names for the registers used by this kernel. Several ranges overlap on purpose
+  // (for example 64-91 is listed for setup temporaries, for the j0/j1 operands and
+  // again for the c/d/C00y group at the bottom).
+  // NOTE(review): presumably ':' binds names to the listed registers in order while
+  // '~' lets the assembler pick registers from the range -- confirm in KeplerAs.
+
+  // Setup temporaries used while computing addresses.
+  64-91 ~ blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid7, txa, txb, tid31, tid128, tidBY, ta, tb, tmp_shl
+  92-93 ~ tmp_param0, tmp_param1
+
+  // Alias of registers 0-63 used when clearing the accumulators.
+  0-63 : czero<00-63>
+
+  // avoid ffma single instruction bank conflict
+  // (accumulators cx<0-7>y<0-7> are placed on a hand-picked permutation of 0-63)
+
+   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
+   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
+   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
+   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
+   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
+  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
+  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
+  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7
+
+  // Two sets (j0, j1) of A/B operand registers.
+  64-67 : j0Ay<0-3>
+  68-71 : j0Bx<0-3>
+  72-75 : j0Ay<4-7>
+  76-79 : j0Bx<4-7>
+  80-83 : j1Ay<0-3>
+  84-87 : j1Bx<0-3>
+  88-91 : j1Ay<4-7>
+  92-95 : j1Bx<4-7>
+
+  // Staging registers filled by the global loads and written to shared memory.
+  96-103 : loadA<0-3>, loadB<0-3>
+
+  // 64-bit global pointers: <0> is the low word, <1> the high word.
+  104-107 : trackA<0-1>, trackB<0-1>
+
+  108-112 ~ writeAs, writeBs, k, k_and, tidAY
+  // to avoid lds bank conflict with ffma
+  117 ~ readAs
+  116 ~ readBs
+  115 ~ tid
+
+  // Names that reuse 64-101.
+  // NOTE(review): presumably for the output stage in sgemm_common_128x128.sass -- confirm.
+  64-75 ~ ldc, ci, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+  64-75 : c<0-7>, d3, d2, d1, d0
+  76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+  86-101 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+// Prologue: read thread/block indices and parameters, clear the accumulators,
+// then derive the global pointers trackA/trackB, the bounds predicates P5/P6 and
+// the shared-memory offsets writeAs/writeBs/readAs/readBs.
+
+// Copy special registers (thread and block indices) into general registers.
+-:-:-:-:00 S2R tid,  SR_TID.X;
+-:-:-:-:00 S2R blkA, SR_CTAID.Y;
+-:-:-:-:00 S2R blkB, SR_CTAID.Z;
+-:-:-:-:00 S2R blkZ, SR_CTAID.X;//blkZ=1
+
+// ldaz/ldbz/ldcz are forced to zero, so the "* blkZ" terms below add nothing.
+-:-:-:-:00 MOV k,   param_k;
+-:-:-:-:00 MOV ldaz, RZ;
+-:-:-:-:00 MOV ldbz, RZ;
+-:-:-:-:00 MOV ldcz, RZ;
+-:-:-:-:00 MOV lda, param_lda;
+-:-:-:-:00 MOV ldb, param_ldb8;
+-:-:-:-:00 SHR.U32 ldb, ldb, 5;// ldb = param_ldb8 >> 5, no longer a byte stride (NOTE(review): presumably an element count -- confirm)
+// Store a 16-byte block sourced from RZ at addr_zero, then load it back into
+// czero00..czero63, four registers per LDS.128 (16 loads), to clear the accumulators.
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+<CODE>
+  join('', map sprintf("-:-:-:-:00 LDS.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+
+// tidAY  = (tid & 1) << 2
+-:-:-:-:00 LOP.AND tid1,  tid,  1;
+-:-:-:-:00 SHL     tidAY, tid1, 2;
+
+// tidAX = tid >> 1
+-:-:-:-:00 SHR.U32 tidAX, tid, 1;
+
+// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY)
+// txa = blkA*128 + tidAX; the shift by 2 converts the element index into bytes
+// before the 64-bit add of the param_A base pointer.
+-:-:-:-:00 ISCADD txa, blkA, tidAX, 7;
+-:-:-:-:00 IMAD   ta, lda, txa, tidAY;
+-:-:-:-:00 IMAD   ta, ldaz, blkZ, ta;
+// TODO(keren): 0x2?
+-:-:-:-:00 MOV tmp_param0, param_A[0];
+-:-:-:-:00 MOV tmp_param1, param_A[1];
+-:-:-:-:00 SHL tmp_shl, ta, 0x2;
+-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1;
+
+// P5 = (blkA*128 + tidAX) < param_m
+-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// tidBX = (tid & 31) << 2
+// tidBY = (tid >> 5) & 7
+-:-:-:-:00 LOP.AND tid31, tid, 31;
+-:-:-:-:00 SHL     tidBX, tid31, 2;
+-:-:-:-:00 BFE.U32 tidBY, tid, 0x305; // 3 bits at position 5
+
+// trackB += (blkB*128 + ldb*tidBY + tidBX) * 4
+-:-:-:-:00 ISCADD txb, blkB, tidBX, 7;
+-:-:-:-:00 IMAD tb, ldb, tidBY, txb;
+-:-:-:-:00 IMAD tb, ldbz, blkZ, tb;
+// TODO(keren): 0x2?
+-:-:-:-:00 MOV tmp_param0, param_B[0];
+-:-:-:-:00 MOV tmp_param1, param_B[1];
+-:-:-:-:00 SHL tmp_shl, tb, 0x2;
+-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1;
+
+// TODO(keren): blkB * 128 + tidBX < param_n
+// P6 = (blkB*128 + tidBX) < param_n
+-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeAs = 4 * (128 * tidAY + tidAX) + 4x<128*8*2>
+-:-:-:-:00 ISCADD writeAs, tidAY, tidAX, 7;
+-:-:-:-:00 ISCADD writeAs, writeAs, 4x<128*8*2>, 2;
+
+// writeBs = (128*tidBY + tidBX) * 4 + 4x<128*8*3>
+-:-:-:-:00 ISCADD writeBs, tidBY, tidBX, 7;
+-:-:-:-:00 ISCADD writeBs, writeBs, 4x<128*8*3>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+-:-:-:-:00 LOP.AND readAs, tid,    0x70;
+-:-:-:-:00 SHR.U32 readAs, readAs, 3;
+-:-:-:-:00 LOP.OR  readAs, readAs, tid1;
+-:-:-:-:00 SHL     readAs, readAs, 4;
+
+// (keren): A is allocated 128 * 8 elements, so B starts 4x<128*8> = 4096 bytes in
+// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096;
+-:-:-:-:00 LOP.AND tid128, tid,    128;
+-:-:-:-:00 BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+-:-:-:-:00 SHR.U32 readBs, tid128, 4;
+-:-:-:-:00 LOP.OR  readBs, readBs, tid7;
+-:-:-:-:00 ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+  // REMAINDER: guarded 128-bit loads of one A and one B vector, zero fill for
+  // threads whose guard is false, stores into shared memory, and pointer advance.
+  // It is the branch target of "@P1 BRA.U REMAINDER" in the loop insert table.
+  REMAINDER:
+
+<CODE>
+    return q{
+      // k must be a multiple of 4
+      // n must be a multiple of 4
+      //       -
+      //       -
+      //       -
+      //       -
+      // tidBY --------------- trackB ---- loadB0
+      //            blkB    tidBX
+      -:-:-:-:00 @P6 LD.E.CI.128 loadB0, [trackB];
+
+      //       -
+      //       -
+      // blkA  -
+      //       -
+      //       -
+      // tidAX ---- trackA ---- loadA0 -------- loadA4
+
+      // load if tidAY < k (tidAY == 0 if mod 4 not mod 8)
+      // P5 is narrowed in place: P5 = (tidAY < k) && P5
+      -:-:-:-:00 ISETP.LT.AND P5, PT, tidAY, k, P5;
+      -:-:-:-:00 @P5 LD.E.CI.128 loadA0, [trackA];
+
+      // bDoRemainder = k & 7 && k > 8
+      // P1 first holds (k & 7) == 0, then becomes (k > 8) && !P1
+      -:-:-:-:00 LOP.AND k_and, k, 7;
+      -:-:-:-:00 ISETP.EQ.AND P1, PT, k_and, RZ, PT;
+      -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, !P1;
+
+      // Threads that skipped a global load read from addr_zero instead
+      // (the prologue stored a block sourced from RZ there).
+      -:-:-:-:00 @!P6 LDS.128 loadB0, [RZ + addr_zero];
+      -:-:-:-:00 @!P5 LDS.128 loadA0, [RZ + addr_zero];
+
+      // ----------------------
+      // ---------------------- 
+      // ---------------------- tidBY
+      // ----- writeBS ---- loadB0
+      // tidBX
+
+      -:-:-:-:00 STS.128 [writeBs], loadB0;
+
+      // ------------------
+      // ------------------ tidAY 0, 4
+      // ------------------
+      // ------ writeAS - loadA0
+      // ---------------- loadA1
+      // ---------------- loadA2
+      // ---------------- loadA3
+      // tidAX
+      // The four A values go to four locations 4x<128> bytes apart.
+      -:-:-:-:00 STS [writeAs + 4x<0*128>], loadA0;
+      -:-:-:-:00 STS [writeAs + 4x<1*128>], loadA1;
+      -:-:-:-:00 STS [writeAs + 4x<2*128>], loadA2;
+      -:-:-:-:00 STS [writeAs + 4x<3*128>], loadA3;
+
+      // Advance the 64-bit pointers: B by param_ldb8 bytes, A by 4x<8> bytes.
+      -:-:-:-:00 IADD   trackB0.CC, trackB0, param_ldb8;
+      -:-:-:-:00 IADD.X trackB1, trackB1, RZ;
+
+      -:-:-:-:00 IADD   trackA0.CC, trackA0, 4x<8>;
+      -:-:-:-:00 IADD.X trackA1, trackA1, RZ;
+    };
+</CODE>
+
+// TODO(keren): double buffer?
+// Toggle the 4x<128*8*2> bit of the read offsets, synchronise the block, then
+// toggle the same bit of the write offsets.
+-:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>;
+-:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>;
+-:-:-:-:00 BAR.SYNC 0;
+-:-:-:-:00 LOP.XOR writeAs, writeAs, 4x<128*8*2>;
+-:-:-:-:00 LOP.XOR writeBs, writeBs, 4x<128*8*2>;
+
+// instruction align
+
+// Perl section defining %insert: SASS text keyed by slot names such as j0c47.
+// NOTE(review): the keys presumably identify insertion points inside the included
+// sgemm_common_128x128.sass main loop -- confirm there.
+<CODE>
+    # Threshold used by the k comparisons in this table.
+    my $k_end = 16;
+    our %insert =
+    (
+        # P0 must be the topmost
+        # P2 = (k >= k_end) && P5, P3 = (k >= k_end) && P6: guards for the next loads.
+        j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, $k_end, P5;\n",
+        j0c53 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, $k_end, P6;\n",
+        j0c62 => "-:G:D:-:00 \@P2 LDG.E.CI.128 loadA0, [trackA];\n",
+        j0c63 => "-:G:D:-:00 \@P3 LDG.E.CI.128 loadB0, [trackB];\n",
+
+        # Advance the low pointer words here; the carries are consumed by the j2 entries.
+        j1c47 => "-:-:-:-:00 \@P2 IADD   trackA0.CC, trackA0, 4x<8>;\n",
+        j1c53 => "-:-:-:-:00 \@P3 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+
+        j2c47 => "-:-:-:-:00 \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+        j2c53 => "-:-:-:-:00 \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        # P0 = k >= k_end is evaluated before k is decremented by 8.
+        j3c47 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        j3c53 => "-:-:-:-:00 IADD32I k, k, -8;\n",
+
+        # Each TEXDEPBAR precedes the shared-memory stores of a loaded vector.
+        j5c47 => "T:-:D:S:00 TEXDEPBAR 0x1;\n", 
+        j5c53 => "-:-:D:S:00 \@P0 STS [writeAs + 4x<0*128>], loadA0;\n",
+        j5c61 => "-:-:D:S:00 \@P0 STS [writeAs + 4x<1*128>], loadA1;\n",
+        j5c62 => "-:-:D:S:00 \@P0 STS [writeAs + 4x<2*128>], loadA2;\n",
+        j5c63 => "-:-:D:S:00 \@P0 STS [writeAs + 4x<3*128>], loadA3;\n",
+
+        j6c47 => "T:-:D:S:00 TEXDEPBAR 0x0;\n",
+        j6c53 => "-:-:D:S:00 \@P0 STS.128 [writeBs], loadB0;\n",
+
+        # Toggle the read offsets, synchronise, then toggle the write offsets.
+        j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n",
+        j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n",
+        j6c63 => "T:-:D:S:00 BAR.SYNC 0x0;\n",
+
+        j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n",
+        j7c53 => "-:-:-:-:00 \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n",
+        # Branch to LOOP while P0 holds, otherwise to REMAINDER when P1 is set.
+        j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n".
+                 "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n",
+    );
+    return;
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
+
diff --git a/Kernel/SGEMM/Kepler/sgemm_nt_128x128.cu b/Kernel/SGEMM/Kepler/sgemm_nt_128x128.cu
new file mode 100644
index 0000000..663c184
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_nt_128x128.cu
@@ -0,0 +1,25 @@
+// Stub kernel sgemm_nt_128x128: fixes the kernel name, the parameter list
+// (A, B, C pointers, alpha/beta, leading dimensions, m/n/k), the launch bound of
+// 256 threads and a shared buffer of 128*8*4 + 32 floats.
+// The body only writes 1 to share[tid] and copies share[255 - tid] to param_C[tid].
+// NOTE(review): presumably a placeholder whose machine code is replaced by the
+// hand-written sgemm_nt_128x128.sass -- confirm against the build scripts.
+extern "C"
+__global__ void __launch_bounds__(256) sgemm_nt_128x128
+(
+ const float* param_A,
+ const float* param_B,
+ float*       param_C,
+ float        param_alpha,
+ float        param_beta,
+ int          param_lda,
+ int          param_ldb,
+ int          param_ldc,
+ int          param_m,
+ int          param_n,
+ int          param_k
+ ) {
+  __shared__ float share[128 * 8 * 4 + 32];
+
+  int tid = threadIdx.x;
+
+  share[tid] = 1;
+
+  // Barrier so that every share[] slot is written before any thread reads one.
+  __syncthreads();
+
+  param_C[tid] = share[255 - tid];
+}
diff --git a/Kernel/SGEMM/Kepler/sgemm_nt_128x128.sass b/Kernel/SGEMM/Kepler/sgemm_nt_128x128.sass
new file mode 100644
index 0000000..eb48e24
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_nt_128x128.sass
@@ -0,0 +1,247 @@
+# Kernel: sgemm_nt_128x128
+
+// Symbolic names for the shared-memory offset and constant-bank slots used below.
+// addr_zero is 4x<128*8*4>; the "4x<...>" form is used elsewhere in this file with
+// 4x<128*8> described as 4096, i.e. the expression times 4.
+// param_* are consecutive 4-byte slots from c[0x0][0x140]; each pointer takes two
+// slots, [0] being added with carry-out and [1] with carry-in by the code below.
+<CONSTANT_MAPPING>
+  addr_zero  : 4x<128*8*4>
+
+  gridDimA : c[0x0][0x14]
+  gridDimB : c[0x0][0x18]
+
+  param_A[0]  : c[0x0][0x140]
+  param_A[1]  : c[0x0][0x144]
+  param_B[0]  : c[0x0][0x148]
+  param_B[1]  : c[0x0][0x14c]
+  param_C[0]  : c[0x0][0x150]
+  param_C[1]  : c[0x0][0x154]
+  param_alpha : c[0x0][0x158]
+  param_beta  : c[0x0][0x15c]
+  param_lda   : c[0x0][0x160]
+  param_ldb   : c[0x0][0x164]
+  param_ldc   : c[0x0][0x168]
+  param_m     : c[0x0][0x16c]
+  param_n     : c[0x0][0x170]
+  param_k     : c[0x0][0x174]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+  // Names for the registers used by this kernel. Several ranges overlap on purpose
+  // (64-95 is listed for setup temporaries, for the j0/j1 operands and again for the
+  // c/d/C00y group at the bottom; 86-121 also overlaps loadA/loadB, track* and 116-121).
+  // NOTE(review): presumably ':' binds names to the listed registers in order while
+  // '~' lets the assembler pick registers from the range -- confirm in KeplerAs.
+
+  64-95 ~ blkA, blkB, blkZ, tidX, blk, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, tid127, txa, txb, xmad_ta, xmad_tb, tid128
+
+  // Alias of registers 0-63 used when clearing the accumulators.
+  0-63 : czero<00-63>
+
+  // Accumulators cx<0-7>y<0-7> on a hand-picked permutation of registers 0-63.
+   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
+   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
+   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
+   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
+   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
+  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
+  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
+  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7
+
+  // Two sets (j0, j1) of A/B operand registers.
+  64-67 : j0Ay<0-3>
+  68-71 : j0Bx<0-3>
+  72-75 : j0Ay<4-7>
+  76-79 : j0Bx<4-7>
+  80-83 : j1Ay<0-3>
+  84-87 : j1Bx<0-3>
+  88-91 : j1Ay<4-7>
+  92-95 : j1Bx<4-7>
+
+  // Staging registers for global loads, and the 64-bit global pointers
+  // (<0> low word, <1> high word).
+  96-103 : loadA<0-3>,  loadB<0-3>
+  112-115 : trackA<0-1>, trackB<0-1>
+
+  116-122 ~ writeS, k, tidY, ta, tb, loop
+  123-127 ~ readAs, readBs, tid, k_and
+  128-135 ~ tmp_data, tmp_shl, tmp_param0, tmp_param1
+  // k1..k3 hold tidY+1..tidY+3 in the REMAINDER section.
+  144-150 ~ k1, k2, k3
+
+  // Names that reuse 64-121.
+  // NOTE(review): presumably for the output stage in sgemm_common_128x128.sass -- confirm.
+  64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+  64-75 : c<0-7>, d3, d2, d1, d0
+  76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+  86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+// Prologue: read thread/block indices and parameters, clear the accumulators,
+// then derive the global pointers trackA/trackB, the bounds predicates P5/P6 and
+// the shared-memory offsets writeS/readAs/readBs.
+-:-:-:-:00 S2R tid,  SR_TID.X;
+-:-:-:-:00 S2R blkA, SR_CTAID.Y;
+-:-:-:-:00 S2R blkB, SR_CTAID.Z;
+-:-:-:-:00 S2R blkZ, SR_CTAID.X;
+
+// ldaz/ldbz/ldcz are forced to zero, so the "* blkZ" terms below add nothing.
+-:-:-:-:00 MOV k,  param_k;
+-:-:-:-:00 MOV ldaz, RZ;
+-:-:-:-:00 MOV ldbz, RZ;
+-:-:-:-:00 MOV ldcz, RZ;
+-:-:-:-:00 LOP.AND tid1, tid,  1;
+
+// Store a 16-byte block sourced from RZ at addr_zero, then load it back into
+// czero00..czero63, four registers per LDS.U.128 (16 loads), to clear the accumulators.
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+<CODE>
+  join('', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+-:-:-:-:00 MOV lda, param_lda;
+-:-:-:-:00 MOV ldb, param_ldb;
+
+// tidY  = tid1 << 2
+-:-:-:-:00 SHL tidY, tid1, 2;
+
+// tidX = tid >> 1
+-:-:-:-:00 SHR.U32 tidX, tid, 1;
+
+// trackA += 4 * ((blkA * 128 + tidX) * lda + tidY)
+// txa = blkA*128 + tidX; the shift by 2 converts the element index into bytes
+// before the 64-bit add of the param_A base pointer.
+-:-:-:-:00 ISCADD txa, blkA, tidX, 7;
+-:-:-:-:00 IMAD ta, lda, txa, tidY;
+-:-:-:-:00 IMAD ta, ldaz, blkZ, ta;
+-:-:-:-:00 MOV tmp_param0, param_A[0];
+-:-:-:-:00 MOV tmp_param1, param_A[1];
+-:-:-:-:00 SHL tmp_shl, ta, 0x2;
+-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1;
+
+// trackB += 4 * ((blkB * 128 + tidX) * ldb + tidY)
+-:-:-:-:00 ISCADD txb, blkB, tidX, 7;
+-:-:-:-:00 IMAD tb, ldb, txb, tidY;
+-:-:-:-:00 IMAD tb, ldbz, blkZ, tb;
+-:-:-:-:00 MOV tmp_param0, param_B[0];
+-:-:-:-:00 MOV tmp_param1, param_B[1];
+-:-:-:-:00 SHL tmp_shl, tb, 0x2;
+-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1;
+
+// P5 = txa < param_m, P6 = txb < param_n
+-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT;
+-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeS = 4 * (128 * tidY + tidX)
+-:-:-:-:00 ISCADD writeS, tidY, tidX, 7;
+-:-:-:-:00 SHL    writeS, writeS, 2;
+
+// Toggle the 4x<128*8*2> bit so the first stores land in the other buffer half.
+-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+-:-:-:-:00 LOP.AND readAs, tid,    0x70;
+-:-:-:-:00 SHR.U32 readAs, readAs, 3;
+-:-:-:-:00 LOP.OR  readAs, readAs, tid1;
+-:-:-:-:00 SHL     readAs, readAs, 4;
+
+// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096;
+-:-:-:-:00 LOP.AND tid128, tid,  128;
+-:-:-:-:00 BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+-:-:-:-:00 SHR.U32 readBs, tid128, 4;
+-:-:-:-:00 LOP.OR  readBs, readBs, tid7;
+-:-:-:-:00 ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+// NOTE(review): the two NOPs presumably pad the code for alignment -- confirm.
+-:-:-:-:00 NOP; 
+-:-:-:-:00 NOP; 
+
+// REMAINDER: element-wise guarded loads of four A and four B values, zero fill
+// for the values that are not loaded, stores into shared memory, pointer advance
+// and buffer toggle. It is the branch target of "@P1 BRA.U REMAINDER" below.
+REMAINDER:
+
+<CODE>
+    return q{
+      // k1..k3 = tidY + 1..3, the k indices of the second to fourth element
+      -:-:-:-:00 IADD k1, tidY, 1;
+      -:-:-:-:00 IADD k2, tidY, 2;
+      -:-:-:-:00 IADD k3, tidY, 3;
+
+      // P0..P3 = (tidY + i < k) && P5 for i = 0..3: per-element guards for A
+      -:-:-:-:00 ISETP.LT.AND P0, PT, tidY, k, P5;
+      -:-:-:-:00 ISETP.LT.AND P1, PT, k1, k, P5;
+      -:-:-:-:00 ISETP.LT.AND P2, PT, k2, k, P5;
+      -:-:-:-:00 ISETP.LT.AND P3, PT, k3, k, P5;
+
+      -:-:-:-:00 @P0 LD.E.CI loadA0, [trackA + 4x<0>];
+      -:-:-:-:00 @P1 LD.E.CI loadA1, [trackA + 4x<1>];
+      -:-:-:-:00 @P2 LD.E.CI loadA2, [trackA + 4x<2>];
+      -:-:-:-:00 @P3 LD.E.CI loadA3, [trackA + 4x<3>];
+
+      // Elements that were not loaded are set to zero.
+      -:-:-:-:00 @!P0 MOV loadA0, RZ;
+      -:-:-:-:00 @!P1 MOV loadA1, RZ;
+      -:-:-:-:00 @!P2 MOV loadA2, RZ;
+      -:-:-:-:00 @!P3 MOV loadA3, RZ;
+
+      // Same guards for B, combined with P6 instead of P5 (P0..P3 are reused).
+      -:-:-:-:00 ISETP.LT.AND P0, PT, tidY, k, P6;
+      -:-:-:-:00 ISETP.LT.AND P1, PT, k1, k, P6;
+      -:-:-:-:00 ISETP.LT.AND P2, PT, k2, k, P6;
+      -:-:-:-:00 ISETP.LT.AND P3, PT, k3, k, P6;
+
+      -:-:-:-:00 @P0 LD.E.CI loadB0, [trackB + 4x<0>];
+      -:-:-:-:00 @P1 LD.E.CI loadB1, [trackB + 4x<1>];
+      -:-:-:-:00 @P2 LD.E.CI loadB2, [trackB + 4x<2>];
+      -:-:-:-:00 @P3 LD.E.CI loadB3, [trackB + 4x<3>];
+
+      -:-:-:-:00 @!P0 MOV loadB0, RZ;
+      -:-:-:-:00 @!P1 MOV loadB1, RZ;
+      -:-:-:-:00 @!P2 MOV loadB2, RZ;
+      -:-:-:-:00 @!P3 MOV loadB3, RZ;
+
+      // bDoRemainder = k & 7 && k > 8
+      // P1 holds (k & 7) == 0 here; the final ISETP.GT at the end of this section
+      // turns it into (k > 8) && !P1.
+      -:-:-:-:00 LOP.AND k_and, k, 7;
+      -:-:-:-:00 ISETP.EQ.AND P1, PT, k_and, RZ, PT;
+
+      // A values go to writeS + 4x<0..3*128>.
+      -:G:-:-:15 STS [writeS + 4x<0*128>], loadA0;
+      -:G:-:-:15 STS [writeS + 4x<1*128>], loadA1;
+      -:G:-:-:15 STS [writeS + 4x<2*128>], loadA2;
+      -:G:-:-:15 STS [writeS + 4x<3*128>], loadA3;
+
+      // B values go 4x<8*128> bytes further, at writeS + 4x<8..11*128>.
+      -:G:-:-:15 STS [writeS + 4x< 8*128>], loadB0;
+      -:G:-:-:15 STS [writeS + 4x< 9*128>], loadB1;
+      -:G:-:-:15 STS [writeS + 4x<10*128>], loadB2;
+      -:G:-:-:15 STS [writeS + 4x<11*128>], loadB3;
+
+      // Advance both 64-bit pointers by 4x<8> bytes.
+      -:-:-:-:00 IADD   trackA0.CC, trackA0, 4x<8>;
+      -:-:-:-:00 IADD.X trackA1, trackA1, RZ;
+
+      -:-:-:-:00 IADD   trackB0.CC, trackB0, 4x<8>;
+      -:-:-:-:00 IADD.X trackB1, trackB1, RZ;
+
+      // Toggle the read offsets, synchronise the block, then toggle the write offset.
+      -:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>;
+      -:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>;
+      -:-:-:-:00 BAR.SYNC 0;
+      -:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+      // P1 = (k > 8) && !((k & 7) == 0)
+      -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, !P1;
+    };
+</CODE>
+
+<CODE>
+    # Defines %insert: SASS text keyed by slot names such as j0c47.
+    # NOTE(review): the keys presumably identify insertion points inside the included
+    # sgemm_common_128x128.sass main loop -- confirm there.
+    our %insert =
+    (
+        # P2 = (k >= 16) && P5 guards the A loads, P3 = (k >= 16) && P6 the B loads,
+        # P0 = k >= 16 guards the stores, toggles, barrier and loop branch.
+        j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, 16, P5;\n",
+        j0c53 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, 16, P6;\n",
+        j0c61 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, 16, PT;\n",
+        j0c62 => "-:-:-:-:00 \@P2 LD.E.CI loadA0, [trackA + 4x<0>];\n",
+
+        j1c47 => "-:-:-:-:00 \@P2 LD.E.CI loadA1, [trackA + 4x<1>];\n",
+        j1c53 => "-:-:-:-:00 \@P2 LD.E.CI loadA2, [trackA + 4x<2>];\n",
+        j1c61 => "-:-:-:-:00 \@P2 LD.E.CI loadA3, [trackA + 4x<3>];\n",
+        j1c62 => "-:-:-:-:00 \@P3 LD.E.CI loadB0, [trackB + 4x<0>];\n",
+
+        # k is decremented after the three predicates above were computed.
+        j2c47 => "-:-:-:-:00 \@P3 LD.E.CI loadB1, [trackB + 4x<1>];\n",
+        j2c53 => "-:-:-:-:00 IADD32I k, k, -8;\n",
+        j2c61 => "-:-:-:-:00 \@P3 LD.E.CI loadB2, [trackB + 4x<2>];\n",
+        j2c62 => "-:-:-:-:00 \@P3 LD.E.CI loadB3, [trackB + 4x<3>];\n",
+
+        j3c47 => "-:-:-:-:00 \@P2 IADD   trackA0.CC, trackA0, 4x<8>;\n",
+        j3c53 => "-:-:-:-:00 \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        # A values go to writeS + 4x<0..3*128>, B values to writeS + 4x<8..11*128>.
+        j4c47 => "-:-:D:S:02 \@P0 STS [writeS + 4x<0*128>], loadA0;\n",
+        j4c53 => "-:-:D:S:02 \@P0 STS [writeS + 4x<1*128>], loadA1;\n",
+        j4c61 => "-:-:D:-:07 \@P0 STS [writeS + 4x<2*128>], loadA2;\n",
+        j4c62 => "-:-:D:-:07 \@P0 STS [writeS + 4x<3*128>], loadA3;\n",
+
+        j5c47 => "-:-:D:S:02 \@P0 STS [writeS + 4x< 8*128>], loadB0;\n",
+        j5c53 => "-:-:D:S:02 \@P0 STS [writeS + 4x< 9*128>], loadB1;\n",
+        j5c61 => "-:-:D:-:07 \@P0 STS [writeS + 4x<10*128>], loadB2;\n",
+        j5c62 => "-:-:D:-:07 \@P0 STS [writeS + 4x<11*128>], loadB3;\n",
+
+        j6c47 => "-:-:-:-:00 \@P3 IADD   trackB0.CC, trackB0, 4x<8>;\n",
+        j6c53 => "-:-:-:-:00 \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        # Toggle the read offsets, synchronise, then toggle the write offset.
+        j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n",
+        j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n",
+        j6c63 => "T:-:D:S:00 \@P0 BAR.SYNC 0;\n",
+
+        j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n",
+
+        # Branch to LOOP while P0 holds, otherwise to REMAINDER when P1 is set.
+        j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n".
+                 "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n",
+    );
+    return;
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
+
diff --git a/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.cu b/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.cu
new file mode 100644
index 0000000..7cf98a6
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.cu
@@ -0,0 +1,25 @@
+// Stub kernel sgemm_nt_128x128_vec: fixes the kernel name, the parameter list
+// (A, B, C pointers, alpha/beta, leading dimensions, m/n/k), the launch bound of
+// 256 threads and a shared buffer of 128*8*4 + 32 floats.
+// The body only writes 1 to share[tid] and copies share[255 - tid] to param_C[tid].
+// NOTE(review): presumably a placeholder whose machine code is replaced by the
+// hand-written sgemm_nt_128x128_vec.sass -- confirm against the build scripts.
+extern "C"
+__global__ void __launch_bounds__(256) sgemm_nt_128x128_vec
+(
+ const float* param_A,
+ const float* param_B,
+ float*       param_C,
+ float        param_alpha,
+ float        param_beta,
+ int          param_lda,
+ int          param_ldb,
+ int          param_ldc,
+ int          param_m,
+ int          param_n,
+ int          param_k
+ ) {
+  __shared__ float share[128 * 8 * 4 + 32];
+
+  int tid = threadIdx.x;
+
+  share[tid] = 1;
+
+  // Barrier so that every share[] slot is written before any thread reads one.
+  __syncthreads();
+
+  param_C[tid] = share[255 - tid];
+}
diff --git a/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.sass b/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.sass
new file mode 100644
index 0000000..4084d3d
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.sass
@@ -0,0 +1,222 @@
+# Kernel: sgemm_nt_128x128_vec
+
+// Symbolic names for the shared-memory offset and constant-bank slots used below.
+// addr_zero is 4x<128*8*4>; the "4x<...>" form is used elsewhere in this file with
+// 4x<128*8> described as 4096, i.e. the expression times 4.
+// param_* are consecutive 4-byte slots from c[0x0][0x140]; each pointer takes two
+// slots, [0] being added with carry-out and [1] with carry-in by the code below.
+<CONSTANT_MAPPING>
+  addr_zero  : 4x<128*8*4>
+
+  gridDimA : c[0x0][0x14]
+  gridDimB : c[0x0][0x18]
+
+  param_A[0]  : c[0x0][0x140]
+  param_A[1]  : c[0x0][0x144]
+  param_B[0]  : c[0x0][0x148]
+  param_B[1]  : c[0x0][0x14c]
+  param_C[0]  : c[0x0][0x150]
+  param_C[1]  : c[0x0][0x154]
+  param_alpha : c[0x0][0x158]
+  param_beta  : c[0x0][0x15c]
+  param_lda   : c[0x0][0x160]
+  param_ldb   : c[0x0][0x164]
+  param_ldc   : c[0x0][0x168]
+  param_m     : c[0x0][0x16c]
+  param_n     : c[0x0][0x170]
+  param_k     : c[0x0][0x174]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+  // Names for the registers used by this kernel. Several ranges overlap on purpose
+  // (for example 64-91 is listed for setup temporaries, for the j0/j1 operands and
+  // again for the c/d/C00y group at the bottom).
+  // NOTE(review): presumably ':' binds names to the listed registers in order while
+  // '~' lets the assembler pick registers from the range -- confirm in KeplerAs.
+
+  64-91 ~ blkA, blkB, blkZ, tidX, lda, ldb, ldaz, ldbz, tid1, tid7, txa, txb, tid128, ta, tb, tmp_shl
+  92-93 : tmp_param0, tmp_param1
+
+  // Alias of registers 0-63 used when clearing the accumulators.
+  0-63 : czero<00-63>
+
+  // Accumulators cx<0-7>y<0-7> on a hand-picked permutation of registers 0-63.
+   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
+   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
+   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
+   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
+   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
+  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
+  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
+  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7
+
+  // Two sets (j0, j1) of A/B operand registers.
+  64-67 : j0Ay<0-3>
+  68-71 : j0Bx<0-3>
+  72-75 : j0Ay<4-7>
+  76-79 : j0Bx<4-7>
+  80-83 : j1Ay<0-3>
+  84-87 : j1Bx<0-3>
+  88-91 : j1Ay<4-7>
+  92-95 : j1Bx<4-7>
+
+  // Staging registers for global loads, and the 64-bit global pointers
+  // (<0> low word, <1> high word).
+  96-103 : loadA<0-3>,  loadB<0-3>
+  104-107 : trackA<0-1>, trackB<0-1>
+
+  108-111 ~ writeS, k, k_and, tidY
+  117 ~ readAs
+  116 ~ readBs
+  115 ~ tid
+
+  // Names that reuse 64-101.
+  // NOTE(review): presumably for the output stage in sgemm_common_128x128.sass -- confirm.
+  64-75 ~ ldc, ci, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+  64-75 : c<0-7>, d3, d2, d1, d0
+  76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+  86-101 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+// Prologue: read thread/block indices and parameters, clear the accumulators,
+// then derive the global pointers trackA/trackB, the bounds predicates P5/P6 and
+// the shared-memory offsets writeS/readAs/readBs.
+-:-:-:-:00 S2R tid,  SR_TID.X;
+-:-:-:-:00 S2R blkA, SR_CTAID.Y;
+-:-:-:-:00 S2R blkB, SR_CTAID.Z;
+-:-:-:-:00 S2R blkZ, SR_CTAID.X;
+
+// ldaz/ldbz/ldcz are forced to zero, so the "* blkZ" terms below add nothing.
+-:-:-:-:00 MOV k,  param_k;
+-:-:-:-:00 MOV ldaz, RZ;
+-:-:-:-:00 MOV ldbz, RZ;
+-:-:-:-:00 MOV ldcz, RZ;
+-:-:-:-:00 LOP.AND tid1, tid,  1;
+
+// Store a 16-byte block sourced from RZ at addr_zero, then load it back into
+// czero00..czero63, four registers per LDS.U.128 (16 loads), to clear the accumulators.
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+<CODE>
+  join('', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+-:-:-:-:00 MOV lda, param_lda;
+-:-:-:-:00 MOV ldb, param_ldb;
+
+// tidY  = tid1 << 2
+-:-:-:-:00 SHL tidY, tid1, 2;
+
+// tidX = tid >> 1
+-:-:-:-:00 SHR.U32 tidX, tid, 1;
+
+// trackA += 4 * ((blkA * 128 + tidX) * lda + tidY)
+// txa = blkA*128 + tidX; the shift by 2 converts the element index into bytes
+// before the 64-bit add of the param_A base pointer.
+-:-:-:-:00 ISCADD txa, blkA, tidX, 7;
+-:-:-:-:00 IMAD ta, lda, txa, tidY;
+-:-:-:-:00 IMAD ta, ldaz, blkZ, ta;
+-:-:-:-:00 MOV tmp_param0, param_A[0];
+-:-:-:-:00 MOV tmp_param1, param_A[1];
+-:-:-:-:00 SHL tmp_shl, ta, 0x2;
+-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1;
+
+// trackB += 4 * ((blkB * 128 + tidX) * ldb + tidY)
+-:-:-:-:00 ISCADD txb, blkB, tidX, 7;
+-:-:-:-:00 IMAD tb, ldb, txb, tidY;
+-:-:-:-:00 IMAD tb, ldbz, blkZ, tb;
+-:-:-:-:00 MOV tmp_param0, param_B[0];
+-:-:-:-:00 MOV tmp_param1, param_B[1];
+-:-:-:-:00 SHL tmp_shl, tb, 0x2;
+-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1;
+
+// P5 = txa < param_m, P6 = txb < param_n
+-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT;
+-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeS = 4 * (128 * tidY + tidX)
+-:-:-:-:00 ISCADD writeS, tidY, tidX, 7;
+-:-:-:-:00 SHL    writeS, writeS, 2;
+
+// Toggle the 4x<128*8*2> bit so the first stores land in the other buffer half.
+-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+-:-:-:-:00 LOP.AND readAs, tid,    0x70;
+-:-:-:-:00 SHR.U32 readAs, readAs, 3;
+-:-:-:-:00 LOP.OR  readAs, readAs, tid1;
+-:-:-:-:00 SHL     readAs, readAs, 4;
+
+// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096;
+-:-:-:-:00 LOP.AND tid128, tid,  128;
+-:-:-:-:00 BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+-:-:-:-:00 SHR.U32 readBs, tid128, 4;
+-:-:-:-:00 LOP.OR  readBs, readBs, tid7;
+-:-:-:-:00 ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+// NOTE(review): the two NOPs presumably pad the code for alignment -- confirm.
+-:-:-:-:00 NOP; 
+-:-:-:-:00 NOP; 
+
+// REMAINDER: guarded 128-bit loads of one A and one B vector, zero fill for
+// threads whose guard is false, stores into shared memory, pointer advance and
+// buffer toggle. It is the branch target of "@P1 BRA.U REMAINDER" below.
+REMAINDER:
+
+<CODE>
+    return q{
+      // k must be multiple of 4
+      // load if tidY < k (tidY == 0 if mod 4 not mod 8)
+      // P5 is narrowed in place: P5 = (tidY < k) && P5
+      -:-:-:-:00 ISETP.LT.AND P5, PT, tidY, k, P5;
+      -:-:-:-:00 @P5 LD.E.CI.128 loadA0, [trackA + 4x<0>];
+
+      // load if tidY < k (tidY == 0 if mod 4 not mod 8)
+      // P6 is narrowed in place: P6 = (tidY < k) && P6
+      -:-:-:-:00 ISETP.LT.AND P6, PT, tidY, k, P6;
+      -:-:-:-:00 @P6 LD.E.CI.128 loadB0, [trackB + 4x<0>];
+
+      // Threads that skipped a global load read from addr_zero instead
+      // (the prologue stored a block sourced from RZ there).
+      -:-:-:-:00 @!P5 LDS.128 loadA0, [RZ + addr_zero];
+      -:-:-:-:00 @!P6 LDS.128 loadB0, [RZ + addr_zero];
+
+      // bDoRemainder = k & 7 && k > 8
+      // P1 first holds (k & 7) == 0, then becomes (k > 8) && !P1
+      -:-:-:-:00 LOP.AND k_and, k, 7;
+      -:-:-:-:00 ISETP.EQ.AND P1, PT, k_and, RZ, PT;
+      -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, !P1;
+
+      // A values go to writeS + 4x<0..3*128>.
+      -:-:-:-:00 STS [writeS + 4x<0*128>], loadA0;
+      -:-:-:-:00 STS [writeS + 4x<1*128>], loadA1;
+      -:-:-:-:00 STS [writeS + 4x<2*128>], loadA2;
+      -:-:-:-:00 STS [writeS + 4x<3*128>], loadA3;
+
+      // B values go 4x<8*128> bytes further, at writeS + 4x<8..11*128>.
+      -:-:-:-:00 STS [writeS + 4x< 8*128>], loadB0;
+      -:-:-:-:00 STS [writeS + 4x< 9*128>], loadB1;
+      -:-:-:-:00 STS [writeS + 4x<10*128>], loadB2;
+      -:-:-:-:00 STS [writeS + 4x<11*128>], loadB3;
+
+      // Advance both 64-bit pointers by 4x<8> bytes.
+      -:-:-:-:00 IADD   trackA0.CC, trackA0, 4x<8>;
+      -:-:-:-:00 IADD.X trackA1, trackA1, RZ;
+
+      -:-:-:-:00 IADD   trackB0.CC, trackB0, 4x<8>;
+      -:-:-:-:00 IADD.X trackB1, trackB1, RZ;
+
+      // Toggle the read offsets, synchronise the block, then toggle the write offset.
+      -:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>;
+      -:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>;
+      -:-:-:-:00 BAR.SYNC 0;
+      -:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>;
+    };
+</CODE>
+
+<CODE>
+    # Defines %insert: SASS text keyed by slot names such as j0c47.
+    # NOTE(review): the keys presumably identify insertion points inside the included
+    # sgemm_common_128x128.sass main loop -- confirm there.
+    # Threshold used by the k comparisons in this table.
+    my $k_end = 16;
+    our %insert =
+    (
+        # P2 = (k >= k_end) && P5, P3 = (k >= k_end) && P6: guards for the next loads.
+        j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, $k_end, P5;\n",
+        j0c53 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, $k_end, P6;\n",
+        j0c61 => "-:G:D:-:07 \@P2 LDG.E.CI.128 loadA0, [trackA];\n",
+        j0c62 => "-:G:D:-:07 \@P3 LDG.E.CI.128 loadB0, [trackB];\n",
+
+        # Advance the low pointer words here; the carries are consumed by the j2 entries.
+        j1c47 => "-:-:-:-:00 \@P2 IADD trackA0.CC, trackA0, 4x<8>;\n",
+        j1c53 => "-:-:-:-:00 \@P3 IADD trackB0.CC, trackB0, 4x<8>;\n",
+
+        j2c47 => "-:-:-:-:00 \@P2 IADD.X trackA1, trackA1, RZ;\n",
+        j2c53 => "-:-:-:-:00 \@P3 IADD.X trackB1, trackB1, RZ;\n",
+
+        # P0 = k >= k_end is evaluated before k is decremented by 8.
+        j3c47 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        j3c53 => "-:-:-:-:00 IADD32I k, k, -8;\n",
+
+        j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n",
+        j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n",
+        # One slot carries the whole store sequence: TEXDEPBAR, the four B stores,
+        # the four A stores, the write-offset toggle and the barrier, with NOPs between.
+        j6c63 => "T:-:D:S:00 TEXDEPBAR 0x0;\n".
+                 "-:G:-:-:15 \@P0 STS [writeS + 4x<8*128>], loadB0;\n".
+                 "-:G:-:-:15 \@P0 STS [writeS + 4x<9*128>], loadB1;\n".
+                 "-:G:-:-:15 \@P0 STS [writeS + 4x<10*128>], loadB2;\n".
+                 "-:G:-:-:15 \@P0 STS [writeS + 4x<11*128>], loadB3;\n".
+                 "-:-:-:-:00 NOP;\n".
+                 "-:-:-:-:00 NOP;\n".
+                 "-:-:-:-:00 NOP;\n".
+                 "-:G:-:-:15 \@P0 STS [writeS + 4x<0*128>], loadA0;\n".
+                 "-:G:-:-:15 \@P0 STS [writeS + 4x<1*128>], loadA1;\n".
+                 "-:G:-:-:15 \@P0 STS [writeS + 4x<2*128>], loadA2;\n".
+                 "-:G:-:-:15 \@P0 STS [writeS + 4x<3*128>], loadA3;\n".
+                 "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n".
+                 "-:-:-:-:00 NOP;\n".
+                 "T:-:D:S:00 \@P0 BAR.SYNC 0;\n",
+
+        # Branch to LOOP while P0 holds, otherwise to REMAINDER when P1 is set.
+        j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n".
+                 "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n",
+    );
+    return;
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
+
+
diff --git a/Kernel/SGEMM/Kepler/sgemm_tn_128x128.cu b/Kernel/SGEMM/Kepler/sgemm_tn_128x128.cu
new file mode 100644
index 0000000..c17da1a
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_tn_128x128.cu
@@ -0,0 +1,25 @@
+// Stub kernel sgemm_tn_128x128: fixes the kernel name, the parameter list
+// (A, B, C pointers, alpha/beta, leading dimensions, m/n/k), the launch bound of
+// 256 threads and a shared buffer of 128*8*4 + 32 floats.
+// The body only writes 1 to share[tid] and copies share[255 - tid] to param_C[tid].
+// NOTE(review): presumably a placeholder whose machine code is replaced by the
+// hand-written sgemm_tn_128x128.sass -- confirm against the build scripts.
+extern "C"
+__global__ void __launch_bounds__(256) sgemm_tn_128x128
+(
+ const float* param_A,
+ const float* param_B,
+ float*       param_C,
+ float        param_alpha,
+ float        param_beta,
+ int          param_lda8,
+ int          param_ldb8,  
+ int          param_ldc,
+ int          param_m,
+ int          param_n,
+ int          param_k
+ ) {
+  __shared__ float share[128 * 8 * 4 + 32];
+
+  int tid = threadIdx.x;
+
+  share[tid] = 1;
+
+  // Barrier so that every share[] slot is written before any thread reads one.
+  __syncthreads();
+
+  param_C[tid] = share[255 - tid];
+}
diff --git a/Kernel/SGEMM/Kepler/sgemm_tn_128x128.sass b/Kernel/SGEMM/Kepler/sgemm_tn_128x128.sass
new file mode 100644
index 0000000..0c03a6e
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_tn_128x128.sass
@@ -0,0 +1,241 @@
+# Kernel: sgemm_tn_128x128
+
+// Symbolic names for the shared-memory offset and constant-bank slots used below.
+// addr_zero is 4x<128*8*4>; the "4x<...>" form is used elsewhere in this file with
+// 4x<128*8> described as 4096, i.e. the expression times 4.
+// param_* are consecutive 4-byte slots from c[0x0][0x140]; each pointer takes two
+// slots, [0] being added with carry-out and [1] with carry-in by the code below.
+<CONSTANT_MAPPING>
+  addr_zero  : 4x<128*8*4>
+
+  gridDimA : c[0x0][0x14]
+  gridDimB : c[0x0][0x18]
+
+  param_A[0]  : c[0x0][0x140]
+  param_A[1]  : c[0x0][0x144]
+  param_B[0]  : c[0x0][0x148]
+  param_B[1]  : c[0x0][0x14c]
+  param_C[0]  : c[0x0][0x150]
+  param_C[1]  : c[0x0][0x154]
+  param_alpha : c[0x0][0x158]
+  param_beta  : c[0x0][0x15c]
+  param_lda8  : c[0x0][0x160]
+  param_ldb8  : c[0x0][0x164]
+  param_ldc   : c[0x0][0x168]
+  param_m     : c[0x0][0x16c]
+  param_n     : c[0x0][0x170]
+  param_k     : c[0x0][0x174]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+  // Names for the registers used by this kernel. Several ranges overlap on purpose
+  // (64-95 is listed for setup temporaries, for the j0/j1 operands and again for the
+  // c/d/C00y group at the bottom; 86-121 also overlaps loadA/loadB, track* and 108-121).
+  // NOTE(review): presumably ':' binds names to the listed registers in order while
+  // '~' lets the assembler pick registers from the range -- confirm in KeplerAs.
+
+  64-95 ~ blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, tid1, tid7, tidX, blk, tid31, tid128
+
+  // Alias of registers 0-63 used when clearing the accumulators.
+  0-63 : czero<00-63>
+
+  // Accumulators cx<0-7>y<0-7> on a hand-picked permutation of registers 0-63.
+   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
+   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
+   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
+   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
+   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
+  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
+  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
+  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7
+
+
+  // Two sets (j0, j1) of A/B operand registers.
+  64-67 : j0Ay<0-3>
+  68-71 : j0Bx<0-3>
+  72-75 : j0Ay<4-7>
+  76-79 : j0Bx<4-7>
+  80-83 : j1Ay<0-3>
+  84-87 : j1Bx<0-3>
+  88-91 : j1Ay<4-7>
+  92-95 : j1Bx<4-7>
+
+  // Staging registers filled by the global loads and written to shared memory.
+  96-103 : loadA<0-3>, loadB<0-3>
+
+  // 64-bit global pointers: <0> is the low word, <1> the high word.
+  104-107 : trackA<0-1>, trackB<0-1>
+
+  108-121 ~ writeS, lda8, k, tidY, txa, txb, ta, tb, loop
+  122-127 ~ readAs, readBs, tid
+  128-135 ~ tmp_data, tmp_shl, tmp_param0, tmp_param1
+  // x<1-3>/y<1-3> hold txa+1..3 and txb+1..3 in the REMAINDER section.
+  144-155 ~ x<1-3>, y<1-3>
+
+  // Names that reuse 64-121.
+  // NOTE(review): presumably for the output stage in sgemm_common_128x128.sass -- confirm.
+  64-75 ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+  64-75 : c<0-7>, d3, d2, d1, d0
+  76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+  86-121 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+// Prologue: read thread/block indices and parameters, clear the accumulators,
+// then derive the global pointers trackA/trackB, the bounds predicates P5/P6 and
+// the shared-memory offsets writeS/readAs/readBs.
+-:-:-:-:00 S2R tid,  SR_TID.X;
+-:-:-:-:00 S2R blkA, SR_CTAID.Y;
+-:-:-:-:00 S2R blkB, SR_CTAID.Z;
+-:-:-:-:00 S2R blkZ, SR_CTAID.X;
+
+// ldaz/ldbz/ldcz are forced to zero, so the "* blkZ" terms below add nothing.
+-:-:-:-:00 MOV k, param_k;
+-:-:-:-:00 MOV ldaz, RZ;
+-:-:-:-:00 MOV ldbz, RZ;
+-:-:-:-:00 MOV ldcz, RZ;
+// Store a 16-byte block sourced from RZ at addr_zero, then load it back into
+// czero00..czero63, four registers per LDS.U.128 (16 loads), to clear the accumulators.
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+<CODE>
+  join('', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+
+// tidX = (tid & 31) << 2
+// tidY = (tid >> 5) & 7
+-:-:-:-:00 LOP.AND tid31, tid, 31;
+-:-:-:-:00 SHL     tidX, tid31, 2;
+-:-:-:-:00 BFE.U32 tidY, tid, 0x305; // 3 bits at position 5
+
+// lda = param_lda8 >> 5, ldb = param_ldb8 >> 5
+// NOTE(review): presumably converts byte strides for 8 rows of floats into
+// element counts -- confirm against the host code.
+-:-:-:-:00 MOV lda, param_lda8;
+-:-:-:-:00 MOV ldb, param_ldb8;
+-:-:-:-:00 SHR.U32 lda, lda, 5;
+-:-:-:-:00 SHR.U32 ldb, ldb, 5;
+
+// trackA += (blkA*128 + lda*tidY + tidX) * 4
+// (the SHL by 0x2 below scales the element index by 4)
+-:-:-:-:00 ISCADD txa, blkA, tidX, 7;
+-:-:-:-:00 IMAD ta, lda, tidY, txa;
+-:-:-:-:00 IMAD ta, ldaz, blkZ, ta;
+-:-:-:-:00 MOV tmp_param0, param_A[0];
+-:-:-:-:00 MOV tmp_param1, param_A[1];
+-:-:-:-:00 SHL tmp_shl, ta, 0x2;
+-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1;
+
+// P5 = (blkA*128 + tidX) < param_m
+-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// trackB += (blkB*128 + ldb*tidY + tidX) * 4
+-:-:-:-:00 ISCADD txb, blkB, tidX, 7;
+-:-:-:-:00 IMAD tb, ldb, tidY, txb;
+-:-:-:-:00 IMAD tb, ldbz, blkZ, tb;
+-:-:-:-:00 MOV tmp_param0, param_B[0];
+-:-:-:-:00 MOV tmp_param1, param_B[1];
+-:-:-:-:00 SHL tmp_shl, tb, 0x2;
+-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1;
+
+// P6 = (blkB*128 + tidX) < param_n
+-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeS = (128*tidY + tidX) * 4, then the 4x<128*8*2> bit is toggled
+-:-:-:-:00 ISCADD  writeS, tidY, tidX, 7;
+-:-:-:-:00 SHL     writeS, writeS, 2;
+-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+-:-:-:-:00 LOP.AND tid1,   tid,  1;
+-:-:-:-:00 LOP.AND readAs, tid,    0x70;
+-:-:-:-:00 SHR.U32 readAs, readAs, 3;
+-:-:-:-:00 LOP.OR  readAs, readAs, tid1;
+-:-:-:-:00 SHL     readAs, readAs, 4;
+
+// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096;
+-:-:-:-:00 LOP.AND tid128, tid,  128;
+-:-:-:-:00 BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+-:-:-:-:00 SHR.U32 readBs, tid128, 4;
+-:-:-:-:00 LOP.OR  readBs, readBs, tid7;
+-:-:-:-:00 ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+// NOTE(review): the four NOPs presumably pad the code for alignment -- confirm.
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+
+// REMAINDER: element-wise guarded loads of four A and four B values, zero fill
+// for the values that are not loaded, then (after the CODE section) 128-bit
+// stores into shared memory, pointer advance and buffer toggle.
+// It is the branch target of "@P1 BRA.U REMAINDER" in the loop insert table.
+REMAINDER:
+
+<CODE>
+    return q{
+      // doLoadA = tidY < k && txa < m
+      // doLoadB = tidY < k && txb < n
+      // x1..x3 = txa + 1..3; P1..P3 additionally require x<i> < param_m
+      -:-:-:-:00 IADD x1, txa, 1;
+      -:-:-:-:00 IADD x2, txa, 2;
+      -:-:-:-:00 IADD x3, txa, 3;
+      -:-:-:-:00 ISETP.LT.AND P0, PT, tidY, k, P5;
+      -:-:-:-:00 ISETP.LT.AND P1, PT, x1, param_m, P0;
+      -:-:-:-:00 ISETP.LT.AND P2, PT, x2, param_m, P0;
+      -:-:-:-:00 ISETP.LT.AND P3, PT, x3, param_m, P0;
+
+      -:-:-:-:00 @P0 LD.E.CI loadA0, [trackA + 4x<0>];
+      -:-:-:-:00 @P1 LD.E.CI loadA1, [trackA + 4x<1>];
+      -:-:-:-:00 @P2 LD.E.CI loadA2, [trackA + 4x<2>];
+      -:-:-:-:00 @P3 LD.E.CI loadA3, [trackA + 4x<3>];
+
+      // Elements that were not loaded are set to zero.
+      -:-:-:-:00 @!P0 MOV loadA0, RZ;
+      -:-:-:-:00 @!P1 MOV loadA1, RZ;
+      -:-:-:-:00 @!P2 MOV loadA2, RZ;
+      -:-:-:-:00 @!P3 MOV loadA3, RZ;
+
+      // y1..y3 = txb + 1..3; same guards for B against param_n (P0..P3 are reused).
+      -:-:-:-:00 IADD y1, txb, 1;
+      -:-:-:-:00 IADD y2, txb, 2;
+      -:-:-:-:00 IADD y3, txb, 3;
+      -:-:-:-:00 ISETP.LT.AND P0, PT, tidY, k, P6;
+      -:-:-:-:00 ISETP.LT.AND P1, PT, y1, param_n, P0;
+      -:-:-:-:00 ISETP.LT.AND P2, PT, y2, param_n, P0;
+      -:-:-:-:00 ISETP.LT.AND P3, PT, y3, param_n, P0;
+
+      -:-:-:-:00 @P0 LD.E.CI loadB0, [trackB + 4x<0>];
+      -:-:-:-:00 @P1 LD.E.CI loadB1, [trackB + 4x<1>];
+      -:-:-:-:00 @P2 LD.E.CI loadB2, [trackB + 4x<2>];
+      -:-:-:-:00 @P3 LD.E.CI loadB3, [trackB + 4x<3>];
+
+      -:-:-:-:00 @!P0 MOV loadB0, RZ;
+      -:-:-:-:00 @!P1 MOV loadB1, RZ;
+      -:-:-:-:00 @!P2 MOV loadB2, RZ;
+      -:-:-:-:00 @!P3 MOV loadB3, RZ;
+
+      // P1 = k > 8 (no k & 7 test in this kernel)
+      -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, PT;
+    };
+</CODE>
+
+// Store the four A values with one 128-bit store at writeS.
+-:-:-:-:00 STS.128 [writeS + 4x<0*128>], loadA0;
+
+// Advance the 64-bit A pointer by param_lda8 bytes.
+-:-:-:-:00 IADD   trackA0.CC, trackA0, param_lda8;
+-:-:-:-:00 IADD.X trackA1, trackA1, RZ;
+
+// Store the four B values 4x<8*128> bytes further.
+-:-:-:-:00 STS.128 [writeS + 4x<8*128>], loadB0;
+
+// Low word of the B pointer advance; the matching IADD.X is the last
+// instruction of this section.
+-:-:-:-:00 IADD trackB0.CC, trackB0, param_ldb8;
+
+// Toggle the read offsets, synchronise the block, then toggle the write offset.
+-:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>;
+-:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>;
+-:-:-:-:00 BAR.SYNC 0;
+-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+// High word of the B pointer advance, using the carry of the IADD above.
+// NOTE(review): relies on the intervening instructions leaving CC untouched -- confirm.
+-:-:-:-:00 IADD.X trackB1, trackB1, RZ;
+
+<CODE>
+    # Defines %insert: SASS text keyed by slot names such as j0c47.
+    # NOTE(review): the keys presumably identify insertion points inside the included
+    # sgemm_common_128x128.sass main loop -- confirm there.
+    # Threshold used by the k comparisons; here k is decremented in slot j7c53,
+    # after the comparisons and loads of the same pass.
+    my $k_end = 24;
+    our %insert =
+    (
+        # P2 = (k >= k_end) && P5 guards the A loads, P3 = (k >= k_end) && P6 the B
+        # loads, P0 = k >= k_end guards the stores, toggles, barrier and loop branch.
+        j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, $k_end, P5;\n",
+        j0c53 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, $k_end, P6;\n",
+        j0c61 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        j0c62 => "-:-:-:-:00 \@P2 LD.E.CI loadA0, [trackA + 4x<0>];\n",
+        j0c63 => "-:-:-:-:00 \@P2 LD.E.CI loadA1, [trackA + 4x<1>];\n",
+
+        j1c47 => "-:-:-:-:00 \@P2 LD.E.CI loadA2, [trackA + 4x<2>];\n",
+        j1c53 => "-:-:-:-:00 \@P2 LD.E.CI loadA3, [trackA + 4x<3>];\n",
+
+        j2c47 => "-:-:-:-:00 \@P3 LD.E.CI loadB0, [trackB + 4x<0>];\n",
+        j2c53 => "-:-:-:-:00 \@P3 LD.E.CI loadB1, [trackB + 4x<1>];\n",
+        j2c61 => "-:-:-:-:00 \@P3 LD.E.CI loadB2, [trackB + 4x<2>];\n",
+        j2c62 => "-:-:-:-:00 \@P3 LD.E.CI loadB3, [trackB + 4x<3>];\n",
+
+        # Advance the 64-bit A pointer by param_lda8 bytes.
+        j3c47 => "-:-:-:-:00 \@P2 IADD   trackA0.CC, trackA0, param_lda8;\n",
+        j3c53 => "-:-:-:-:00 \@P2 IADD.X trackA1, trackA1, RZ;\n",
+
+        # One 128-bit store per matrix: A at writeS, B 4x<8*128> bytes further.
+        j4c53 => "-:-:D:S:02 \@P0 STS.128 [writeS + 4x<0*128>], loadA0;\n",
+
+        j5c53 => "-:-:D:S:02 \@P0 STS.128 [writeS + 4x<8*128>], loadB0;\n",
+
+        # Advance the 64-bit B pointer, toggle the read offsets and synchronise.
+        j6c47 => "-:-:-:-:00 \@P3 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j6c53 => "-:-:-:-:00 \@P3 IADD.X trackB1, trackB1, RZ;\n",
+        j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n",
+        j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n",
+        j6c63 => "T:-:D:S:00 \@P0 BAR.SYNC 0;\n",
+        
+        j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n",
+        j7c53 => "-:-:-:-:00 IADD32I k, k, -8;\n",
+ 
+        # Branch to LOOP while P0 holds, otherwise to REMAINDER when P1 is set.
+        j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n".
+                 "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n",
+    );
+    return;
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.cu b/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.cu
new file mode 100644
index 0000000..28aa136
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.cu
@@ -0,0 +1,26 @@
+extern "C"
+__global__ void __launch_bounds__(256) sgemm_tn_128x128_vec
+(
+ const float* param_A,      // argument order and sizes line up with the c[0x0][0x140..0x174]
+ const float* param_B,      // entries of <CONSTANT_MAPPING> in sgemm_tn_128x128_vec.sass
+ float*       param_C,
+ float        param_alpha,
+ float        param_beta,
+ int          param_lda8,
+ int          param_ldb8,  
+ int          param_ldc,
+ int          param_m,
+ int          param_n,
+ int          param_k
+ ) {
+  __shared__ float share[128 * 8 * 4 + 32];  // 32 floats beyond 128*8*4; the .sass sets addr_zero to 4x<128*8*4>
+
+  int tid = threadIdx.x;
+
+  share[tid] = 1;  // NOTE(review): body looks like a placeholder; presumably the real code comes from the .sass - confirm
+
+  __syncthreads();
+
+  param_C[tid] = share[255 - tid];  // reads the slot written by thread 255 - tid, hence the barrier above
+}
+
diff --git a/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.sass b/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.sass
new file mode 100644
index 0000000..bc896ba
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.sass
@@ -0,0 +1,212 @@
+# Kernel: sgemm_tn_128x128_vec
+
+<CONSTANT_MAPPING>
+  addr_zero  : 4x<128*8*4>
+
+  gridDimA : c[0x0][0x14]
+  gridDimB : c[0x0][0x18]
+
+  param_A[0]  : c[0x0][0x140]
+  param_A[1]  : c[0x0][0x144]
+  param_B[0]  : c[0x0][0x148]
+  param_B[1]  : c[0x0][0x14c]
+  param_C[0]  : c[0x0][0x150]
+  param_C[1]  : c[0x0][0x154]
+  param_alpha : c[0x0][0x158]
+  param_beta  : c[0x0][0x15c]
+  param_lda8  : c[0x0][0x160]
+  param_ldb8  : c[0x0][0x164]
+  param_ldc   : c[0x0][0x168]
+  param_m     : c[0x0][0x16c]
+  param_n     : c[0x0][0x170]
+  param_k     : c[0x0][0x174]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+  64-91 ~ blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, tid1, tid7, tidX, tid31, tid128, txa, txb, ta, tb, tmp_shl
+  92-93 : tmp_param<0-1>
+
+  0-63 : czero<00-63>
+
+   1,  4, 17, 20, 33, 36, 49, 52 : cx<0-7>y0
+   5,  0, 21, 16, 37, 32, 53, 48 : cx<0-7>y1
+   3,  6, 19, 22, 35, 38, 51, 54 : cx<0-7>y2
+   7,  2, 23, 18, 39, 34, 55, 50 : cx<0-7>y3
+   9, 12, 25, 28, 41, 44, 57, 60 : cx<0-7>y4
+  13,  8, 29, 24, 45, 40, 61, 56 : cx<0-7>y5
+  11, 14, 27, 30, 43, 46, 59, 62 : cx<0-7>y6
+  15, 10, 31, 26, 47, 42, 63, 58 : cx<0-7>y7
+
+  64-67 : j0Ay<0-3>
+  68-71 : j0Bx<0-3>
+  72-75 : j0Ay<4-7>
+  76-79 : j0Bx<4-7>
+  80-83 : j1Ay<0-3>
+  84-87 : j1Bx<0-3>
+  88-91 : j1Ay<4-7>
+  92-95 : j1Bx<4-7>
+
+  96-103 : loadA<0-3>, loadB<0-3>
+
+  104-107 : trackA<0-1>, trackB<0-1>
+
+  108-111 ~ writeS, k, k_and, tidY
+  117 ~ readAs
+  116 ~ readBs
+  115 ~ tid
+
+  64-75 ~ ldc, ci, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+  64-75 : c<0-3>, c<4-7>, d3, d2, d1, d0
+  76-85 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+  86-101 ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+-:-:-:-:00 S2R tid,  SR_TID.X;
+-:-:-:-:00 S2R blkA, SR_CTAID.Y;
+-:-:-:-:00 S2R blkB, SR_CTAID.Z;
+-:-:-:-:00 S2R blkZ, SR_CTAID.X;
+
+-:-:-:-:00 MOV k, param_k;
+-:-:-:-:00 MOV ldaz, RZ;
+-:-:-:-:00 MOV ldbz, RZ;
+-:-:-:-:00 MOV ldcz, RZ;
+-:-:-:-:00 STS.128 [RZ + addr_zero], RZ;
+<CODE>
+  join('', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [RZ + addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+
+// tidX = (tid & 31) << 2
+// tidY = (tid >> 5) & 7
+-:-:-:-:00 LOP.AND tid31, tid, 31;
+-:-:-:-:00 SHL     tidX, tid31, 2;
+-:-:-:-:00 BFE.U32 tidY, tid, 0x305; // 3 bits at position 5
+
+-:-:-:-:00 MOV lda, param_lda8;
+-:-:-:-:00 MOV ldb, param_ldb8;
+-:-:-:-:00 SHR.U32 lda, lda, 5;
+-:-:-:-:00 SHR.U32 ldb, ldb, 5;
+
+// trackA += (blkA*128 + lda*tidY + tidX) * 4
+-:-:-:-:00 ISCADD txa, blkA, tidX, 7;
+-:-:-:-:00 IMAD ta, lda, tidY, txa;
+-:-:-:-:00 IMAD ta, ldaz, blkZ, ta;
+-:-:-:-:00 MOV tmp_param0, param_A[0];
+-:-:-:-:00 MOV tmp_param1, param_A[1];
+-:-:-:-:00 SHL tmp_shl, ta, 0x2;
+-:-:-:-:00 IADD trackA0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackA1, RZ, tmp_param1;
+
+-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// trackB += (blkB*128 + ldb*tidY + tidX) * 4
+-:-:-:-:00 ISCADD txb, blkB, tidX, 7;
+-:-:-:-:00 IMAD tb, ldb, tidY, txb;
+-:-:-:-:00 IMAD tb, ldbz, blkZ, tb;
+-:-:-:-:00 MOV tmp_param0, param_B[0];
+-:-:-:-:00 MOV tmp_param1, param_B[1];
+-:-:-:-:00 SHL tmp_shl, tb, 0x2;
+-:-:-:-:00 IADD trackB0.CC, tmp_shl, tmp_param0;
+-:-:-:-:00 IADD.X trackB1, RZ, tmp_param1;
+
+-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeS = (128*tidY + tidX) * 4
+-:-:-:-:00 ISCADD  writeS, tidY, tidX, 7;
+-:-:-:-:00 SHL     writeS, writeS, 2;
+-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+-:-:-:-:00 LOP.AND tid1,   tid,  1;
+-:-:-:-:00 LOP.AND readAs, tid,    0x70;
+-:-:-:-:00 SHR.U32 readAs, readAs, 3;
+-:-:-:-:00 LOP.OR  readAs, readAs, tid1;
+-:-:-:-:00 SHL     readAs, readAs, 4;
+
+// readBs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4096;
+-:-:-:-:00 LOP.AND tid128, tid,  128;
+-:-:-:-:00 BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+-:-:-:-:00 SHR.U32 readBs, tid128, 4;
+-:-:-:-:00 LOP.OR  readBs, readBs, tid7;
+-:-:-:-:00 ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+-:-:-:-:00 NOP;
+
+REMAINDER:
+
+<CODE>
+    return q{
+      // bDoRemainder = k & 7 && k > 8
+      -:-:-:-:00 LOP.AND k_and, k, 7;
+      -:-:-:-:00 ISETP.EQ.AND P1, PT, k_and, RZ, PT;
+      -:-:-:-:00 ISETP.GT.AND P1, PT, k, 8, !P1;
+
+      // doLoad = tidY < k && txa|txb < n|m
+      -:-:-:-:00 ISETP.LT.AND P2, PT, tidY, k, P5;
+      -:-:-:-:00 ISETP.LT.AND P3, PT, tidY, k, P6;
+
+      -:-:-:-:00 @P2 LD.E.CI.128 loadA0, [trackA];
+      -:-:-:-:00 @P3 LD.E.CI.128 loadB0, [trackB];
+
+      -:-:-:-:00 @!P2 LDS.128 loadA0, [RZ + addr_zero];
+      -:-:-:-:00 @!P3 LDS.128 loadB0, [RZ + addr_zero];
+      // Vec 4 and scalar loads
+    };
+
+</CODE>
+
+-:-:-:-:00 STS.128 [writeS + 4x<0*128>], loadA0;
+
+-:-:-:-:00 IADD   trackA0.CC, trackA0, param_lda8;
+-:-:-:-:00 IADD.X trackA1, trackA1, RZ;
+
+-:-:-:-:00 STS.128 [writeS + 4x<8*128>], loadB0;
+
+-:-:-:-:00 IADD trackB0.CC, trackB0, param_ldb8;
+
+-:-:-:-:00 LOP.XOR readAs, readAs, 4x<128*8*2>;
+-:-:-:-:00 LOP.XOR readBs, readBs, 4x<128*8*2>;
+-:-:-:-:00 BAR.SYNC 0;
+-:-:-:-:00 LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+-:-:-:-:00 IADD.X trackB1, trackB1, RZ;
+
+<CODE>
+    my $k_end = 16;
+    our %insert =
+    (
+        j0c47 => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, $k_end, P5;\n",
+        j0c53 => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, $k_end, P6;\n",
+        j0c61 => "-:G:D:-:07 \@P2 LDG.E.128 loadA, [trackA];\n",
+        j0c62 => "-:G:D:-:07 \@P3 LDG.E.128 loadB, [trackB];\n",
+
+        j1c47 => "-:-:-:-:00 \@P2 IADD   trackA0.CC, trackA0, param_lda8;\n",
+        j1c53 => "-:-:-:-:00 \@P3 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+
+        j2c47 => "-:-:-:-:00 \@P2 IADD.X trackA1, trackA1, RZ;\n",
+        j2c53 => "-:-:-:-:00 \@P3 IADD.X trackB1, trackB1, RZ;\n",
+
+        j3c47 => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        j3c53 => "-:-:-:-:00 IADD32I k, k, -8;\n",
+
+        j5c63 => "T:-:D:S:00 TEXDEPBAR 0x0;\n",
+        j6c47 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<0*128>], loadA0;\n",
+        j6c53 => "-:-:D:S:00 \@P0 STS.128 [writeS + 4x<8*128>], loadB0;\n",
+
+        j6c61 => "-:-:-:-:00 \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n",
+        j6c62 => "-:-:-:-:00 \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n",
+        j6c63 => "T:-:D:S:00 \@P0 BAR.SYNC 0;\n",
+                 
+        j7c47 => "-:-:-:-:00 \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n",
+        j7c63 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n" .
+                 "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n",
+    );
+    return;
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Kepler/sgemm_tn_128x32.sass b/Kernel/SGEMM/Kepler/sgemm_tn_128x32.sass
new file mode 100644
index 0000000..a5324ad
--- /dev/null
+++ b/Kernel/SGEMM/Kepler/sgemm_tn_128x32.sass
@@ -0,0 +1,422 @@
+# Kernel: sgemm_tn_128x32
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<128*16*2 + 32*16*2>
+    szShareA  : 128*16
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+-:-:-:-:00 S2R tid,  SR_TID.X;
+-:-:-:-:00 S2R blkA, SR_CTAID.Y;
+-:-:-:-:00 S2R blkB, SR_CTAID.Z;
+-:-:-:-:00 S2R blkZ, SR_CTAID.X;
+
+-:-:-:-:00 MOV k,    param_k;
+-:-:-:-:00 MOV lda,  param_lda8;
+-:-:-:-:00 MOV ldb,  param_ldb8;
+-:-:-:-:00 SHR.U32 lda, lda, 5;
+-:-:-:-:00 SHR.U32 ldb, ldb, 5;
+-:-:-:-:00 MOV ldaz, param_ldaz;
+-:-:-:-:00 MOV ldbz, param_ldbz;
+-:-:-:-:00 SHL lda16, lda, 6;
+-:-:-:-:00 SHL ldb16, ldb, 6;
+-:-:-:-:00 SHL lda4,  lda, 2;
+
+-:-:-:-:00 STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("-:-:-:-:00 LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidAX = (tid & 31) << 2
+// tidAY = (tid >> 5)
+-:-:-:-:00 LOP.AND tidAX, tid,   31;
+-:-:-:-:00 SHL     tidAX, tidAX, 2;
+-:-:-:-:00 SHR.U32 tidAY, tid,   5;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+-:-:-:-:00 LOP.AND tidBX, tid,   7;
+-:-:-:-:00 SHL     tidBX, tidBX, 2;
+-:-:-:-:00 SHR.U32 tidBY, tid,   3;
+
+// trackA += (blkA*128 + tidAX + lda*tidAY) * 4
+-:-:-:-:00 ISCADD   txa, blkA, tidAX, 7;
+-:-:-:-:00 XMAD.LO2 ta0, lda,  tidAY, txa;
+-:-:-:-:00 XMAD.LO2 ta0, ldaz, blkZ,  ta0;
+-:-:-:-:00 IADD     ta1, ta0, lda4;
+-:-:-:-:00 IADD     ta2, ta1, lda4;
+-:-:-:-:00 IADD     ta3, ta2, lda4;
+
+-:-:-:-:00 LEA      track0A0.CC, ta0, param_A[0],     2;
+-:-:-:-:00 LEA.HI.X track0A1,    ta0, param_A[1], RZ, 2;
+-:-:-:-:00 LEA      track1A0.CC, ta1, param_A[0],     2;
+-:-:-:-:00 LEA.HI.X track1A1,    ta1, param_A[1], RZ, 2;
+-:-:-:-:00 LEA      track2A0.CC, ta2, param_A[0],     2;
+-:-:-:-:00 LEA.HI.X track2A1,    ta2, param_A[1], RZ, 2;
+-:-:-:-:00 LEA      track3A0.CC, ta3, param_A[0],     2;
+-:-:-:-:00 LEA.HI.X track3A1,    ta3, param_A[1], RZ, 2;
+
+// trackB += (blkB*32 + ldb*tidBY + tidBX) * 4
+-:-:-:-:00 ISCADD   txb, blkB, tidBX,  5;
+-:-:-:-:00 XMAD.LO2 tb,  ldb,  tidBY, txb;
+-:-:-:-:00 XMAD.LO2 tb,  ldbz, blkZ,  tb;
+-:-:-:-:00 LEA      trackB0.CC, tb, param_B[0],     2;
+-:-:-:-:00 LEA.HI.X trackB1,    tb, param_B[1], RZ, 2;
+
+// writeAs = (tidAY*128 + tidAX) * 4 + 4x<szShareA + szShareB>
+-:-:-:-:00 ISCADD writeAs, tidAY, tidAX, 7;
+-:-:-:-:00 ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB>
+-:-:-:-:00 ISCADD writeBs, tidBY, tidBX, 5;
+-:-:-:-:00 ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+-:-:-:-:00 LOP.AND tid1,   tid,    1;
+-:-:-:-:00 LOP.AND readAs, tid,    0x70;
+-:-:-:-:00 SHR.U32 readAs, readAs, 3;
+-:-:-:-:00 LOP.OR  readAs, readAs, tid1;
+-:-:-:-:00 SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>;
+-:-:-:-:00 BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+-:-:-:-:00 ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+-:-:-:-:00 MOV32I swapBuf, -4x<szShareA + szShareB>;
+
+REMAINDER:
+
+-:-:-:-:00 IADD tidAY1, tidAY, 4;
+-:-:-:-:00 IADD tidAY2, tidAY, 8;
+-:-:-:-:00 IADD tidAY3, tidAY, 12;
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT;
+-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+-:-:-:-:00 ISETP.LT.AND P0, PT, tidAY,  k, P5;
+-:-:-:-:00 ISETP.LT.AND P1, PT, tidAY1, k, P5;
+-:-:-:-:00 ISETP.LT.AND P2, PT, tidAY2, k, P5;
+-:-:-:-:00 ISETP.LT.AND P3, PT, tidAY3, k, P5;
+-:-:-:-:00 ISETP.LT.AND P4, PT, tidBY,  k, P6;
+
+-:-:-:-:00 @P0 LDG.E.CI.128 load0A, [track0A];
+-:-:-:-:00 @P1 LDG.E.CI.128 load1A, [track1A];
+-:-:-:-:00 @P2 LDG.E.CI.128 load2A, [track2A];
+-:-:-:-:00 @P3 LDG.E.CI.128 load3A, [track3A];
+-:-:-:-:00 @P4 LDG.E.CI.128 loadB,  [trackB];
+
+-:-:-:-:00 @!P0 LDS.U.128 load0A, [addr_zero];
+-:-:-:-:00 @!P1 LDS.U.128 load1A, [addr_zero];
+-:-:-:-:00 @!P2 LDS.U.128 load2A, [addr_zero];
+-:-:-:-:00 @!P3 LDS.U.128 load3A, [addr_zero];
+-:-:-:-:00 @!P4 LDS.U.128 loadB,  [addr_zero];
+
+   } : q{
+
+-:-:-:-:00 IADD txa1,  txa,  1;
+-:-:-:-:00 IADD txa2,  txa,  2;
+-:-:-:-:00 IADD txa3,  txa,  3;
+
+-:-:-:-:00 ISETP.LT.AND P4, PT, tidAY, k, PT;
+-:-:-:-:00 ISETP.LT.AND P0, PT, txa,  param_m, P4;
+-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P4;
+-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P4;
+-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P4;
+
+-:-:-:-:00 @P0 LDG.E.CI load0A0, [track0A + 4x<0>];
+-:-:-:-:00 @P1 LDG.E.CI load0A1, [track0A + 4x<1>];
+-:-:-:-:00 @P2 LDG.E.CI load0A2, [track0A + 4x<2>];
+-:-:-:-:00 @P3 LDG.E.CI load0A3, [track0A + 4x<3>];
+
+-:-:-:-:00 @!P0 MOV load0A0, RZ;
+-:-:-:-:00 @!P1 MOV load0A1, RZ;
+-:-:-:-:00 @!P2 MOV load0A2, RZ;
+-:-:-:-:00 @!P3 MOV load0A3, RZ;
+
+-:-:-:-:00 ISETP.LT.AND P5, PT, tidAY1, k, PT;
+-:-:-:-:00 ISETP.LT.AND P0, PT, txa,  param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P5;
+
+-:-:-:-:00 @P0 LDG.E.CI load1A0, [track1A + 4x<0>];
+-:-:-:-:00 @P1 LDG.E.CI load1A1, [track1A + 4x<1>];
+-:-:-:-:00 @P2 LDG.E.CI load1A2, [track1A + 4x<2>];
+-:-:-:-:00 @P3 LDG.E.CI load1A3, [track1A + 4x<3>];
+
+-:-:-:-:00 @!P0 MOV load1A0, RZ;
+-:-:-:-:00 @!P1 MOV load1A1, RZ;
+-:-:-:-:00 @!P2 MOV load1A2, RZ;
+-:-:-:-:00 @!P3 MOV load1A3, RZ;
+
+-:-:-:-:00 ISETP.LT.AND P6, PT, tidAY2, k, PT;
+-:-:-:-:00 ISETP.LT.AND P0, PT, txa,  param_m, P6;
+-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P6;
+-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P6;
+-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P6;
+
+-:-:-:-:00 @P0 LDG.E.CI load2A0, [track2A + 4x<0>];
+-:-:-:-:00 @P1 LDG.E.CI load2A1, [track2A + 4x<1>];
+-:-:-:-:00 @P2 LDG.E.CI load2A2, [track2A + 4x<2>];
+-:-:-:-:00 @P3 LDG.E.CI load2A3, [track2A + 4x<3>];
+
+-:-:-:-:00 @!P0 MOV load2A0, RZ;
+-:-:-:-:00 @!P1 MOV load2A1, RZ;
+-:-:-:-:00 @!P2 MOV load2A2, RZ;
+-:-:-:-:00 @!P3 MOV load2A3, RZ;
+
+-:-:-:-:00 ISETP.LT.AND P5, PT, tidAY3, k, PT;
+-:-:-:-:00 ISETP.LT.AND P0, PT, txa,  param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P5;
+-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P5;
+
+-:-:-:-:00 @P0 LDG.E.CI load3A0, [track3A + 4x<0>];
+-:-:-:-:00 @P1 LDG.E.CI load3A1, [track3A + 4x<1>];
+-:-:-:-:00 @P2 LDG.E.CI load3A2, [track3A + 4x<2>];
+-:-:-:-:00 @P3 LDG.E.CI load3A3, [track3A + 4x<3>];
+
+-:-:-:-:00 @!P0 MOV load3A0, RZ;
+-:-:-:-:00 @!P1 MOV load3A1, RZ;
+-:-:-:-:00 @!P2 MOV load3A2, RZ;
+-:-:-:-:00 @!P3 MOV load3A3, RZ;
+
+-:-:-:-:00 IADD txb1,  txb,  1;
+-:-:-:-:00 IADD txb2,  txb,  2;
+-:-:-:-:00 IADD txb3,  txb,  3;
+
+-:-:-:-:00 ISETP.LT.AND P4, PT, tidBY, k, PT;
+-:-:-:-:00 ISETP.LT.AND P0, PT, txb,  param_n, P4;
+-:-:-:-:00 ISETP.LT.AND P1, PT, txb1, param_n, P4;
+-:-:-:-:00 ISETP.LT.AND P2, PT, txb2, param_n, P4;
+-:-:-:-:00 ISETP.LT.AND P3, PT, txb3, param_n, P4;
+
+-:-:-:-:00 @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+-:-:-:-:00 @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+-:-:-:-:00 @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+-:-:-:-:00 @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+
+-:-:-:-:00 @!P0 MOV loadB0, RZ;
+-:-:-:-:00 @!P1 MOV loadB1, RZ;
+-:-:-:-:00 @!P2 MOV loadB2, RZ;
+-:-:-:-:00 @!P3 MOV loadB3, RZ;
+
+-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT;
+-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT;
+    };
+</CODE>
+
+-:-:-:-:00 ISETP.GE.AND P2, PT, k, 32, P5;
+-:-:-:-:00 ISETP.GE.AND P3, PT, k, 32, P5;
+-:-:-:-:00 ISETP.GE.AND P5, PT, k, 32, P5;
+-:-:-:-:00 ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+-:-:-:-:00 LOP.AND.NZ P1, RZ, k, 15;
+
+-:-:-:-:00 STS.128 [writeAs + 4x<0*128>], load0A;
+-:-:-:-:00 IADD   track0A0.CC, track0A0, lda16;
+-:-:-:-:00 IADD.X track0A1,    track0A1, RZ;
+
+-:-:-:-:00 STS.128 [writeAs + 4x<4*128>], load1A;
+-:-:-:-:00 IADD   track1A0.CC, track1A0, lda16;
+-:-:-:-:00 IADD.X track1A1,    track1A1, RZ;
+
+-:-:-:-:00 STS.128 [writeAs + 4x<8*128>], load2A;
+-:-:-:-:00 IADD   track2A0.CC, track2A0, lda16;
+-:-:-:-:00 IADD.X track2A1,    track2A1, RZ;
+
+-:-:-:-:00 STS.128 [writeAs + 4x<12*128>], load3A;
+-:-:-:-:00 IADD   track3A0.CC, track3A0, lda16;
+-:-:-:-:00 IADD.X track3A1,    track3A1, RZ;
+
+-:-:-:-:00 STS.128 [writeBs], loadB;
+-:-:-:-:00 IADD   trackB0.CC, trackB0, ldb16;
+
+-:-:-:-:00 ISETP.GT.AND P1, PT, k, 16, P1;
+
+-:-:-:-:00 IADD readBs,  readBs, -swapBuf;
+-:-:-:-:00 IADD readAs,  readAs, -swapBuf;
+-:-:-:-:00 BAR.SYNC 0;
+-:-:-:-:00 IADD writeBs, writeBs, swapBuf;
+-:-:-:-:00 IADD writeAs, writeAs, swapBuf;
+-:-:-:-:00 IADD swapBuf, RZ, -swapBuf;
+
+-:-:-:-:00 IADD.X trackB1,    trackB1, RZ;
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+-:-:-:-:00 @P5 LDG.E.CI.128 load0A, [track0A];
+-:-:-:-:00 @P5 LDG.E.CI.128 load1A, [track1A];
+-:-:-:-:00 @P5 LDG.E.CI.128 load2A, [track2A];
+-:-:-:-:00 @P5 LDG.E.CI.128 load3A, [track3A];
+-:-:-:-:00 @P6 LDG.E.CI.128 loadB,  [trackB];
+   } : q{
+-:-:-:-:00 @P5 LDG.E.CI load0A0, [track0A + 4x<0>];
+-:-:-:-:00 @P5 LDG.E.CI load0A1, [track0A + 4x<1>];
+-:-:-:-:00 @P5 LDG.E.CI load0A2, [track0A + 4x<2>];
+-:-:-:-:00 @P5 LDG.E.CI load0A3, [track0A + 4x<3>];
+
+-:-:-:-:00 @P5 LDG.E.CI load1A0, [track1A + 4x<0>];
+-:-:-:-:00 @P5 LDG.E.CI load1A1, [track1A + 4x<1>];
+-:-:-:-:00 @P5 LDG.E.CI load1A2, [track1A + 4x<2>];
+-:-:-:-:00 @P5 LDG.E.CI load1A3, [track1A + 4x<3>];
+
+-:-:-:-:00 @P5 LDG.E.CI load2A0, [track2A + 4x<0>];
+-:-:-:-:00 @P5 LDG.E.CI load2A1, [track2A + 4x<1>];
+-:-:-:-:00 @P5 LDG.E.CI load2A2, [track2A + 4x<2>];
+-:-:-:-:00 @P5 LDG.E.CI load2A3, [track2A + 4x<3>];
+
+-:-:-:-:00 @P5 LDG.E.CI load3A0, [track3A + 4x<0>];
+-:-:-:-:00 @P5 LDG.E.CI load3A1, [track3A + 4x<1>];
+-:-:-:-:00 @P5 LDG.E.CI load3A2, [track3A + 4x<2>];
+-:-:-:-:00 @P5 LDG.E.CI load3A3, [track3A + 4x<3>];
+
+-:-:-:-:00 @P6 LDG.E.CI loadB0, [trackB + 4x<0>];
+-:-:-:-:00 @P6 LDG.E.CI loadB1, [trackB + 4x<1>];
+-:-:-:-:00 @P6 LDG.E.CI loadB2, [trackB + 4x<2>];
+-:-:-:-:00 @P6 LDG.E.CI loadB3, [trackB + 4x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec;
+    our $shiftAX = 0;
+    our $shiftBX = 0;
+    our %insert =
+    (
+        j0c6   => "-:-:-:-:00 IADD k, k, -16;\n",
+        j0c14  => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        j3c6   => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n",
+        j5c6   => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n",
+        j7c6   => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n",
+        j9c6   => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n",
+        j11c6  => "-:-:-:-:00 \@P0 STS.128 [writeBs], loadB;\n",
+
+        j3c7   => "-:-:-:-:00 \@P2 IADD   track0A0.CC, track0A0, lda16;\n",
+        j3c13  => "-:-:-:-:00 \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "-:-:-:-:00 \@P3 IADD   track1A0.CC, track1A0, lda16;\n",
+        j5c13  => "-:-:-:-:00 \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "-:-:-:-:00 \@P5 IADD   track2A0.CC, track2A0, lda16;\n",
+        j7c13  => "-:-:-:-:00 \@P5 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "-:-:-:-:00 \@P5 IADD   track3A0.CC, track3A0, lda16;\n",
+        j9c13  => "-:-:-:-:00 \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "-:-:-:-:00 \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "-:-:-:-:00 \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+
+        j3c14  => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j9c14  => "-:-:-:-:00 ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "-:-:-:-:00 ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        j13c31 => "-:-:-:-:00 \@P0 BAR.SYNC 0;\n" .
+                  "-:-:-:-:00 \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "-:-:-:-:00 \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "-:-:-:-:00 \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "-:-:-:-:00 \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "-:-:-:-:00 \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?
+            (
+                j3c29  => "-:-:-:-:00 \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j5c29  => "-:-:-:-:00 \@P3 LDG.E.CI.128 load1A, [track1A];\n",
+                j9c29  => "-:-:-:-:00 \@P5 LDG.E.CI.128 load2A, [track2A];\n",
+                j9c31  => "-:-:-:-:00 \@P5 LDG.E.CI.128 load3A, [track3A];\n",
+                j11c29 => "-:-:-:-:00 \@P6 LDG.E.CI.128 loadB,  [trackB];\n",
+            ) :
+            (
+                j3c29  => "-:-:-:-:00 \@P2 LDG.E.CI load0A0, [track0A + 4x<0>];\n",
+                j3c31  => "-:-:-:-:00 \@P2 LDG.E.CI load0A1, [track0A + 4x<1>];\n",
+                j4c1   => "-:-:-:-:00 \@P2 LDG.E.CI load0A2, [track0A + 4x<2>];\n",
+                j4c3   => "-:-:-:-:00 \@P2 LDG.E.CI load0A3, [track0A + 4x<3>];\n",
+
+                j5c29  => "-:-:-:-:00 \@P3 LDG.E.CI load1A0, [track1A + 4x<0>];\n",
+                j5c31  => "-:-:-:-:00 \@P3 LDG.E.CI load1A1, [track1A + 4x<1>];\n",
+                j6c1   => "-:-:-:-:00 \@P3 LDG.E.CI load1A2, [track1A + 4x<2>];\n",
+                j6c3   => "-:-:-:-:00 \@P3 LDG.E.CI load1A3, [track1A + 4x<3>];\n",
+
+                j9c29  => "-:-:-:-:00 \@P5 LDG.E.CI load2A0, [track2A + 4x<0>];\n",
+                j9c31  => "-:-:-:-:00 \@P5 LDG.E.CI load2A1, [track2A + 4x<1>];\n",
+                j10c1  => "-:-:-:-:00 \@P5 LDG.E.CI load2A2, [track2A + 4x<2>];\n",
+                j10c3  => "-:-:-:-:00 \@P5 LDG.E.CI load2A3, [track2A + 4x<3>];\n",
+
+                j10c8  => "-:-:-:-:00 \@P5 LDG.E.CI load3A0, [track3A + 4x<0>];\n",
+                j10c10 => "-:-:-:-:00 \@P5 LDG.E.CI load3A1, [track3A + 4x<1>];\n",
+                j10c12 => "-:-:-:-:00 \@P5 LDG.E.CI load3A2, [track3A + 4x<2>];\n",
+                j10c14 => "-:-:-:-:00 \@P5 LDG.E.CI load3A3, [track3A + 4x<3>];\n",
+
+                j11c29 => "-:-:-:-:00 \@P6 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+                j11c31 => "-:-:-:-:00 \@P6 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+                j12c1  => "-:-:-:-:00 \@P6 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+                j12c3  => "-:-:-:-:00 \@P6 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+
+        j15c31 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n" .
+                  "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n"
+    );
+    return ;
+</CODE>
+
+<INCLUDE file="sgemm_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/hgemm_common_128x128.sass b/Kernel/SGEMM/Maxwell/hgemm_common_128x128.sass
new file mode 100644
index 0000000..d699483
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_common_128x128.sass
@@ -0,0 +1,412 @@
+# hgemm_common_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+[-
+
+our $int16;
+
+sub convert_in {    # input-side conversion mnemonic: 'I2F.F32.S16' when $int16 is true, else 'F2F.F32.F16'
+    return $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';
+}
+
+
+sub convert_out {    # output-side conversion mnemonic: 'F2I.S16.F32' when $int16 is true, else 'F2F.F16.F32'
+    return $int16 ? 'F2I.S16.F32': 'F2F.F16.F32';
+}
+
+
+sub scale_int16 {    # when $int16 is true: four FMUL lines multiplying c0..c3 by param_scale; otherwise the empty string
+    return $int16? q{
+--:-:-:-:1      FMUL c0, c0, param_scale;
+--:-:-:-:1      FMUL c1, c1, param_scale;
+--:-:-:-:1      FMUL c2, c2, param_scale;
+--:-:-:-:0      FMUL c3, c3, param_scale;
+    } : "";
+}
+
+
+sub max_abs1 {    # when $int16 is true: for i in 0..3, "@!Pi MOV csi, RZ" plus "@Pi VABSDIFF ... MRG_16L csi, ci"; otherwise the empty string
+    return $int16? q{
+--:-:-:-:1 @!P0 MOV cs0, RZ;
+--:-:-:-:1 @!P1 MOV cs1, RZ;
+--:-:-:-:1 @!P2 MOV cs2, RZ;
+--:-:-:-:1 @!P3 MOV cs3, RZ;
+
+--:-:-:-:1  @P0 VABSDIFF.S16.S16.MRG_16L cs0, c0, RZ, RZ;
+--:-:-:-:1  @P1 VABSDIFF.S16.S16.MRG_16L cs1, c1, RZ, RZ;
+--:-:-:-:1  @P2 VABSDIFF.S16.S16.MRG_16L cs2, c2, RZ, RZ;
+--:-:-:-:1  @P3 VABSDIFF.S16.S16.MRG_16L cs3, c3, RZ, RZ;
+    } : "";
+}
+
+
+sub max_abs2 {    # when $int16 is true: a SCHEDULE_BLOCK with the MRG_16H VABSDIFF for cs0..cs3, then four VMNMX MAX folds into maxabs; otherwise the empty string
+    return $int16? q{
+<SCHEDULE_BLOCK>
+
+// a = abs(a)
+--:-:-:-:1  @P0 VABSDIFF.S16.S16.MRG_16H cs0, c0, RZ, cs0;
+--:-:-:-:1  @P1 VABSDIFF.S16.S16.MRG_16H cs1, c1, RZ, cs1;
+--:-:-:-:1  @P2 VABSDIFF.S16.S16.MRG_16H cs2, c2, RZ, cs2;
+--:-:-:-:1  @P3 VABSDIFF.S16.S16.MRG_16H cs3, c3, RZ, cs3;
+
+// max = max(c,d,max(a,b,max)) ...
+--:-:-:-:1      VMNMX.UD.U16.U16.MX.MAX maxabs, cs0, cs0.H1, maxabs;
+--:-:-:-:1      VMNMX.UD.U16.U16.MX.MAX maxabs, cs1, cs1.H1, maxabs;
+--:-:-:-:1      VMNMX.UD.U16.U16.MX.MAX maxabs, cs2, cs2.H1, maxabs;
+--:-:-:-:1      VMNMX.UD.U16.U16.MX.MAX maxabs, cs3, cs3.H1, maxabs;
+</SCHEDULE_BLOCK>
+
+    } : "";
+}
+
+
+sub butterfly {    # when $int16 is true: SHFL.BFLY/IMNMX pairs at offsets 0x10,0x8,0x4,0x2,0x1 folding warp_max into maxabs, then "@P0 RED.E.MAX" to Stats (P0 comes from LOP.AND.Z on tid, 31); otherwise the empty string
+    return $int16 ? q{
+--:-:-:-:0      LOP.AND.Z P0, RZ, tid, 31;
+--:-:1:-:2      SHFL.BFLY PT, warp_max, maxabs, 0x10, 0x1f;
+01:-:-:-:4      IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:1:-:2      SHFL.BFLY PT, warp_max, maxabs, 0x8,  0x1f;
+01:-:-:-:4      IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:1:-:2      SHFL.BFLY PT, warp_max, maxabs, 0x4,  0x1f;
+01:-:-:-:4      IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:-:-:0      MOV Stats0, param_Stats[0];
+--:-:1:-:2      SHFL.BFLY PT, warp_max, maxabs, 0x2,  0x1f;
+01:-:-:-:4      IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:-:-:0      MOV Stats1, param_Stats[1];
+--:-:1:-:2      SHFL.BFLY PT, warp_max, maxabs, 0x1,  0x1f;
+01:-:-:-:2      IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:-:-:1  @P0 RED.E.MAX [Stats], maxabs;
+    } : "";
+}
+
+-]
+
+
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*128 + 64>];
+
+LOOP:
+
+[+
+
+    our @top;       # text emitted once, ahead of the unrolled body; filled in outside this block
+    our %insert;    # "j<J>c<C>" => text appended right after FFMA number C of unroll step J
+
+    my @cOrder;     # 64 (x,y) accumulator coordinates, in the order the FFMAs are issued
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;    # walk the y bases in the opposite direction for the next x base
+    }
+
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;                        # j0*/j1* operand set read by this step
+        my $nOdd     = !$odd + 0;                     # the other set, loaded for the following step
+        my $rsOffset = ($j + 1) % 8;                  # shared-memory row read for the following step
+        my $rsPred   = $j == 7 ? '@P0' : '   ';       # only the last step predicates its loads on P0
+
+        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $yield  = $c == 32 ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+            # Stall is 0 when the first line of $ins holds one of these opcodes, else 1; matching $ins directly avoids an undef operand when nothing is inserted.
+            my $stall  = $ins =~ /\A[^\n]*(?:LDS|F2F|I2F|I2I|LDG|STS|BAR|BRA)/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
++]
+
+--:-:-:-:1      IADD loop, loop, 1;
+--:-:-:-:1      IADD ta, ta, param_ldaz;
+--:-:-:-:1      IADD tb, tb, param_ldbz;
+--:-:-:-:3      MOV  k, param_k;
+--:-:-:-:1      ISETP.LT.AND P1, PT, loop, param_loops, PT;
+--:-:-:-:6      LEA      trackA0.CC, ta, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 1;
+--:-:-:-:6      LEA      trackB0.CC, tb, param_B[0],     1;
+--:-:-:-:0      LEA.HI.X trackB1,    tb, param_B[1], RZ, 1;
+--:-:-:Y:5  @P1 BRA.U REMAINDER;
+
+<SCHEDULE_BLOCK>
+
+// writeCs = (readAs / 4) * 128 + readBs;
+--:-:-:-:1      LOP.AND readAs, readAs, 0xfff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0xfff;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 5;
+
+--:-:-:-:1      LOP.AND tid_31,  tid, 31;
+--:-:-:-:1      LOP.AND tid_96,  tid, 96;
+--:-:-:-:1      LOP.AND tid_128, tid, 128;
+
+// cx = tid_31 | (tid_128 >> 2);
+--:-:-:-:1      SHR.U32  cx00, tid_128, 2;
+--:-:-:-:1      LOP.OR   cx00, tid_31,  cx00;
+
+// readCs = ((tid_96 << 4) | cx) << 2;
+--:-:-:-:1      SHL      readCs, tid_96,  4;
+--:-:-:-:1      LOP.OR   readCs, readCs, cx00;
+--:-:-:-:1      SHL      readCs, readCs, 2;
+
+// cx += blkB*128;
+--:-:-:-:1      ISCADD  cx00, blkB, cx00, 7;
+--:-:-:-:1      IADD    cx64, cx00, 64;
+
+// cy = blkA*128 + (tid_96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid_96, 1;
+--:-:-:-:1      ISCADD  cy00, blkA, cy00, 7;
+
+// C += (cy*ldc + cx) * 2;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, cy00, ldc, cx00, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 1;
+
+--:-:-:-:1      SHL  ldc1, ldc, 1;
+--:-:-:-:1      SHL  ldc4, ldc, 3;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 7;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+--:-:-:-:1      MOV maxabs, RZ;
+
+--:-:-:-:1      ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    my $out;
+    foreach my $y (0..7)    # one STORE_C call per accumulator row y
+    {
+        $out .=             # appended only when $y == 4 (trailing "if"): adds ldc60 to each C pointer and 60 to each cy
+            "--:-:-:-:5      IADD   C00y0.CC, C00y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy00,     cy00,  60;\n" .
+            "--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C04y0.CC, C04y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy04,     cy04,  60;\n" .
+            "--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C08y0.CC, C08y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy08,     cy08,  60;\n" .
+            "--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C12y0.CC, C12y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy12,     cy12,  60;\n" .
+            "--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;\n\n"  if $y == 4;
+        # c0..c7 = alpha * cx0..cx7 of row y
+        $out .= sprintf(
+            "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c3, cx3y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c4, cx4y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c5, cx5y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c6, cx6y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL c7, cx7y%d, alpha;\n",
+            ($y) x 8);
+
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+[+ butterfly() +]
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, P6;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E.S16 d0, [C00y0 + 2x<00>];
+--:-:2:-:1  @P1 LDG.E.S16 d1, [C00y0 + 2x<64>];
+--:-:3:-:1  @P2 LDG.E.S16 d2, [C04y0 + 2x<00>];
+--:-:4:-:1  @P3 LDG.E.S16 d3, [C04y0 + 2x<64>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+// Apply relu
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c4, c4, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c5, c5, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c6, c6, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c7, c7, RZ, !PT;
+
+--:-:-:-:5      ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0
+</SCHEDULE_BLOCK>
+
+--:-:-:-:3      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], c4;
+
+--:-:-:-:0      IADD cy00, cy00, 1;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<0*128 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<0*128 + 64>];
+--:-:-:-:1      LDS c2, [readCs + 4x<1*128 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<1*128 + 64>];
+
+--:-:-:-:0      IADD cy04, cy04, 1;
+
+01:-:1:-:1  @P6 [+ convert_in() +] d0, d0;
+02:-:2:-:1  @P6 [+ convert_in() +] d1, d1;
+04:-:3:-:1  @P6 [+ convert_in() +] d2, d2;
+08:-:4:-:1  @P6 [+ convert_in() +] d3, d3;
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:3  @P6 FFMA c3, d3, beta, c3;
+
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, P6;
+
+01:-:-:-:1  @P0 STG.E.S16 [C00y0 + 2x<00>], c0;
+02:5:-:-:1  @P1 STG.E.S16 [C00y0 + 2x<64>], c1;
+04:-:-:-:1  @P2 STG.E.S16 [C04y0 + 2x<00>], c2;
+08:6:-:-:1  @P3 STG.E.S16 [C04y0 + 2x<64>], c3;
+
+[+ max_abs1() +]
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E.S16 d0, [C08y0 + 2x<00>];
+--:-:2:-:1  @P1 LDG.E.S16 d1, [C08y0 + 2x<64>];
+--:-:3:-:1  @P2 LDG.E.S16 d2, [C12y0 + 2x<00>];
+--:-:4:-:1  @P3 LDG.E.S16 d3, [C12y0 + 2x<64>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, PT;
+
+--:-:-:-:2      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:2      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+</SCHEDULE_BLOCK>
+
+10:-:-:-:4      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD   cy08, cy08, 1;
+--:-:-:-:1      IADD   cy12, cy12, 1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+20:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:0      IADD.X C04y1,    C04y1, RZ;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<2*128 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<2*128 + 64>];
+--:-:-:-:1      LDS c2, [readCs + 4x<3*128 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*128 + 64>];
+
+01:-:1:-:4  @P6 [+ convert_in() +] d0, d0;
+02:-:2:-:4  @P6 [+ convert_in() +] d1, d1;
+04:-:3:-:4  @P6 [+ convert_in() +] d2, d2;
+08:-:4:-:1  @P6 [+ convert_in() +] d3, d3;
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:3  @P6 FFMA c3, d3, beta, c3;
+
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+01:-:-:-:1  @P0 STG.E.S16 [C08y0 + 2x<00>], c0;
+02:5:-:-:1  @P1 STG.E.S16 [C08y0 + 2x<64>], c1;
+04:-:-:-:1  @P2 STG.E.S16 [C12y0 + 2x<00>], c2;
+08:6:-:-:1  @P3 STG.E.S16 [C12y0 + 2x<64>], c3;
+
+[+ max_abs2() +]
+
+10:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+20:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/hgemm_common_128x32.sass b/Kernel/SGEMM/Maxwell/hgemm_common_128x32.sass
new file mode 100644
index 0000000..9d4860a
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_common_128x32.sass
@@ -0,0 +1,246 @@
+# hgemm_common_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#    http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>];
+
+LOOP:
+
+<CODE>
+    # Generates the unrolled main loop: 16 steps (j), each with 32 FFMAs and three interleaved LDS loads.
+    our @top;      # text emitted ahead of the first FFMA; not assigned in this block
+    our %insert;   # text keyed "j<J>c<C>", emitted right after FFMA C of step J
+    our $shiftAX;  # when true, A loads add (shareLine >> 2)*8 inside the 4x<...> address
+    our $shiftBX;  # when true, B loads add (shareLine >> 2)*8 inside the 4x<...> address
+
+    my @cOrder;    # FFMA issue order as [x, y] accumulator coordinates (32 entries)
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);  # [x, y] offsets visited around each base coordinate
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;  # the next x base walks the y bases in the opposite direction
+    }
+
+    my $out = join '', @top;
+    # Per step: queue the LDS lines in %insert, then emit the FFMAs in @cOrder order.
+    foreach my $j (0 .. 15)
+    {
+        my $barrier   = $j & 1 ? 2 : 1;  # 2 on odd steps, 1 on even steps
+        my $rsPred    = $j >= 14 ? '@P0' : '   ';  # loads of the last two steps are predicated on P0
+        my $loadReg   = ($j + 2) & 3;  # j-register set filled by this step's loads
+        my $shareLine = ($j + 2) & 15;  # line index used in the LDS addresses, wraps at 16
+        my $shiftA    = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB    = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute   = $j & 3;  # j-register set read by this step's FFMAs
+
+        # LDS lines attached to FFMA 0, 2 and 4; $barrier fills the third control field.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';  # text to append after this FFMA, if any
+
+            my $wait   = $c == 0 ? "0$barrier" : '--';  # only the first FFMA of a step carries a wait mask
+            # Stall is 0 when the appended text starts with one of the listed ops, otherwise 1.
+            my $stall  = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 16 && $stall ? 'Y' : '-';  # 'Y' only on FFMA 16 and only when its stall is 1
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 32 + readBs;
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 3;
+
+// readCs = (((tid & 96) << 2) | (tid & 31)) << 2;
+--:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      LOP.AND tid96,  tid,  96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 2;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*32 + tid31;
+--:-:-:-:1      ISCADD cx, blkB, tid31, 5;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+--:-:-:-:1      ISCADD  cy00, blkA, cy00, 7;
+
+// C += (cy*ldc + cx) * 2;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, cy00, ldc,    cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 1;
+
+// cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+
+// beta != 0
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+// Apply relu
+--:-:-:-:1      LOP.AND.NZ P4, RZ, flags, 2;
+
+--:-:-:-:1      SHL  ldc1, ldc, 1;
+--:-:-:-:1      SHL  ldc4, ldc, 3;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 7;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+    # Emits the epilogue: for each accumulator row y, scale the row by alpha into c0-c3 and call STORE_C.
+    my $out;  # generated assembly text, returned to the assembler
+    foreach my $y (0..7)
+    {
+        $out .=
+            "--:-:-:-:5      IADD   C00y0.CC, C00y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy00,     cy00,  60;\n" .
+            "--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C04y0.CC, C04y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy04,     cy04,  60;\n" .
+            "--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C08y0.CC, C08y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy08,     cy08,  60;\n" .
+            "--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C12y0.CC, C12y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy12,     cy12,  60;\n" .
+            "--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;\n\n"  if $y == 4;  # whole pointer bump is emitted once, ahead of row 4
+        # The same $y is substituted into all four cx<N>y<y> operands below.
+        $out .= sprintf(
+            "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL c3, cx3y%d, alpha;\n",
+            ($y) x 4);
+        # One STORE_C call per row; STORE_C reads c0-c3 and advances the C pointers itself.
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E.U16 d0, [C00y];
+--:-:2:-:1  @P1 LDG.E.U16 d1, [C04y];
+--:-:3:-:1  @P2 LDG.E.U16 d2, [C08y];
+--:-:4:-:1  @P3 LDG.E.U16 d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:0      IADD cy12, cy12, 1;
+
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*32>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*32>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*32>];
+</SCHEDULE_BLOCK>
+
+01:-:1:-:1  @P5 F2F.F32.F16 d0, d0;
+02:-:2:-:1  @P5 F2F.F32.F16 d1, d1;
+04:-:3:-:1  @P5 F2F.F32.F16 d2, d2;
+08:-:4:-:1  @P5 F2F.F32.F16 d3, d3;
+
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P5 FFMA c3, d3, beta, c3;
+
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+01:1:-:-:1  @P0 STG.E.CG.U16 [C00y], c0;
+02:2:-:-:1  @P1 STG.E.CG.U16 [C04y], c1;
+04:3:-:-:1  @P2 STG.E.CG.U16 [C08y], c2;
+08:4:-:-:1  @P3 STG.E.CG.U16 [C12y], c3;
+
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/hgemm_common_128x64.sass b/Kernel/SGEMM/Maxwell/hgemm_common_128x64.sass
new file mode 100644
index 0000000..a375c03
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_common_128x64.sass
@@ -0,0 +1,318 @@
+# hgemm_common_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*64 +  00>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*64 +  32>];
+
+LOOP:
+
+<CODE>
+    # Generates the unrolled main loop: 8 steps (j), each with 64 FFMAs and four interleaved LDS loads.
+    our @top;      # text emitted ahead of the first FFMA; not assigned in this block
+    our %insert;   # text keyed "j<J>c<C>", emitted right after FFMA C of step J
+
+    my @cOrder;    # FFMA issue order as [x, y] accumulator coordinates (64 entries)
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);  # [x, y] offsets visited around each base coordinate
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;  # the next x base walks the y bases in the opposite direction
+    }
+
+    my $out = join '', @top;
+    # Per step: queue the LDS lines in %insert, then emit the FFMAs in @cOrder order.
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;  # j-register set read by this step's FFMAs
+        my $nOdd     = !$odd + 0;  # the other set, as a number; filled by this step's loads
+        my $rsOffset = ($j + 1) % 8;  # line index of the following step, wrapping to 0 after step 7
+        my $rsPred   = $j == 7 ? '@P0' : '   ';  # only the wrap-around loads of step 7 are predicated on P0
+        # LDS lines attached to FFMA 0, 2, 4 and 6 of this step.
+        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 +  00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 +  32>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';  # text to append after this FFMA, if any
+
+            my $yield  = $c == 32 ? 'Y' : '-';  # 'Y' on FFMA 32 of every step
+
+            my $wait   = $c == 0 ? '01' : '--';  # only the first FFMA of a step carries a wait mask
+            # Stall is 0 when the appended text starts with one of the listed ops, otherwise 1.
+            my $stall  = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+--:-:1:-:1      S2R threadId, SR_TID.X;
+--:-:2:-:1      S2R blockA, SR_CTAID.Y;
+--:-:3:-:1      S2R blockB, SR_CTAID.Z;
+--:-:4:-:1      S2R blockZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 64 + readBs;
+--:-:-:-:1      LOP.AND readAs, readAs, 0xff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0xff;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 4;
+
+// readCs = (((threadId & 96) << 3) | (threadId & 31)) << 2;
+01:-:-:-:1      LOP.AND tid31,  threadId,  31;
+01:-:-:-:1      LOP.AND tid96,  threadId,  96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 3;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx00 = blkB*64 + tid31;
+04:-:-:-:1      ISCADD cx00, blockB, tid31, 6;
+--:-:-:-:1      IADD   cx32, cx00, 32;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+02:-:-:-:1      ISCADD  cy00, blockA, cy00, 7;
+
+// C += (cy*ldc + cx00) * 2;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, cy00, ldc, cx00, xmad_c;
+08:-:-:-:1      XMAD.LO2 ci, ldcz, blockZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 1;
+
+--:-:-:-:1      ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0
+
+--:-:-:-:1      SHL  ldc1, ldc, 1;
+--:-:-:-:1      SHL  ldc4, ldc, 3;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 7;
+
+</SCHEDULE_BLOCK>
+
+//--:-:1:-:2      I2F.F32.U32 temp, threadId;
+//01:-:-:-:1      F2F.F16.F32 temp, temp;
+
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+    # Emits the epilogue: for each accumulator row y, scale the row by alpha into c0-c7 and call STORE_C.
+    my $out;  # generated assembly text, returned to the assembler
+    foreach my $y (0..7)
+    {
+        $out .=
+            "--:-:-:-:5      IADD   C00y0.CC, C00y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy00,     cy00,  60;\n" .
+            "--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C04y0.CC, C04y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy04,     cy04,  60;\n" .
+            "--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C08y0.CC, C08y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy08,     cy08,  60;\n" .
+            "--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C12y0.CC, C12y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy12,     cy12,  60;\n" .
+            "--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;\n\n"  if $y == 4;  # whole pointer bump is emitted once, ahead of row 4
+        # The same $y is substituted into all eight cx<N>y<y> operands below.
+        $out .= sprintf(
+            "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c3, cx3y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c4, cx4y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c5, cx5y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c6, cx6y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL c7, cx7y%d, alpha;\n",
+            ($y) x 8);
+        # One STORE_C call per row; STORE_C reads c0-c7 and advances the C pointers itself.
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, P6;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E.S16 d0, [C00y0 + 2x<00>];
+--:-:2:-:1  @P1 LDG.E.S16 d1, [C00y0 + 2x<32>];
+--:-:3:-:1  @P2 LDG.E.S16 d2, [C04y0 + 2x<00>];
+--:-:4:-:1  @P3 LDG.E.S16 d3, [C04y0 + 2x<32>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+// Apply relu
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c4, c4, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c5, c5, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c6, c6, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c7, c7, RZ, !PT;
+
+--:-:-:-:5      ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0
+</SCHEDULE_BLOCK>
+
+--:-:-:-:1      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], c4;
+
+--:-:-:-:0      IADD cy00, cy00, 1;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<0*64 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<1*64 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<1*64 + 32>];
+
+--:-:-:-:0      IADD cy04, cy04, 1;
+
+01:-:1:-:1  @P6 F2F.F32.F16 d0, d0;
+02:-:2:-:1  @P6 F2F.F32.F16 d1, d1;
+04:-:3:-:1  @P6 F2F.F32.F16 d2, d2;
+08:-:4:-:1  @P6 F2F.F32.F16 d3, d3;
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P6 FFMA c3, d3, beta, c3;
+
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, P6;
+
+// P6 stays (beta != 0) here: it predicates the second F2F/FFMA group after
+// this block and is read again on entry to the next STORE_C call.
+
+01:-:-:-:1  @P0 STG.E.S16 [C00y0 + 2x<00>], c0;
+02:5:-:-:1  @P1 STG.E.S16 [C00y0 + 2x<32>], c1;
+04:-:-:-:1  @P2 STG.E.S16 [C04y0 + 2x<00>], c2;
+08:6:-:-:1  @P3 STG.E.S16 [C04y0 + 2x<32>], c3;
+// Load predicates for rows cy08/cy12: row and column in bounds, and beta != 0 (through P4/P5).
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E.S16 d0, [C08y0 + 2x<00>];
+--:-:2:-:1  @P1 LDG.E.S16 d1, [C08y0 + 2x<32>];
+--:-:3:-:1  @P2 LDG.E.S16 d2, [C12y0 + 2x<00>];
+--:-:4:-:1  @P3 LDG.E.S16 d3, [C12y0 + 2x<32>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Store predicates for rows cy08/cy12: bounds checks only, independent of beta.
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, PT;
+
+--:-:-:-:2      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:2      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+</SCHEDULE_BLOCK>
+
+10:-:-:-:4      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD   cy08, cy08, 1;
+--:-:-:-:1      IADD   cy12, cy12, 1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+20:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:0      IADD.X C04y1,    C04y1, RZ;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<2*64 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<2*64 + 32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<3*64 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*64 + 32>];
+
+01:-:1:-:1  @P6 F2F.F32.F16 d0, d0;
+02:-:2:-:1  @P6 F2F.F32.F16 d1, d1;
+04:-:3:-:1  @P6 F2F.F32.F16 d2, d2;
+08:-:4:-:1  @P6 F2F.F32.F16 d3, d3;
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P6 FFMA c3, d3, beta, c3;
+
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+01:-:-:-:1  @P0 STG.E.S16 [C08y0 + 2x<00>], c0;
+02:5:-:-:1  @P1 STG.E.S16 [C08y0 + 2x<32>], c1;
+04:-:-:-:1  @P2 STG.E.S16 [C12y0 + 2x<00>], c2;
+08:6:-:-:1  @P3 STG.E.S16 [C12y0 + 2x<32>], c3;
+
+10:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+20:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/hgemm_common_32x128.sass b/Kernel/SGEMM/Maxwell/hgemm_common_32x128.sass
new file mode 100644
index 0000000..3661b08
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_common_32x128.sass
@@ -0,0 +1,244 @@
+# Kernel: hgemm_common_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#    http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*32  + 16 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*32  + 16 + 0*8>];
+
+LOOP:
+
+<CODE>
+    # Generates the unrolled main loop: 16 steps (j), each with 32 FFMAs and three interleaved LDS loads.
+    our @top;      # text emitted ahead of the first FFMA; not assigned in this block
+    our %insert;   # text keyed "j<J>c<C>", emitted right after FFMA C of step J
+    our $shiftAX;  # when true, A loads add (shareLine >> 2)*8 inside the 4x<...> address
+    our $shiftBX;  # when true, B loads add (shareLine >> 2)*8 inside the 4x<...> address
+
+    my @cOrder;    # FFMA issue order as [x, y] accumulator coordinates (32 entries)
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);  # [x, y] offsets visited around each base coordinate
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;  # the next x base walks the y bases in the opposite direction
+    }
+
+    my $out = join '', @top;
+    # Per step: queue the LDS lines in %insert, then emit the FFMAs in @cOrder order.
+    foreach my $j (0 .. 15)
+    {
+        my $barrier   = $j & 1 ? 2 : 1;  # 2 on odd steps, 1 on even steps
+        my $rsPred    = $j >= 14 ? '@P0' : '   ';  # loads of the last two steps are predicated on P0
+        my $loadReg   = ($j + 2) & 3;  # j-register set filled by this step's loads
+        my $shareLine = ($j + 2) & 15;  # line index used in the LDS addresses, wraps at 16
+        my $shiftA    = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB    = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute   = $j & 3;  # j-register set read by this step's FFMAs
+
+        # LDS lines attached to FFMA 0, 2 and 4; $barrier fills the third control field.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32  + 16 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';  # text to append after this FFMA, if any
+
+            my $wait   = $c == 0 ? "0$barrier" : '--';  # only the first FFMA of a step carries a wait mask
+            # Stall is 0 when the appended text starts with one of the listed ops, otherwise 1.
+            my $stall  = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 16 && $stall ? 'Y' : '-';  # 'Y' only on FFMA 16 and only when its stall is 1
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 128 + readBs;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 5;
+
+// readCs = tid * 4;
+--:-:-:-:1      SHL readCs, tid, 2;
+
+// cx = blkB*128 + tid;
+--:-:-:-:1      ISCADD cx, blkB, tid, 7;
+
+// cy = blkA*32
+--:-:-:-:1      SHL cy00, blkA, 5;
+
+// C += (cy*ldc + cx) * 2;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+
+--:-:-:-:1      XMAD.LO  ci, cy00, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 1;
+
+// cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+
+// beta != 0
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+// Apply relu
+--:-:-:-:1      LOP.AND.NZ P4, RZ, flags, 2;
+
+--:-:-:-:1      SHL  ldc1, ldc, 1;
+--:-:-:-:1      SHL  ldc4, ldc, 3;
+--:-:-:-:1      ISCADD ldc12, ldc, -ldc4, 5;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+    # Emits the epilogue: for each accumulator row y, scale the row by alpha into c0-c3 and call STORE_C.
+    my $out;  # generated assembly text, returned to the assembler
+    foreach my $y (0..7)
+    {
+        $out .=
+            "--:-:-:-:5      IADD   C00y0.CC, C00y0, ldc12;\n" .
+            "--:-:-:-:1      IADD   cy00,     cy00,  12;\n" .
+            "--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C04y0.CC, C04y0, ldc12;\n" .
+            "--:-:-:-:1      IADD   cy04,     cy04,  12;\n" .
+            "--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C08y0.CC, C08y0, ldc12;\n" .
+            "--:-:-:-:1      IADD   cy08,     cy08,  12;\n" .
+            "--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C12y0.CC, C12y0, ldc12;\n" .
+            "--:-:-:-:1      IADD   cy12,     cy12,  12;\n" .
+            "--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;\n\n"  if $y == 4;  # whole pointer bump is emitted once, ahead of row 4
+        # The same $y is substituted into all four cx<N>y<y> operands below.
+        $out .= sprintf(
+            "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL c3, cx3y%d, alpha;\n",
+            ($y) x 4);
+        # One STORE_C call per row; STORE_C reads c0-c3 and advances the C pointers itself.
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E.U16 d0, [C00y];
+--:-:2:-:1  @P1 LDG.E.U16 d1, [C04y];
+--:-:3:-:1  @P2 LDG.E.U16 d2, [C08y];
+--:-:4:-:1  @P3 LDG.E.U16 d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:0      IADD cy12, cy12, 1;
+
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*128>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*128>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*128>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*128>];
+</SCHEDULE_BLOCK>
+
+01:-:1:-:1  @P5 F2F.F32.F16 d0, d0;
+02:-:2:-:1  @P5 F2F.F32.F16 d1, d1;
+04:-:3:-:1  @P5 F2F.F32.F16 d2, d2;
+08:-:4:-:1  @P5 F2F.F32.F16 d3, d3;
+
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P5 FFMA c3, d3, beta, c3;
+
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+01:1:-:-:1  @P0 STG.E.CG.U16 [C00y], c0;
+02:2:-:-:1  @P1 STG.E.CG.U16 [C04y], c1;
+04:3:-:-:1  @P2 STG.E.CG.U16 [C08y], c2;
+08:4:-:-:1  @P3 STG.E.CG.U16 [C12y], c3;
+
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_128x128.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_128x128.sass
new file mode 100644
index 0000000..0b4f460
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_nn_128x128.sass
@@ -0,0 +1,393 @@
+# Kernel: hgemm_nn_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+our $int16;    # not assigned in this block; presumably set by the assembler driver (TODO confirm)
+our $convert = !$int16 ? 'F2F.F32.F16' : 'I2F.F32.S16';    # opcode name spliced in wherever $convert is interpolated
+sub convert_in { return $convert; }
+sub int16_params {    # extra constant-map rows; empty string unless $int16 is true
+    return "" unless $int16;
+    return q{
+param_Stats[0]  : c[0x0][0x190]
+param_Stats[1]  : c[0x0][0x194]
+param_scale     : c[0x0][0x198]
+    };
+}
+-]
+
+// Symbolic names for constant-bank slots c[0x0][...] plus addr_zero, the address the setup code stores RZ to and reloads czero* from; int16_params() appends three more rows when $int16 is true.
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+    [+ int16_params() +]
+</CONSTANT_MAPPING>
+// Register names used below. Several number ranges (e.g. 64-79) appear in more than one row, so names from different rows can share a register; presumably they are live in different phases (TODO confirm).
+<REGISTER_MAPPING>
+
+    64-95   ~ tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid7, tid31, tid128, txa, xmad_ta, xmad_tb, k<1-3>, x<1-3>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-105  : loadB<0-3>, loadA<0-5>
+
+    106-109 : trackA<0-1>, trackB<0-1>
+
+    110-118 ~ writeAs, writeBs, k, txb, tidAY, tidBY, ta, tb, loop
+    119-127 ~ readAs, readBs, tid, blkA, blkB, blkZ
+
+    64-75   ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128
+
+    64-79   : c<0-7>, d3, d2, d1, d0, cs<0-3>
+    64-65   : Stats<0-1>
+    80-89   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    90-118  ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs
+
+
+</REGISTER_MAPPING>
+// Read the thread id and the three block ids; the first users of tid/blkA/blkB/blkZ below carry wait masks 01/02/04/08, presumably matching the 1-4 set in these control fields (TODO confirm).
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+// Setup: copy parameters into registers, clear czero00-63, and derive per-thread global pointers (trackA/trackB), bounds predicates (P5/P6) and shared-memory read/write offsets.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,   param_k;
+--:-:-:-:1      MOV lda, param_lda;
+--:-:-:-:1      MOV ldb, param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 4; // ldb = param_ldb8 >> 4
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      MOV loop, RZ;
+// Store RZ at addr_zero, then let the inline Perl emit 16 LDS.U.128 loads (czero00, czero04, ... czero60) from that same address.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);
++]
+
+01:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      LOP.AND tid128, tid,  128;
+
+// tidAY  = (tid & 1) << 2
+--:-:-:-:1      LOP.AND tid1,  tid,  1;
+--:-:-:-:1      SHL     tidAY, tid1, 2;
+
+// tidAX = tid >> 1
+--:-:-:-:1      SHR.U32 tidAX, tid,   1;
+
+// trackA = param_A + 2 * ((blkA*128 + tidAX) * lda + tidAY + ldaz*blkZ)
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,   tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ,  ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x1;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT; // P5 = txa < param_m
+
+// tidBX = (tid & 31) << 2
+// tidBY = (tid >> 5) & 7
+--:-:-:-:1      SHL     tidBX, tid31, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid,  0x305; // 3 bits at position 5
+
+// trackB = param_B + 2 * (blkB*128 + tidBX + ldb*tidBY + ldbz*blkZ)
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 7;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x1;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT; // P6 = txb < param_n
+
+// writeAs = 4 * (128 * tidAY + tidAX) + 4x<128*8*2>
+--:-:-:-:1      ISCADD  writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD  writeAs, writeAs, 4x<128*8*2>, 2;
+
+
+// writeBs = (128*tidBY + tidBX) * 4 + 4x<128*8*3>
+--:-:-:-:1      ISCADD  writeBs, tidBY, tidBX, 7;
+--:-:-:-:1      ISCADD  writeBs, writeBs, 4x<128*8*3>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4x<128*8>
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+// Loads for one pass: with $vec, 64-bit LDGs guarded by P2/P3 with LDS.U.64 from addr_zero as the fallback; otherwise per-element S16 LDGs guarded by P0-P3 with MOV RZ as the fallback.
+[+
+    our $vec;    # selects which of the two instruction strings below is returned
+    return $vec ? q{
+--:-:-:-:2      ISETP.LT.AND P3, PT, tidBY, k, P6;
+--:-:-:Y:b      ISETP.LT.AND P2, PT, tidAY, k, P5;
+
+--:-:4:-:2  @P3 LDG.E.CI.64 loadB0, [trackB];
+--:-:2:-:1  @P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>];
+--:-:2:-:1  @P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>];
+
+--:-:-:-:0      PSETP.AND.AND P4, PT, PT, PT, PT;
+
+--:-:5:-:1 @!P3 LDS.U.64 loadB0, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.64 loadA0, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.64 loadA4, [addr_zero];
+    } : q{
+
+<SCHEDULE_BLOCK>
+// doLoad0 = tidBY < k
+--:-:-:-:1      IADD x1, txb, 1;
+--:-:-:-:1      IADD x2, txb, 2;
+--:-:-:-:1      IADD x3, txb, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_n, P0;
+
+--:-:4:-:1  @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>];
+--:-:4:-:1  @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>];
+--:-:4:-:1  @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>];
+--:-:4:-:1  @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+
+--:-:-:-:1      IADD k1, tidAY, 1;
+--:-:-:-:1      IADD k2, tidAY, 2;
+--:-:-:-:1      IADD k3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:2:-:1  @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+</SCHEDULE_BLOCK>
+    };
++]
+// Convert the loaded values with $convert, store them at writeBs/writeAs, advance trackA/trackB, and set P1 = k > 8 (the $vec variant additionally requires k & 7 != 0).
+[+
+    our $vec;
+    our $convert;    # interpolated into the qq{} strings below as the conversion opcode
+    return $vec ? qq{
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 7;
+
+18:-:-:-:4      $convert loadB3, loadB1.H1;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      $convert loadB2, loadB1.H0;
+--:-:-:-:4      $convert loadB1, loadB0.H1;
+--:-:4:-:2      $convert loadB0, loadB0.H0;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+08:-:-:-:1      STS.128 [writeBs], loadB0;
+
+22:-:-:-:4      $convert loadA3, loadA1.H1;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<16>;
+--:-:2:-:4      $convert loadA2, loadA1.H0;
+--:-:-:-:4      $convert loadA1, loadA0.H1;
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, P1;
+--:-:3:-:1      $convert loadA0, loadA0.H0;
+
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<3*128>], loadA3;
+--:-:-:-:1      STS [writeAs + 4x<2*128>], loadA2;
+04:-:-:-:1      STS [writeAs + 4x<1*128>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<0*128>], loadA0;
+    } : qq{
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+08:-:-:-:4      $convert loadB0, loadB0;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      $convert loadB1, loadB1;
+--:-:-:-:4      $convert loadB2, loadB2;
+--:-:4:-:2      $convert loadB3, loadB3;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+08:-:-:-:1      STS.128 [writeBs], loadB0;
+
+02:-:-:-:4      $convert loadA0, loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<8>;
+--:-:2:-:4      $convert loadA1, loadA1;
+--:-:-:-:4      $convert loadA2, loadA2;
+--:-:3:-:1      $convert loadA3, loadA3;
+
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<0*128>], loadA0;
+--:-:-:-:1      STS [writeAs + 4x<1*128>], loadA1;
+04:-:-:-:1      STS [writeAs + 4x<2*128>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*128>], loadA3;
+    };
++]
+// Toggle all four shared-memory offsets by XOR with 4x<128*8*2>; the read offsets flip before BAR.SYNC and the write offsets after it.
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      LOP.XOR writeAs, writeAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR writeBs, writeBs, 4x<128*8*2>;
+
+
+// Builds @top and %insert (keys j<N>c<M> => instruction text); presumably consumed by the hgemm_common_128x128.sass include that follows to place these in its loop (TODO confirm).
+[+
+    our $vec;
+    our $convert;
+    my $k_end = $vec ? 16 : 24;    # k threshold used by the ISETP.GE tests below
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P6;\n");    # P3 = (k >= $k_end) && P6
+    our %insert =    # one vec/non-vec specific key list, followed by entries common to both
+    (
+        ($vec ?
+            (
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P4, PT, !P4, PT, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND  P0, PT, k, $k_end, PT;\n",
+        j0c15 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P4, P5;\n",
+
+        j0c10 => "--:-:2:-:1  \@P3 LDG.E.CI.64 loadB0, [trackB];\n",
+
+        j0c28 => "--:-:5:-:1  \@P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>];\n",
+        j0c30 => "20:4:6:-:1  \@P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>];\n",
+
+        j4c5  => "--:-:-:-:1 \@!P4 $convert loadA3, loadA5.H1;\n",
+        j4c9  => "--:-:-:-:1 \@!P4 $convert loadA2, loadA5.H0;\n",
+        j4c13 => "--:-:-:-:1 \@!P4 $convert loadA1, loadA4.H1;\n",
+        j4c17 => "--:-:-:-:1 \@!P4 $convert loadA0, loadA4.H0;\n",
+
+        j5c5  => "02:-:-:-:1  \@P0 $convert loadB3, loadB1.H1;\n",
+        j5c9  => "--:-:-:-:1  \@P0 $convert loadB2, loadB1.H0;\n",
+        j5c13 => "--:-:-:-:1  \@P0 $convert loadB1, loadB0.H1;\n",
+        j5c17 => "--:-:2:-:1  \@P0 $convert loadB0, loadB0.H0;\n",
+
+        j5c35 => "02:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j6c5  => "10:-:2:-:1  \@P4 $convert loadA3, loadA1.H1;\n",
+        j6c9  => "--:-:3:-:1  \@P4 $convert loadA2, loadA1.H0;\n",
+        j6c13 => "--:-:4:-:1  \@P4 $convert loadA1, loadA0.H1;\n",
+        j6c17 => "--:-:5:-:1  \@P4 $convert loadA0, loadA0.H0;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS [writeAs + 4x<3*128>], loadA3;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS [writeAs + 4x<2*128>], loadA2;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS [writeAs + 4x<1*128>], loadA1;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS [writeAs + 4x<0*128>], loadA0;\n",
+
+        j6c11 => "08:-:-:-:1  \@P4 IADD   trackA0.CC, trackA0, 2x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P4 IADD.X trackA1,    trackA1, RZ;\n",
+            ) :
+            (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P5;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+
+        j0c10 => "--:-:2:-:1  \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n",
+
+        j0c29 => "--:-:6:-:1  \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n",
+        j0c31 => "--:-:6:-:1  \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n",
+        j0c33 => "--:-:6:-:1  \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n",
+        j0c35 => "--:-:6:-:1  \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n",
+
+        j5c8  => "02:-:-:-:1  \@P3 $convert loadB0, loadB0;\n",
+        j5c12 => "--:-:-:-:1  \@P3 $convert loadB1, loadB1;\n",
+        j5c16 => "--:-:-:-:1  \@P3 $convert loadB2, loadB2;\n",
+        j5c20 => "--:-:2:-:1  \@P3 $convert loadB3, loadB3;\n",
+
+        j5c39 => "02:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j6c5  => "20:-:2:-:1  \@P2 $convert loadA0, loadA0;\n",
+        j6c9  => "--:-:3:-:1  \@P2 $convert loadA1, loadA1;\n",
+        j6c13 => "--:-:4:-:1  \@P2 $convert loadA2, loadA2;\n",
+        j6c17 => "--:-:5:-:1  \@P2 $convert loadA3, loadA3;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS [writeAs + 4x<0*128>], loadA0;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS [writeAs + 4x<1*128>], loadA1;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS [writeAs + 4x<2*128>], loadA2;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS [writeAs + 4x<3*128>], loadA3;\n",
+
+        j6c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 2x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+            )
+        ),
+
+        j5c46 => "--:-:-:-:1  \@P0 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j5c54 => "--:-:-:-:1  \@P0 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j6c63 => "--:-:-:-:0      IADD32I k, k, -8;\n" .
+                 "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+    );
+    return;    # nothing is emitted at this point in the file
++]
+
+<INCLUDE file="hgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_128x32.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_128x32.sass
new file mode 100644
index 0000000..33a4a9a
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_nn_128x32.sass
@@ -0,0 +1,590 @@
+# Kernel: hgemm_nn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+// Symbolic names for constant-bank slots c[0x0][...] plus the shared-memory size constants szShareA/szShareB and addr_zero, which the setup code stores RZ to and reloads czero* from.
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*2 + 32*16*2>
+    szShareA  : (128*16 + 32)
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+// Register names used below. Ranges such as 32-79 appear in more than one row, so names from different rows can share a register; presumably they are live in different phases (TODO confirm).
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, ldb, ldaz, lda32, ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, shiftAX
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+// Read the thread id and the three block ids; the first users of tid/blkA/blkB/blkZ below carry wait masks 01/02/04/08, presumably matching the 1-4 set in these control fields (TODO confirm).
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+// Setup: copy parameters into registers, clear czero00-31, and derive the four A pointers (track0A-track3A, 32 rows apart), the B pointer, and the shared-memory read/write offsets.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 4; // ldb = param_ldb8 >> 4
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb16, ldb, 5; // ldb16 = ldb << 5
+--:-:-:-:1      SHL lda32, lda, 5; // lda32 = lda << 5
+// Store RZ at addr_zero, then let the CODE section emit 8 LDS.U.128 loads (czero00, czero04, ... czero28) from that same address.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 2
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,  2;
+01:-:-:-:1      LOP.AND tid3,    tid,  3;
+--:-:-:-:1      SHL     tidAY,   tid3, 2;
+--:-:-:-:1      SHL     shiftAX, tid3, 3;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// track<i>A = param_A + 2 * ((blkA*128 + tidAX + 32*i) * lda + tidAY + ldaz*blkZ), i = 0..3
+02:-:-:-:1      ISCADD txa00, blkA, tidAX, 7;
+--:-:-:-:1      IADD   txa32, txa00, 32;
+--:-:-:-:1      IADD   txa64, txa00, 64;
+--:-:-:-:1      IADD   txa96, txa00, 96;
+
+--:-:-:-:1      XMAD.LO  ta00, lda,  txa00,   tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta00, ldaz, blkZ,  ta00;
+--:-:-:-:1      IADD     ta32, ta00, lda32;
+--:-:-:-:1      IADD     ta64, ta32, lda32;
+--:-:-:-:1      IADD     ta96, ta64, lda32;
+
+--:-:-:-:1      LEA      track0A0.CC, ta00, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta00, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track1A0.CC, ta32, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track1A1,    ta32, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track2A0.CC, ta64, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track2A1,    ta64, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track3A0.CC, ta96, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track3A1,    ta96, param_A[1], RZ, 1;
+
+// trackB = param_B + 2 * (blkB*32 + tidBX + ldb*tidBY + ldbz*blkZ)
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 5;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 1;
+
+// writeAs = (tidAY*128 + tidAX + shiftAX) * 4 + 4x<szShareA + szShareB>
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>; // signed step added to / subtracted from the shared offsets at each buffer swap
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+// Loads for one pass: with $vec, one 64-bit LDG per operand guarded by P2-P6 with LDS.U.64 from addr_zero as the fallback; otherwise per-element U16 LDGs guarded by P0-P3 with MOV RZ as the fallback. Then P2-P6 and P1 are set up for the code that follows.
+<SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;    # selects which of the two instruction strings below is returned
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY, k, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY, k, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P2 LDG.E.CI.64 load0A, [track0A];
+--:-:2:-:1  @P3 LDG.E.CI.64 load1A, [track1A];
+--:-:3:-:1  @P4 LDG.E.CI.64 load2A, [track2A];
+--:-:4:-:1  @P5 LDG.E.CI.64 load3A, [track3A];
+--:-:5:-:1  @P6 LDG.E.CI.64 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P2 LDS.U.64 load0A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.64 load1A, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.64 load2A, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.64 load3A, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.64 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:1:-:1  @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:1:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:1:-:1  @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];
+--:-:3:-:1  @P1 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];
+--:-:4:-:1  @P1 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];
+--:-:4:-:1  @P2 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];
+--:-:5:-:1  @P1 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];
+--:-:5:-:1  @P2 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+    };
+</CODE>
+// AND "k >= 32" into P2-P6. In the non-$vec string above only P2, P3 and P6 are recomputed at the end, because P4/P5 were last written with the txa64/txa96 < param_m tests of the load2A/load3A groups.
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P1, RZ, k, 15;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P1;
+
+</SCHEDULE_BLOCK>
+// Convert every loaded value with F2F.F32.F16. The $vec string unpacks the .H0/.H1 halves of each register pair, writing the highest destination first so a source register is read before it is overwritten.
+<CODE>
+    our $vec;
+    return $vec ? q{
+21:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:1:-:1      F2F.F32.F16 load0A0, load0A0.H0;
+
+02:-:-:-:1      F2F.F32.F16 load1A3, load1A1.H1;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A1.H0;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A0.H1;
+--:-:2:-:1      F2F.F32.F16 load1A0, load1A0.H0;
+
+04:-:-:-:1      F2F.F32.F16 load2A3, load2A1.H1;
+--:-:-:-:1      F2F.F32.F16 load2A2, load2A1.H0;
+--:-:-:-:1      F2F.F32.F16 load2A1, load2A0.H1;
+--:-:3:-:1      F2F.F32.F16 load2A0, load2A0.H0;
+
+08:-:-:-:1      F2F.F32.F16 load3A3, load3A1.H1;
+--:-:-:-:1      F2F.F32.F16 load3A2, load3A1.H0;
+--:-:-:-:1      F2F.F32.F16 load3A1, load3A0.H1;
+--:-:4:-:1      F2F.F32.F16 load3A0, load3A0.H0;
+
+10:-:-:-:1      F2F.F32.F16 loadB3, loadB1.H1;
+--:-:-:-:1      F2F.F32.F16 loadB2, loadB1.H0;
+--:-:-:-:1      F2F.F32.F16 loadB1, loadB0.H1;
+--:-:5:-:1      F2F.F32.F16 loadB0, loadB0.H0;
+    } : q{
+21:-:-:-:1      F2F.F32.F16 load0A0, load0A0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:1:-:1      F2F.F32.F16 load0A3, load0A3;
+
+02:-:-:-:1      F2F.F32.F16 load1A0, load1A0;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A1;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A2;
+--:-:2:-:1      F2F.F32.F16 load1A3, load1A3;
+
+04:-:-:-:1      F2F.F32.F16 load2A0, load2A0;
+--:-:-:-:1      F2F.F32.F16 load2A1, load2A1;
+--:-:-:-:1      F2F.F32.F16 load2A2, load2A2;
+--:-:3:-:1      F2F.F32.F16 load2A3, load2A3;
+
+08:-:-:-:1      F2F.F32.F16 load3A0, load3A0;
+--:-:-:-:1      F2F.F32.F16 load3A1, load3A1;
+--:-:-:-:1      F2F.F32.F16 load3A2, load3A2;
+--:-:4:-:1      F2F.F32.F16 load3A3, load3A3;
+
+10:-:-:-:1      F2F.F32.F16 loadB0, loadB0;
+--:-:-:-:1      F2F.F32.F16 loadB1, loadB1;
+--:-:-:-:1      F2F.F32.F16 loadB2, loadB2;
+--:-:5:-:1      F2F.F32.F16 loadB3, loadB3;
+    };
+</CODE>
+// Store the converted A and B values at writeAs/writeBs, advance the track pointers (A by 2x<16>, B by ldb16), then shift the read/write offsets by -/+ swapBuf around BAR.SYNC and negate swapBuf.
+01:-:-:-:1      STS [writeAs + 4x<0*128 + 0*32>], load0A0;
+--:-:-:-:0      IADD   track0A0.CC, track0A0, 2x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 0*32>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 0*32>], load0A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 0*32>], load0A3;
+
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<0*128 + 1*32>], load1A0;
+--:-:-:-:0      IADD   track1A0.CC, track1A0, 2x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 1*32>], load1A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 1*32>], load1A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 1*32>], load1A3;
+
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS [writeAs + 4x<0*128 + 2*32>], load2A0;
+--:-:-:-:0      IADD   track2A0.CC, track2A0, 2x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 2*32>], load2A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 2*32>], load2A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 2*32>], load2A3;
+
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS [writeAs + 4x<0*128 + 3*32>], load3A0;
+--:-:-:-:0      IADD   track3A0.CC, track3A0, 2x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 3*32>], load3A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 3*32>], load3A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 3*32>], load3A3;
+
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+10:-:-:-:1      STS.128 [writeBs], loadB;
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ; // high word of the trackB0.CC add issued before the barrier
+// Issue the next global loads, each guarded by one of P2-P6 as last written by the ISETP.GE ... k, 32 tests after REMAINDER; $vec picks 64-bit loads, otherwise four U16 loads per operand.
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.CI.64 load0A, [track0A];
+--:-:4:-:1  @P3 LDG.E.CI.64 load1A, [track1A];
+--:-:5:-:1  @P4 LDG.E.CI.64 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.CI.64 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.CI.64 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];
+    };
+</CODE>
+
+<CODE>
+    # Builds %insert, a table of instruction text keyed by slot names of the
+    # form j<N>c<M>. The slots are consumed outside this block (presumably by
+    # the included hgemm_common_128x32.sass -- confirm). $vec selects between
+    # 64-bit vector loads and four scalar U16 loads per operand. $shiftAX and
+    # $shiftBX are only declared here; they are read elsewhere.
+    # The block returns an empty string, so it contributes no text itself.
+    our $vec;
+    our $shiftAX = 1;
+    our $shiftBX = 0;
+    our %insert =
+    (
+        # k is reduced by 16 per pass; P0 = (k >= 16) gates the stores, the
+        # buffer swap and the loop branch below.
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        # Store the converted A values (load0A..load3A) to shared memory at
+        # writeAs. Each group starts with a wait mask (04/08/10/10) matching
+        # the barrier set by the last F2F of that operand further down.
+        j3c6   => "04:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n",
+        j3c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n",
+        j3c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 0*32>], load0A2;\n",
+        j3c12  => "--:3:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n",
+
+        j5c6   => "08:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n",
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n",
+        j5c12  => "--:4:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n",
+
+        # load2A and load3A use the same barrier number (5): the last load2A
+        # store sets no barrier of its own and the last load3A store sets 5.
+        j7c6   => "10:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n",
+        j7c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n",
+        j7c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n",
+        j7c12  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n",
+
+        j9c6   => "10:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n",
+        j9c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n",
+        j9c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n",
+        j9c12  => "--:5:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n",
+
+        # The four converted B values go out as a single 128-bit store.
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",
+
+        # Advance the global pointers as 64-bit adds (.CC low word, .X high
+        # word). The A pointers move by 2x<16>; trackB moves by ldb16.
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 2x<16>;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, 2x<16>;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P4 IADD   track2A0.CC, track2A0, 2x<16>;\n",
+        j7c13  => "--:-:-:-:1  \@P4 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, 2x<16>;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+
+        # Each load predicate stays set only while k >= 32 and it was
+        # already set (Pn = (k >= 32) && Pn).
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j7c14  => "--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        # After the barrier, move the read pointers by -swapBuf and the write
+        # pointers by +swapBuf, then negate swapBuf for the next pass.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?
+            (
+                # Vector variant: one 64-bit load per operand, then F2F
+                # unpacks the four halves via .H0/.H1. The destinations are
+                # written from index 3 down to 0, so each source register is
+                # read before it is overwritten.
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.64 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.64 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.CI.64 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.64 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.64 loadB,  [trackB];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1A3, load1A1.H1;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A2, load1A1.H0;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A1, load1A0.H1;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1A0, load1A0.H0;\n",
+
+                j6c13  => "10:-:-:-:1  \@P4 F2F.F32.F16 load2A3, load2A1.H1;\n",
+                j6c17  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2A2, load2A1.H0;\n",
+                j6c21  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2A1, load2A0.H1;\n",
+                j6c25  => "--:-:5:-:1  \@P4 F2F.F32.F16 load2A0, load2A0.H0;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A3, load3A1.H1;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A2, load3A1.H0;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A1, load3A0.H1;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3A0, load3A0.H0;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n",
+            ) :
+            (
+                # Scalar variant: four U16 loads per operand at element
+                # offsets 0..3, each register converted in place by F2F.
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];\n",
+                j10c3  => "--:-:5:-:1  \@P4 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0A0, load0A0;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A1, load0A1;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A2, load0A2;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0A3, load0A3;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1A0, load1A0;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A1, load1A1;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A2, load1A2;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1A3, load1A3;\n",
+
+                j6c13  => "10:-:-:-:1  \@P4 F2F.F32.F16 load2A0, load2A0;\n",
+                j6c17  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2A1, load2A1;\n",
+                j6c21  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2A2, load2A2;\n",
+                j6c25  => "--:-:5:-:1  \@P4 F2F.F32.F16 load2A3, load2A3;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A0, load3A0;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A1, load3A1;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A2, load3A2;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3A3, load3A3;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadB3, loadB3;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB2, loadB2;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadB0, loadB0;\n",
+            )
+        ),
+
+        # Branch back to LOOP while P0 holds; otherwise go to REMAINDER if P1
+        # is set.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="hgemm_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_128x64.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_128x64.sass
new file mode 100644
index 0000000..8e6c457
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_nn_128x64.sass
@@ -0,0 +1,438 @@
+# Kernel: hgemm_nn_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Symbolic names used by hgemm_nn_128x64: the shared-memory address that the
+# setup code fills with zeros (addr_zero), the grid dimensions, and the kernel
+# parameters at consecutive 4-byte slots 0x140-0x188 of constant bank 0.
+# param_ldb8 is shifted right by 4 in the setup code to obtain ldb, and is
+# added unchanged to trackB each time the B pointer is advanced.
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*2 + 64*8*2 + 0>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+# Register names for hgemm_nn_128x64. Several numeric ranges appear on more
+# than one line (64-95 backs both the setup temporaries and the j0*/j1* names;
+# 64-107 is listed again for the ldc/c/C/alpha group), so names on different
+# lines can refer to the same physical register.
+# NOTE(review): presumably ':' binds names to registers in the listed order and
+# '~' lets the assembler choose within the range -- confirm against the
+# assembler documentation.
+<REGISTER_MAPPING>
+
+    64-95   ~ tid, blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid7, txa, txa1, ta, xmad_ta, tb, xmad_tb, tidAY, tidBY, k<1-3>, x<1-3>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-111  : loadA<0-7>, loadAA<0-3>, loadB<0-3>
+
+    112-117 : track0A<0-1>, track1A<0-1>, trackB<0-1>
+
+    118-122 ~ writeAs, writeBs, k, txb, swapBuf
+    123-127 : readAs, readBs
+
+    64-83   ~ ldc, ldcz, ci, xmad_c, threadId, tid31, tid96, blockA, blockB, blockZ
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C04y<0-1>, C08y<0-1>, C12y<0-1>, C00y<0-1>
+    86-107  ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+// Read the thread index and the three block indices. Each S2R sets a
+// different barrier number (1-4); the wait masks 01/02/04/08 on the first
+// instructions that read tid/blkA/blkB/blkZ further down correspond to them.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+// Copy the kernel parameters into registers; ldb = param_ldb8 >> 4.
+--:-:-:-:1      MOV k,   param_k;
+--:-:-:-:1      MOV lda, param_lda;
+--:-:-:-:1      MOV ldb, param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 4;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// Write RZ to shared memory at addr_zero; the LDS.U.128 instructions that
+// follow read it back into the czero registers.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    # Emit sixteen LDS.U.128 instructions, one for every fourth czero register
+    # (czero00, czero04, ..., czero60), each reading from addr_zero.
+    my @zero_loads;
+    for my $quad (0 .. 15) {
+        push @zero_loads,
+            sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", 4 * $quad);
+    }
+    return join '', @zero_loads;
+</CODE>
+
+// tidAX = tid & 0xfe
+// tidAY = (tid & 1) << 2
+01:-:-:-:1      LOP.AND tidAX, tid, 0xfe;
+--:-:-:-:1      LOP.AND tid1,  tid,  1;
+--:-:-:-:1      SHL     tidAY, tid1, 2;
+
+// txa     = blkA*128 + tidAX
+// ta      = lda*txa + tidAY + ldaz*blkZ
+// track0A = param_A + 2*ta
+// track1A = track0A + 2*lda
+02:-:-:-:1      ISCADD  txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO ta, lda, txa, tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      track0A0.CC, ta,  param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta,  param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track1A0.CC, lda, track0A0,       1;
+--:-:-:-:1      LEA.HI.X track1A1,    lda, track0A1, RZ,   1;
+
+// P4 = txa < m, P5 = txa + 1 < m (bounds checks for track0A / track1A loads)
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa,  param_m, PT;
+--:-:-:-:1      IADD txa1, txa, 1;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa1, param_m, PT;
+
+// tidBX = (tid & 15) << 2
+// tidBY = (tid >> 4) & 7
+--:-:-:-:1      LOP.AND tidBX, tid,  15;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid,  0x304; // 3 bits at position 4
+
+// txb    = blkB*64 + tidBX
+// tb     = ldb*tidBY + txb + ldbz*blkZ
+// trackB = param_B + 2*tb
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 6;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     1;
+--:-:-:-:2      LEA.HI.X trackB1,    tb, param_B[1], RZ, 1;
+
+// P6 = txb < n (bounds check for trackB loads)
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// Start the write buffers high
+// writeAs = (128*tidAY + tidAX) * 4 + 4x<64*8 + 128*8>
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2;
+// writeBs = (64*tidBY + tidBX) * 4 + 4x<64*8 + 128*8*2>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2;
+
+// Start the read buffers low
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<128*8>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+// swapBuf starts negative; later code adds it to the write pointers,
+// subtracts it from the read pointers, and negates it after each use.
+--:-:-:-:1      MOV32I swapBuf, -4x<64*8 + 128*8>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<CODE>
+    # Returns the assembly text placed right after the REMAINDER label: the
+    # global-memory loads of A and B. $vec picks the variant.
+    #
+    # $vec true : 64-bit LDG loads guarded by P6 (B), P5 (track1A) and P4
+    #             (track0A); both loadA* and loadAA* (offset 2x<8>) are
+    #             fetched. P1 is forced true. Under the negated predicates the
+    #             same registers are filled from addr_zero via LDS.U.64.
+    # $vec false: tid is read again and tidAY/tidBY are recomputed, then the
+    #             elements are loaded one by one as S16. B loads require
+    #             tidBY < k, P6, and txb + {1,2,3} < param_n for the later
+    #             columns; A loads require tidAY + {0..3} < k together with P4
+    #             (track0A) or P5 (track1A). A register whose load is
+    #             predicated off is set to RZ. P0-P3 are reused as scratch
+    #             predicates for each group.
+    our $vec;
+    return $vec ? q{
+--:-:6:-:1  @P6 LDG.E.CI.64 loadB0,  [trackB];
+
+--:-:2:-:1  @P5 LDG.E.CI.64 loadA2,  [track1A + 2x<0>];
+--:-:2:-:1  @P5 LDG.E.CI.64 loadAA2, [track1A + 2x<8>];
+
+--:-:3:-:1  @P4 LDG.E.CI.64 loadA0,  [track0A + 2x<0>];
+--:-:3:-:1  @P4 LDG.E.CI.64 loadAA0, [track0A + 2x<8>];
+
+--:-:-:-:0      PSETP.AND.AND P1, PT, PT, PT, PT;
+
+--:-:4:-:1 @!P6 LDS.U.64 loadB0,  [addr_zero];
+--:-:5:-:1 @!P5 LDS.U.64 loadA2,  [addr_zero];
+--:-:5:-:1 @!P4 LDS.U.64 loadA0,  [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.64 loadAA2, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.64 loadAA0, [addr_zero];
+    } : q{
+
+--:-:2:-:2      S2R tid,  SR_TID.X;
+
+<SCHEDULE_BLOCK>
+02:-:-:-:1      LOP.AND tidAY, tid, 1;
+--:-:-:-:1      SHL     tidAY, tidAY, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid, 0x304; // 3 bits at position 4
+
+// doLoad0 = tidBY < k
+--:-:-:-:1      IADD x1, txb, 1;
+--:-:-:-:1      IADD x2, txb, 2;
+--:-:-:-:1      IADD x3, txb, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_n, P0;
+
+--:-:6:-:1  @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];
+--:-:6:-:1  @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];
+--:-:6:-:1  @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];
+--:-:6:-:1  @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+
+--:-:-:-:1      IADD k1, tidAY, 1;
+--:-:-:-:1      IADD k2, tidAY, 2;
+--:-:-:-:1      IADD k3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P4;
+
+--:-:2:-:1  @P0 LDG.E.CI.S16 loadA0, [track0A + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.S16 loadA2, [track0A + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.S16 loadA4, [track0A + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.S16 loadA6, [track0A + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA2, RZ;
+--:-:-:-:1 @!P2 MOV loadA4, RZ;
+--:-:-:-:1 @!P3 MOV loadA6, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:3:-:1  @P0 LDG.E.CI.S16 loadA1, [track1A + 2x<0>];
+--:-:3:-:1  @P1 LDG.E.CI.S16 loadA3, [track1A + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.S16 loadA5, [track1A + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.S16 loadA7, [track1A + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA1, RZ;
+--:-:-:-:1 @!P1 MOV loadA3, RZ;
+--:-:-:-:1 @!P2 MOV loadA5, RZ;
+--:-:-:-:1 @!P3 MOV loadA7, RZ;
+</SCHEDULE_BLOCK>
+    };
+</CODE>
+
+<CODE>
+    # Returns the assembly text that converts the loaded half-precision
+    # values to F32, stores them to shared memory at writeBs / writeAs, and
+    # advances the global pointers. $vec picks the variant.
+    #
+    # $vec true : every 64-bit load holds four halves, unpacked with
+    #             .H0/.H1. The conversion order reads each source register
+    #             before it is overwritten and leaves the registers paired so
+    #             that each STS.64 writes (loadA0,loadA1), (loadA2,loadA3), ...
+    #             track0A / track1A advance by 2x<16>.
+    # $vec false: every register holds one element and is converted in place.
+    #             track0A / track1A advance by 2x<8>, and P1 is set to k > 8.
+    #
+    # Both variants advance trackB by param_ldb8. All pointer updates are
+    # 64-bit adds split into an IADD with .CC and a following IADD.X.
+    our $vec;
+    return $vec ? q{
+28:-:-:-:4      F2F.F32.F16 loadB3, loadB1.H1;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      F2F.F32.F16 loadB2, loadB1.H0;
+--:-:-:-:4      F2F.F32.F16 loadB1, loadB0.H1;
+--:-:4:-:2      F2F.F32.F16 loadB0, loadB0.H0;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+08:-:-:-:1      STS.128 [writeBs], loadB0;
+
+12:-:-:-:4      F2F.F32.F16 loadA7, loadA3.H1;
+04:-:2:-:4      F2F.F32.F16 loadA6, loadA1.H1;
+--:-:-:-:0      IADD   track0A0.CC, track0A0, 2x<16>;
+--:-:-:-:4      F2F.F32.F16 loadA5, loadA3.H0;
+--:-:3:-:4      F2F.F32.F16 loadA4, loadA1.H0;
+--:-:-:-:0      IADD.X track0A1, track0A1, RZ;
+--:-:-:-:4      F2F.F32.F16 loadA3, loadA2.H1;
+--:-:-:-:0      IADD   track1A0.CC, track1A0, 2x<16>;
+--:-:-:-:4      F2F.F32.F16 loadA1, loadA2.H0;
+--:-:4:-:4      F2F.F32.F16 loadA2, loadA0.H1;
+--:-:5:-:1      F2F.F32.F16 loadA0, loadA0.H0;
+
+--:-:-:-:0      IADD.X track1A1, track1A1, RZ;
+
+02:-:-:-:1      STS.64 [writeAs + 4x<3*128>], loadA6;
+04:-:-:-:1      STS.64 [writeAs + 4x<2*128>], loadA4;
+08:-:-:-:1      STS.64 [writeAs + 4x<1*128>], loadA2;
+10:-:-:-:1      STS.64 [writeAs + 4x<0*128>], loadA0;
+
+    } : q{
+
+20:-:-:-:4      F2F.F32.F16 loadB0, loadB0;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      F2F.F32.F16 loadB1, loadB1;
+--:-:-:-:4      F2F.F32.F16 loadB2, loadB2;
+--:-:6:-:2      F2F.F32.F16 loadB3, loadB3;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+20:-:-:-:1      STS.128 [writeBs], loadB0;
+
+02:-:-:-:4      F2F.F32.F16 loadA0, loadA0;
+04:-:2:-:4      F2F.F32.F16 loadA1, loadA1;
+--:-:-:-:0      IADD   track0A0.CC, track0A0, 2x<8>;
+--:-:-:-:4      F2F.F32.F16 loadA2, loadA2;
+--:-:3:-:4      F2F.F32.F16 loadA3, loadA3;
+--:-:-:-:0      IADD.X track0A1, track0A1, RZ;
+--:-:-:-:4      F2F.F32.F16 loadA4, loadA4;
+--:-:-:-:0      IADD   track1A0.CC, track1A0, 2x<8>;
+--:-:4:-:4      F2F.F32.F16 loadA5, loadA5;
+--:-:-:-:4      F2F.F32.F16 loadA6, loadA6;
+--:-:-:-:0      IADD.X track1A1, track1A1, RZ;
+--:-:5:-:1      F2F.F32.F16 loadA7, loadA7;
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+02:-:-:-:1      STS.64 [writeAs + 4x<0*128>], loadA0;
+04:-:-:-:1      STS.64 [writeAs + 4x<1*128>], loadA2;
+08:-:-:-:1      STS.64 [writeAs + 4x<2*128>], loadA4;
+10:-:-:-:1      STS.64 [writeAs + 4x<3*128>], loadA6;
+    };
+</CODE>
+
+// Exchange the read and write regions: the read pointers move by -swapBuf
+// before the barrier, the write pointers by +swapBuf after it, and swapBuf is
+// then negated so the next exchange goes the other way.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+
+
+
+<CODE>
+    # Builds the main-loop scheduling tables and returns nothing.
+    #   @top    - one instruction setting P3 = (k >= $k_end) && P6.
+    #   %insert - instruction text keyed by slot names of the form j<N>c<M>.
+    # Both are consumed outside this block (presumably by the included
+    # hgemm_common_128x64.sass -- confirm).
+    # $k_end is 16 for the vector variant and 24 for the scalar one.
+    our $vec;
+    my $k_end = $vec ? 16 : 24;
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P6;\n");
+    our %insert =
+    (
+        ($vec ?
+            (
+        # Vector variant. P1 is inverted on every pass and P0 = (k >= $k_end).
+        # A is fetched only when P0 and P1 hold together with the row
+        # predicate (P5 for track1A, P4 for track0A); each fetch fills both
+        # loadA* and loadAA* (offset 2x<8>).
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P1, PT, !P1, PT, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND  P0, PT, k, $k_end, PT;\n",
+        j0c15 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P5;\n",
+
+        j0c10 => "--:-:2:-:1  \@P3 LDG.E.CI.64 loadB0, [trackB];\n",
+
+        j0c28 => "--:-:4:-:1  \@P2 LDG.E.CI.64 loadA2,  [track1A + 2x<0>];\n",
+        j0c30 => "--:-:4:-:1  \@P2 LDG.E.CI.64 loadAA2, [track1A + 2x<8>];\n",
+
+        j0c31 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P4;\n",
+
+        j0c44 => "--:-:5:-:1  \@P2 LDG.E.CI.64 loadA0,  [track0A + 2x<0>];\n",
+        j0c46 => "--:-:6:-:1  \@P2 LDG.E.CI.64 loadAA0, [track0A + 2x<8>];\n",
+
+        # When P1 is false, unpack the halves held in loadAA* into loadA0-7.
+        j3c53 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA7, loadAA3.H1;\n",
+        j3c57 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA6, loadAA1.H1;\n",
+        j3c61 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA5, loadAA3.H0;\n",
+        j4c1  => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA4, loadAA1.H0;\n",
+        j4c5  => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA3, loadAA2.H1;\n",
+        j4c9  => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA1, loadAA2.H0;\n",
+        j4c13 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA2, loadAA0.H1;\n",
+        j4c17 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA0, loadAA0.H0;\n",
+
+        # Unpack B in place (highest destination first) and store it.
+        j5c5  => "02:-:-:-:1  \@P3 F2F.F32.F16 loadB3, loadB1.H1;\n",
+        j5c9  => "--:-:-:-:1  \@P3 F2F.F32.F16 loadB2, loadB1.H0;\n",
+        j5c13 => "--:-:-:-:1  \@P3 F2F.F32.F16 loadB1, loadB0.H1;\n",
+        j5c17 => "--:-:2:-:1  \@P3 F2F.F32.F16 loadB0, loadB0.H0;\n",
+
+        j5c35 => "02:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        # When P1 is true, unpack the freshly loaded loadA* in place.
+        j5c53 => "08:-:-:-:1  \@P1 F2F.F32.F16 loadA7, loadA3.H1;\n",
+        j5c57 => "10:-:2:-:1  \@P1 F2F.F32.F16 loadA6, loadA1.H1;\n",
+        j5c61 => "--:-:-:-:1  \@P1 F2F.F32.F16 loadA5, loadA3.H0;\n",
+        j6c1  => "--:-:3:-:1  \@P1 F2F.F32.F16 loadA4, loadA1.H0;\n",
+        j6c5  => "--:-:-:-:1  \@P1 F2F.F32.F16 loadA3, loadA2.H1;\n",
+        j6c9  => "--:-:-:-:1  \@P1 F2F.F32.F16 loadA1, loadA2.H0;\n",
+        j6c13 => "--:-:4:-:1  \@P1 F2F.F32.F16 loadA2, loadA0.H1;\n",
+        j6c17 => "--:-:5:-:1  \@P1 F2F.F32.F16 loadA0, loadA0.H0;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS.64 [writeAs + 4x<3*128>], loadA6;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS.64 [writeAs + 4x<2*128>], loadA4;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS.64 [writeAs + 4x<1*128>], loadA2;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS.64 [writeAs + 4x<0*128>], loadA0;\n",
+
+        # The A pointers advance by 2x<16>, and only on passes where P1 holds.
+        j6c46 => "--:-:-:-:1  \@P1 IADD   track1A0.CC, track1A0, 2x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P1 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c55 => "20:-:-:-:1  \@P1 IADD   track0A0.CC, track0A0, 2x<16>;\n",
+        j7c61 => "--:-:-:-:1  \@P1 IADD.X track0A1,    track0A1, RZ;\n",
+
+            ) :
+            (
+        # Scalar variant. P0 = (k >= $k_end); P2 is first P0 && P4 for the
+        # track0A loads and is then recomputed as P0 && P5 for track1A.
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P4;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+
+        j0c10 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n",
+        j0c12 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n",
+        j0c14 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n",
+        j0c16 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n",
+
+        j0c33 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA0, [track0A + 2x<0>];\n",
+        j0c35 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA2, [track0A + 2x<1>];\n",
+        j0c37 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA4, [track0A + 2x<2>];\n",
+        j0c39 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA6, [track0A + 2x<3>];\n",
+
+        j0c41 => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P5;\n",
+
+        j1c29 => "--:-:3:-:1  \@P2 LDG.E.CI.S16 loadA1, [track1A + 2x<0>];\n",
+        j1c31 => "--:-:3:-:1  \@P2 LDG.E.CI.S16 loadA3, [track1A + 2x<1>];\n",
+        j1c33 => "--:-:3:-:1  \@P2 LDG.E.CI.S16 loadA5, [track1A + 2x<2>];\n",
+        j1c35 => "--:-:3:-:1  \@P2 LDG.E.CI.S16 loadA7, [track1A + 2x<3>];\n",
+
+        # Convert each register in place, then store to shared memory.
+        j5c8  => "20:-:-:-:1  \@P3 F2F.F32.F16 loadB0, loadB0;\n",
+        j5c12 => "--:-:-:-:1  \@P3 F2F.F32.F16 loadB1, loadB1;\n",
+        j5c16 => "--:-:-:-:1  \@P3 F2F.F32.F16 loadB2, loadB2;\n",
+        j5c20 => "--:-:6:-:1  \@P3 F2F.F32.F16 loadB3, loadB3;\n",
+
+        j5c39 => "20:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j5c53 => "02:-:-:-:1  \@P0 F2F.F32.F16 loadA0, loadA0;\n",
+        j5c57 => "04:-:2:-:1  \@P0 F2F.F32.F16 loadA1, loadA1;\n",
+        j5c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 loadA2, loadA2;\n",
+        j6c1  => "--:-:3:-:1  \@P0 F2F.F32.F16 loadA3, loadA3;\n",
+        j6c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 loadA4, loadA4;\n",
+        j6c9  => "--:-:4:-:1  \@P0 F2F.F32.F16 loadA5, loadA5;\n",
+        j6c13 => "--:-:-:-:1  \@P0 F2F.F32.F16 loadA6, loadA6;\n",
+        j6c17 => "--:-:5:-:1  \@P0 F2F.F32.F16 loadA7, loadA7;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS.64 [writeAs + 4x<0*128>], loadA0;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS.64 [writeAs + 4x<1*128>], loadA2;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS.64 [writeAs + 4x<2*128>], loadA4;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS.64 [writeAs + 4x<3*128>], loadA6;\n",
+
+        # The A pointers advance by 2x<8> on every pass where P0 holds.
+        j6c46 => "--:-:-:-:1  \@P0 IADD   track0A0.CC, track0A0, 2x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P0 IADD.X track0A1,    track0A1, RZ;\n",
+        j6c55 => "--:-:-:-:1  \@P0 IADD   track1A0.CC, track1A0, 2x<8>;\n",
+        j6c61 => "--:-:-:-:1  \@P0 IADD.X track1A1,    track1A1, RZ;\n",
+            )
+        ),
+
+        # Shared by both variants: advance trackB by param_ldb8 (64-bit add).
+        j5c46 => "--:-:-:-:1  \@P0 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j5c54 => "--:-:-:-:1  \@P0 IADD.X trackB1,    trackB1, RZ;\n",
+
+        # k drops by 8 unconditionally; when P0 holds, synchronize and
+        # exchange the read/write regions via swapBuf, then negate swapBuf.
+        j6c63 => "--:-:-:-:0      IADD32I k, k, -8;\n" .
+                 "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        # Loop while P0 holds. Only the scalar variant also branches to
+        # REMAINDER on P1; the vector variant uses P1 as its alternating flag.
+        ($vec ?
+            (j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n") :
+            (j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n")
+        ),
+    );
+    return;
+</CODE>
+
+<INCLUDE file="hgemm_common_128x64.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_16x64.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_16x64.sass
new file mode 100644
index 0000000..1dfb949
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_nn_16x64.sass
@@ -0,0 +1,1171 @@
+# Kernel: hgemm_nn_16x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Named constants: addr_zero and szShareA/szShareB are used below for shared-memory addressing, the rest name c[0x0][...] constant-bank addresses (gridDim*, param_*).
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<(16*64 + 32)*2 + 64*64*2>
+    szShareA   : (16*64 + 32)
+    szShareB   : (64*64)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+# Register names. Several groups share register numbers (0-63: cx, czero and part C; 64-95: j0/j1 operands, setup temporaries and shuffle_x), so aliased names must not be live at the same time.
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+      64-79 : j0Ay<0-7>, j0Bx<0-7>
+      80-95 : j1Ay<0-7>, j1Bx<0-7>
+
+      64-95 ~ lda, ldb, ldb8, tidAX, tidAY, tidBX, tidBY, tidAY<1-3>, tidBY<8|16|24|32|40|48|56>, tid16_8, tb, shiftAX, partialK, partialB, ldaz, ldbz, ta, txa, txb, txb<1-3>, xmad_ta
+
+     96-135 :  load0A<0-7>,  load0B<0-3>,  load1B<0-3>,  load2B<0-3>,  load3B<0-3>,  load4B<0-3>,  load5B<0-3>,  load6B<0-3>,  load7B<0-3>
+    136-153 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>, track4B<0-1>, track5B<0-1>, track6B<0-1>, track7B<0-1>
+
+    154-161 ~ swapBuf, readAs, readBs, writeAs, writeBs, k, ldb64
+    162-171 ~ tid, blkA, blkB, blkZ, writeCs, preds, tid16
+
+       0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3>
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+      96-99 : loadC<0-3>
+    100-103 : b<0-3>
+    104-107 : c<0-3>
+    108-109 : C<0-1>
+    110-161 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15
+
+</REGISTER_MAPPING>
+// Thread and block indices. Each S2R uses its own barrier number (1-4, third control field) and the first consumers below carry the matching wait masks 01/02/04/08.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb8,  ldb, 3; // ldb8 = ldb * 8, the step between consecutive trackB pointers
+--:-:-:-:1      SHL ldb64, ldb, 7; // ldb64 = ldb * 128, added to the trackB addresses inside LOOP
+// Zero registers 0-63: store zeros at addr_zero, then load them back into czero00, czero04, ... czero60.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// tidAX   = tid >> 3
+// tidAY   = (tid & 7) << 3
+// shiftAX = (tid & 7) << 2
+01:-:-:-:1      SHR.U32 tidAX,   tid,   3;
+--:-:-:-:1      LOP.AND tidAY,   tid,   7;
+--:-:-:-:1      SHL     shiftAX, tidAY, 2;
+--:-:-:-:1      SHL     tidAY,   tidAY, 3;
+
+// tidBX   = (tid & 15) << 2
+// tidBY   = tid >> 4
+01:-:-:-:1      LOP.AND tidBX, tid,  15;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+01:-:-:-:1      SHR.U32 tidBY, tid,   4;
+// tidBY8 .. tidBY56 = tidBY + 8 .. tidBY + 56, each compared against partialK further down
+--:-:-:-:1      IADD tidBY8,  tidBY, 8;
+--:-:-:-:1      IADD tidBY16, tidBY, 16;
+--:-:-:-:1      IADD tidBY24, tidBY, 24;
+--:-:-:-:1      IADD tidBY32, tidBY, 32;
+--:-:-:-:1      IADD tidBY40, tidBY, 40;
+--:-:-:-:1      IADD tidBY48, tidBY, 48;
+--:-:-:-:1      IADD tidBY56, tidBY, 56;
+
+// txa = blkA*16 + tidAX, ta = txa*lda + tidAY + blkZ*ldaz, track0A = param_A + ta * 2
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 4;
+--:-:-:-:1      XMAD.LO  ta, lda,  txa, tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ, ta;
+--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 1;
+// P2 = txa < param_m
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa, param_m, PT;
+
+// txb = blkB*64 + tidBX, tb = txb + ldb*tidBY + blkZ*ldbz, track0B = param_B + tb * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 6;
+--:-:-:-:1      XMAD.LO2 tb, ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb, ldbz, blkZ,  tb;
+// track1B .. track7B use the same formula with tb advanced by ldb8 before each one
+--:-:-:-:1      LEA      track0B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track1B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track2B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track2B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track3B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track3B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track4B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track4B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track5B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track5B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track6B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track6B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track7B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track7B1,    tb, param_B[1], RZ, 1;
+// P3 = txb < param_n
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb, param_n, PT;
+[+
+    our $vec; # package variable, value comes from outside this block. When false, also emit P4-P6 = (txb+1 .. txb+3) < param_n
+    return $vec ? '' : q{
+--:-:-:-:1      IADD txb1, txb, 1;
+--:-:-:-:1      IADD txb2, txb, 2;
+--:-:-:-:1      IADD txb3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb3, param_n, PT;
+    };
++]
+--:-:-:-:1      P2R preds, PR, RZ, 0x7c; // mask 0x7c selects predicate bits 2-6 (P2 .. P6), later written back with R2P
+
+// writeAs = (tidAY*16 + tidAX + shiftAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 4;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      SHL    writeAs, writeAs, 2;
+
+// writeBs = (tidBY*64 + tidBX) * 4 + 4x<szShareA>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA>, 2;
+
+// readAs = (tid & 1) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((tid >> 1) & 7) << 4
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHL     readBs, readBs, 4;
+
+// tid16 = tid & -16
+// tid16_8 = tid16 / 2 * 4 = tid16 * 2
+--:-:-:-:1      LOP.AND tid16,   tid,  -16;
+--:-:-:-:1      SHL     tid16_8, tid16, 1;
+
+// writeCs = (readAs + tid16*2) * 64 + readBs;
+--:-:-:-:1      ISCADD writeCs, tid16,   readAs, 1;
+--:-:-:-:1      ISCADD writeCs, writeCs, readBs, 6;
+
+// Each block of 16 threads works on 8 lines, readAs is also shifted over by 4
+// readAs += tid16_8 * 16 + tid16
+// readBs += tid16_8 * 64 + 4x<szShareA>
+--:-:-:-:1      ISCADD readAs, tid16_8, readAs, 4;
+--:-:-:-:1      ISCADD readBs, tid16_8, readBs, 6;
+--:-:-:-:1      IADD   readAs, tid16, readAs;
+--:-:-:-:1      IADD   readBs, readBs, 4x<szShareA>;
+// swapBuf = 4 * (szShareA + szShareB). It is added to the shared pointers and then negated at every buffer swap
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareA + szShareB>;
+
+// If k is not a multiple of 64 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 64 then make a full 64 line fetch.
+--:-:-:-:1      LOP.AND.Z P0, partialK, k, 63; // partialK = k & 63, P0 set when that is zero
+--:-:-:-:1  @P0 MOV partialK, 64;
+--:-:-:-:1      IADD k, k, -partialK; // k = amount left after the first fetch
+[+
+    our $vec; # true selects 128/64-bit LDG with zero fill read back from addr_zero, false selects per-element U16 LDG with MOV RZ fill
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY,   partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY,   partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY8,  partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY16, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY24, partialK, P3;
+<ORDERED>
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P1 LDG.E.CI.64  load0B, [track0B];
+--:-:3:-:1  @P4 LDG.E.CI.64  load1B, [track1B];
+--:-:4:-:1  @P5 LDG.E.CI.64  load2B, [track2B];
+--:-:4:-:1  @P6 LDG.E.CI.64  load3B, [track3B];
+</ORDERED>
+<ORDERED>
+--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:-:-:1 @!P1 LDS.U.64  load0B, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.64  load1B, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.64  load2B, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.64  load3B, [addr_zero];
+</ORDERED>
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY32, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY40, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY48, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY56, partialK, P3;
+<ORDERED>
+--:-:5:-:1  @P1 LDG.E.CI.64  load4B, [track4B];
+--:-:5:-:1  @P4 LDG.E.CI.64  load5B, [track5B];
+--:-:6:-:1  @P5 LDG.E.CI.64  load6B, [track6B];
+--:-:6:-:1  @P6 LDG.E.CI.64  load7B, [track7B];
+</ORDERED>
+<ORDERED>
+--:-:-:-:1 @!P1 LDS.U.64  load4B, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.64  load5B, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.64  load6B, [addr_zero];
+--:-:1:-:1 @!P6 LDS.U.64  load7B, [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY,  partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY1, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY2, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY3, partialK, P2;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:2:-:1  @P6 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0A0, RZ;
+--:-:-:-:1 @!P4 MOV load0A1, RZ;
+--:-:-:-:1 @!P5 MOV load0A2, RZ;
+--:-:-:-:1 @!P6 MOV load0A3, RZ;
+
+--:-:-:-:1      IADD tidAY,  tidAY,  4;
+--:-:-:-:1      IADD tidAY1, tidAY1, 4;
+--:-:-:-:1      IADD tidAY2, tidAY2, 4;
+--:-:-:-:1      IADD tidAY3, tidAY3, 4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY,  partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY1, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY2, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY3, partialK, P2;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P6 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0A4, RZ;
+--:-:-:-:1 @!P4 MOV load0A5, RZ;
+--:-:-:-:1 @!P5 MOV load0A6, RZ;
+--:-:-:-:1 @!P6 MOV load0A7, RZ;
+
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, partialK, PT;
+--:-:-:-:1  @P0 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0B0, RZ;
+--:-:-:-:1 @!P4 MOV load0B1, RZ;
+--:-:-:-:1 @!P5 MOV load0B2, RZ;
+--:-:-:-:1 @!P6 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY8, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P1 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load1B0, RZ;
+--:-:-:-:1 @!P4 MOV load1B1, RZ;
+--:-:-:-:1 @!P5 MOV load1B2, RZ;
+--:-:-:-:1 @!P6 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY16, partialK, PT;
+--:-:-:-:1  @P2 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P2 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load2B0, RZ;
+--:-:-:-:1 @!P4 MOV load2B1, RZ;
+--:-:-:-:1 @!P5 MOV load2B2, RZ;
+--:-:-:-:1 @!P6 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY24, partialK, PT;
+--:-:-:-:1  @P0 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load3B0, RZ;
+--:-:-:-:1 @!P4 MOV load3B1, RZ;
+--:-:-:-:1 @!P5 MOV load3B2, RZ;
+--:-:-:-:1 @!P6 MOV load3B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY32, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P1 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load4B0, RZ;
+--:-:-:-:1 @!P4 MOV load4B1, RZ;
+--:-:-:-:1 @!P5 MOV load4B2, RZ;
+--:-:-:-:1 @!P6 MOV load4B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY40, partialK, PT;
+--:-:-:-:1  @P2 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P2 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load5B3, [track5B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load5B0, RZ;
+--:-:-:-:1 @!P4 MOV load5B1, RZ;
+--:-:-:-:1 @!P5 MOV load5B2, RZ;
+--:-:-:-:1 @!P6 MOV load5B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY48, partialK, PT;
+--:-:-:-:1  @P0 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load6B1, [track6B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load6B0, RZ;
+--:-:-:-:1 @!P4 MOV load6B1, RZ;
+--:-:-:-:1 @!P5 MOV load6B2, RZ;
+--:-:-:-:1 @!P6 MOV load6B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY56, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P1 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load7B0, RZ;
+--:-:-:-:1 @!P4 MOV load7B1, RZ;
+--:-:-:-:1 @!P5 MOV load7B2, RZ;
+--:-:-:-:1 @!P6 MOV load7B3, RZ;
+    };
++]
+// partialB = partialK * ldb
+--:-:-:-:1      XMAD.LO2 partialB, ldb, partialK, RZ;
+// P1 = k >= 64 before the decrement. If set, P2 .. P6 are rewritten from preds, otherwise they are cleared
+--:-:-:-:1      ISETP.GE.AND P1, PT, k, 64, PT;
+--:-:-:-:1      IADD k, k, -64;
+--:-:-:-:1  @P1 R2P PR, preds, 0x7c;
+--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c;
+</SCHEDULE_BLOCK>
+// First A tile: convert fp16 to fp32 in registers, store to shared memory at writeAs, and advance track0A by partialK elements.
+[+
+    our $vec; # true: unpack the .H0/.H1 halves of load0A0-3, highest destination first, so each source register is consumed before it is written
+    return $vec ? q{
+03:-:-:-:1      F2F.F32.F16 load0A7, load0A3.H1;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A3.H0;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A2.H1;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A2.H0;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0.H0;
+    } : q{
+02:-:-:-:1      F2F.F32.F16 load0A7, load0A7;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A6;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A5;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A4;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A3;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0;
+    };
++]
+--:-:-:-:0      LEA track0A0.CC, partialK, track0A0, 1; // track0A += partialK * 2 (low word, sets carry)
+01:-:-:-:1      STS [writeAs + 4x<7*16>], load0A7;
+--:-:-:-:1      STS [writeAs + 4x<6*16>], load0A6;
+--:-:-:-:1      STS [writeAs + 4x<5*16>], load0A5;
+--:-:-:-:1      STS [writeAs + 4x<4*16>], load0A4;
+02:-:-:-:1      STS [writeAs + 4x<3*16>], load0A3;
+--:-:-:-:1      STS [writeAs + 4x<2*16>], load0A2;
+--:-:-:-:1      STS [writeAs + 4x<1*16>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<0*16>], load0A0;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ; // high word of the track0A update
+// B loads 0 and 1: convert to fp32, store to shared memory at writeBs offsets 0*64 and 8*64, advance track0B and track1B by partialB elements.
+[+
+    our $vec;
+    return $vec ? q{
+04:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:1:-:1      F2F.F32.F16 load0B0, load0B0.H0;
+--:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+    } : q{
+04:-:-:-:1      F2F.F32.F16 load0B0, load0B0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:1:-:1      F2F.F32.F16 load0B3, load0B3;
+--:-:-:-:1      F2F.F32.F16 load1B0, load1B0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:2:-:1      F2F.F32.F16 load1B3, load1B3;
+    };
++]
+--:-:-:-:0      LEA track0B0.CC, partialB, track0B0, 1;
+01:-:-:-:6      STS.128 [writeBs + 4x<0*64>], load0B;
+--:-:-:-:1      IADD.X track0B1, track0B1, RZ;
+
+--:-:-:-:0      LEA track1B0.CC, partialB, track1B0, 1;
+02:-:-:-:6      STS.128 [writeBs + 4x<8*64>], load1B;
+--:-:-:-:0      IADD.X track1B1, track1B1, RZ;
+// B loads 2 and 3: convert to fp32, store to shared memory at writeBs offsets 16*64 and 24*64, advance track2B and track3B by partialB elements.
+[+
+    our $vec;
+    return $vec ? q{
+08:-:-:-:1      F2F.F32.F16 load2B3, load2B1.H1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B1.H0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B0.H1;
+--:-:1:-:1      F2F.F32.F16 load2B0, load2B0.H0;
+--:-:-:-:1      F2F.F32.F16 load3B3, load3B1.H1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B1.H0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B0.H1;
+--:-:2:-:1      F2F.F32.F16 load3B0, load3B0.H0;
+    } : q{
+08:-:-:-:1      F2F.F32.F16 load2B0, load2B0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B2;
+--:-:1:-:1      F2F.F32.F16 load2B3, load2B3;
+--:-:-:-:1      F2F.F32.F16 load3B0, load3B0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B2;
+--:-:2:-:1      F2F.F32.F16 load3B3, load3B3;
+    };
++]
+--:-:-:-:0      LEA track2B0.CC, partialB, track2B0, 1;
+01:-:-:-:6      STS.128 [writeBs + 4x<16*64>], load2B;
+--:-:-:-:1      IADD.X track2B1, track2B1, RZ;
+
+--:-:-:-:0      LEA track3B0.CC, partialB, track3B0, 1;
+02:-:-:-:6      STS.128 [writeBs + 4x<24*64>], load3B;
+--:-:-:-:0      IADD.X track3B1, track3B1, RZ;
+// B loads 4 and 5: convert to fp32, store to shared memory at writeBs offsets 32*64 and 40*64, advance track4B and track5B by partialB elements.
+[+
+    our $vec;
+    return $vec ? q{
+10:-:-:-:1      F2F.F32.F16 load4B3, load4B1.H1;
+--:-:-:-:1      F2F.F32.F16 load4B2, load4B1.H0;
+--:-:-:-:1      F2F.F32.F16 load4B1, load4B0.H1;
+--:-:1:-:1      F2F.F32.F16 load4B0, load4B0.H0;
+--:-:-:-:1      F2F.F32.F16 load5B3, load5B1.H1;
+--:-:-:-:1      F2F.F32.F16 load5B2, load5B1.H0;
+--:-:-:-:1      F2F.F32.F16 load5B1, load5B0.H1;
+--:-:2:-:1      F2F.F32.F16 load5B0, load5B0.H0;
+    } : q{
+10:-:-:-:1      F2F.F32.F16 load4B0, load4B0;
+--:-:-:-:1      F2F.F32.F16 load4B1, load4B1;
+--:-:-:-:1      F2F.F32.F16 load4B2, load4B2;
+--:-:1:-:1      F2F.F32.F16 load4B3, load4B3;
+--:-:-:-:1      F2F.F32.F16 load5B0, load5B0;
+--:-:-:-:1      F2F.F32.F16 load5B1, load5B1;
+--:-:-:-:1      F2F.F32.F16 load5B2, load5B2;
+--:-:2:-:1      F2F.F32.F16 load5B3, load5B3;
+    };
++]
+--:-:-:-:0      LEA track4B0.CC, partialB, track4B0, 1;
+01:-:-:-:6      STS.128 [writeBs + 4x<32*64>], load4B;
+--:-:-:-:1      IADD.X track4B1, track4B1, RZ;
+
+--:-:-:-:0      LEA track5B0.CC, partialB, track5B0, 1;
+02:-:-:-:6      STS.128 [writeBs + 4x<40*64>], load5B;
+--:-:-:-:0      IADD.X track5B1, track5B1, RZ;
+// B loads 6 and 7: convert to fp32, store to shared memory at writeBs offsets 48*64 and 56*64, advance track6B and track7B by partialB elements.
+[+
+    our $vec;
+    return $vec ? q{
+20:-:-:-:1      F2F.F32.F16 load6B3, load6B1.H1;
+--:-:-:-:1      F2F.F32.F16 load6B2, load6B1.H0;
+--:-:-:-:1      F2F.F32.F16 load6B1, load6B0.H1;
+--:-:1:-:1      F2F.F32.F16 load6B0, load6B0.H0;
+--:-:-:-:1      F2F.F32.F16 load7B3, load7B1.H1;
+--:-:-:-:1      F2F.F32.F16 load7B2, load7B1.H0;
+--:-:-:-:1      F2F.F32.F16 load7B1, load7B0.H1;
+--:-:2:-:1      F2F.F32.F16 load7B0, load7B0.H0;
+    } : q{
+20:-:-:-:1      F2F.F32.F16 load6B0, load6B0;
+--:-:-:-:1      F2F.F32.F16 load6B1, load6B1;
+--:-:-:-:1      F2F.F32.F16 load6B2, load6B2;
+--:-:1:-:1      F2F.F32.F16 load6B3, load6B3;
+--:-:-:-:1      F2F.F32.F16 load7B0, load7B0;
+--:-:-:-:1      F2F.F32.F16 load7B1, load7B1;
+--:-:-:-:1      F2F.F32.F16 load7B2, load7B2;
+--:-:2:-:1      F2F.F32.F16 load7B3, load7B3;
+    };
++]
+--:-:-:-:0      LEA track6B0.CC, partialB, track6B0, 1;
+01:-:-:-:6      STS.128 [writeBs + 4x<48*64>], load6B;
+--:-:-:-:1      IADD.X track6B1, track6B1, RZ;
+
+--:-:-:-:0      LEA track7B0.CC, partialB, track7B0, 1;
+02:-:-:-:6      STS.128 [writeBs + 4x<56*64>], load7B;
+--:-:-:-:0      IADD.X track7B1, track7B1, RZ;
+// Synchronise the block, move the write pointers by swapBuf and negate swapBuf, then load the first j0 operands from readAs and readBs.
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*16 + 00>];
+--:-:-:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*16 + 08>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>];
+// Issue the global loads for the next tile before entering LOOP, predicated on the P2 .. P6 values prepared at the end of the schedule block.
+[+
+    our $vec; # true: one 128-bit A load under P2 and eight 64-bit B loads under P3. false: U16 loads, A under P2 and B columns under P3-P6
+    return $vec ? q{
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.64  load0B, [track0B];
+--:-:3:-:1  @P3 LDG.E.CI.64  load1B, [track1B];
+--:-:4:-:1  @P3 LDG.E.CI.64  load2B, [track2B];
+--:-:4:-:1  @P3 LDG.E.CI.64  load3B, [track3B];
+--:-:5:-:1  @P3 LDG.E.CI.64  load4B, [track4B];
+--:-:5:-:1  @P3 LDG.E.CI.64  load5B, [track5B];
+--:-:6:-:1  @P3 LDG.E.CI.64  load6B, [track6B];
+--:-:6:-:1  @P3 LDG.E.CI.64  load7B, [track7B];
+    } : q{
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load5B3, [track5B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load6B1, [track6B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>];
+    };
++]
+
+LOOP:
+
+[+
+    our $vec;
+    our %insert =
+    (
+        j0c8   => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, RZ, PT;\n",
+        j0c10  => "--:-:-:-:1      ISETP.GE.AND P1, PT, k, 64, PT;\n" .
+                  "--:-:-:-:1      IADD k, k, -64;\n",
+
+        j0c23  => "--:-:-:-:1  \@P1 R2P PR, preds, 0x7c;\n",
+        j0c24  => "--:-:-:-:1 \@!P1 R2P PR, RZ,    0x7c;\n",
+
+        j2c32  => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 2x<64>;\n",
+        j2c37  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+
+        j3c25  => "--:-:-:-:1  \@P3 IADD   track0B0.CC, track0B0, ldb64;\n",
+        j3c30  => "--:-:-:-:1  \@P3 IADD.X track0B1,    track0B1, RZ;\n",
+        j3c32  => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, ldb64;\n",
+        j3c37  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+
+        j4c25  => "--:-:-:-:1  \@P3 IADD   track2B0.CC, track2B0, ldb64;\n",
+        j4c30  => "--:-:-:-:1  \@P3 IADD.X track2B1,    track2B1, RZ;\n",
+        j4c32  => "--:-:-:-:1  \@P3 IADD   track3B0.CC, track3B0, ldb64;\n",
+        j4c37  => "--:-:-:-:1  \@P3 IADD.X track3B1,    track3B1, RZ;\n",
+
+        j5c25  => "--:-:-:-:1  \@P3 IADD   track4B0.CC, track4B0, ldb64;\n",
+        j5c30  => "--:-:-:-:1  \@P3 IADD.X track4B1,    track4B1, RZ;\n",
+        j5c32  => "--:-:-:-:1  \@P3 IADD   track5B0.CC, track5B0, ldb64;\n",
+        j5c37  => "--:-:-:-:1  \@P3 IADD.X track5B1,    track5B1, RZ;\n",
+
+        j6c25  => "--:-:-:-:1  \@P3 IADD   track6B0.CC, track6B0, ldb64;\n",
+        j6c30  => "--:-:-:-:1  \@P3 IADD.X track6B1,    track6B1, RZ;\n",
+        j6c32  => "--:-:-:-:1  \@P3 IADD   track7B0.CC, track7B0, ldb64;\n",
+        j6c37  => "--:-:-:-:1  \@P3 IADD.X track7B1,    track7B1, RZ;\n",
+
+        j6c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j2c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<7*16>], load0A7;\n",
+        j2c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*16>], load0A6;\n",
+        j2c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*16>], load0A5;\n",
+        j2c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*16>], load0A4;\n",
+        j2c24  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*16>], load0A3;\n",
+        j2c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*16>], load0A2;\n",
+        j2c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*16>], load0A1;\n",
+        j2c30  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<0*16>], load0A0;\n",
+
+        j3c16  => "04:-:-:-:1  \@P0 STS.128 [writeBs + 4x< 0*64>], load0B;\n",
+        j3c20  => "--:3:-:-:1  \@P0 STS.128 [writeBs + 4x< 8*64>], load1B;\n",
+
+        j4c16  => "08:-:-:-:1  \@P0 STS.128 [writeBs + 4x<16*64>], load2B;\n",
+        j4c20  => "--:4:-:-:1  \@P0 STS.128 [writeBs + 4x<24*64>], load3B;\n",
+
+        j5c16  => "10:-:-:-:1  \@P0 STS.128 [writeBs + 4x<32*64>], load4B;\n",
+        j5c20  => "--:5:-:-:1  \@P0 STS.128 [writeBs + 4x<40*64>], load5B;\n",
+
+        j6c16  => "20:-:-:-:1  \@P0 STS.128 [writeBs + 4x<48*64>], load6B;\n",
+        j6c20  => "--:6:-:-:1  \@P0 STS.128 [writeBs + 4x<56*64>], load7B;\n",
+
+        ($vec ?
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j1c63 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j2c36 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j2c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j2c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j2c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n",
+                j2c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j3c36 => "08:-:-:-:1  \@P0 F2F.F32.F16 load2B3, load2B1.H1;\n",
+                j3c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n",
+                j3c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B0.H1;\n",
+                j3c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n",
+                j3c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n",
+
+                j4c36 => "10:-:-:-:1  \@P0 F2F.F32.F16 load4B3, load4B1.H1;\n",
+                j4c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B2, load4B1.H0;\n",
+                j4c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B1, load4B0.H1;\n",
+                j4c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B0, load4B0.H0;\n",
+                j4c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B3, load5B1.H1;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B2, load5B1.H0;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B1, load5B0.H1;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load5B0, load5B0.H0;\n",
+
+                j5c36 => "20:-:-:-:1  \@P0 F2F.F32.F16 load6B3, load6B1.H1;\n",
+                j5c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B2, load6B1.H0;\n",
+                j5c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B1, load6B0.H1;\n",
+                j5c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B0, load6B0.H0;\n",
+                j5c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B3, load7B1.H1;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B2, load7B1.H0;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B1, load7B0.H1;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load7B0, load7B0.H0;\n",
+
+                j2c61 => "02:-:2:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j3c60 => "04:-:-:-:1  \@P3 LDG.E.CI.64  load0B, [track0B];\n",
+                j3c62 => "--:-:3:-:1  \@P3 LDG.E.CI.64  load1B, [track1B];\n",
+                j4c60 => "08:-:-:-:1  \@P3 LDG.E.CI.64  load2B, [track2B];\n",
+                j4c62 => "--:-:4:-:1  \@P3 LDG.E.CI.64  load3B, [track3B];\n",
+                j5c60 => "10:-:-:-:1  \@P3 LDG.E.CI.64  load4B, [track4B];\n",
+                j5c62 => "--:-:5:-:1  \@P3 LDG.E.CI.64  load5B, [track5B];\n",
+                j6c60 => "20:-:-:-:1  \@P3 LDG.E.CI.64  load6B, [track6B];\n",
+                j6c62 => "--:-:6:-:1  \@P3 LDG.E.CI.64  load7B, [track7B];\n",
+            ) :
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A0, load0A0;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A1;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A2;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A3;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A4;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A5;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A6;\n",
+                j1c63 => "--:2:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A7;\n",
+
+                j2c36 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B0, load0B0;\n",
+                j2c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B1;\n",
+                j2c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B2;\n",
+                j2c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B3;\n",
+                j2c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B0, load1B0;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B1;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B2;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load1B3, load1B3;\n",
+
+                j3c36 => "08:-:-:-:1  \@P0 F2F.F32.F16 load2B0, load2B0;\n",
+                j3c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B1;\n",
+                j3c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B2;\n",
+                j3c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B3, load2B3;\n",
+                j3c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B0, load3B0;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B1;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B2;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load3B3, load3B3;\n",
+
+                j4c36 => "10:-:-:-:1  \@P0 F2F.F32.F16 load4B0, load4B0;\n",
+                j4c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B1, load4B1;\n",
+                j4c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B2, load4B2;\n",
+                j4c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B3, load4B3;\n",
+                j4c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B0, load5B0;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B1, load5B1;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B2, load5B2;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load5B3, load5B3;\n",
+
+                j5c36 => "20:-:-:-:1  \@P0 F2F.F32.F16 load6B0, load6B0;\n",
+                j5c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B1, load6B1;\n",
+                j5c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B2, load6B2;\n",
+                j5c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B3, load6B3;\n",
+                j5c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B0, load7B0;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B1, load7B1;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B2, load7B2;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load7B3, load7B3;\n",
+
+                j2c48 => "02:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j2c50 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j2c52 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j2c54 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+                j2c56 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n",
+                j2c58 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n",
+                j2c60 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n",
+                j2c62 => "--:-:2:-:1  \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n",
+
+                j3c48 => "04:-:-:-:1  \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
+                j3c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j3c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j3c54 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+                j3c56 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j3c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j3c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j3c62 => "--:-:3:-:1  \@P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+
+                j4c48 => "08:-:-:-:1  \@P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n",
+                j4c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n",
+                j4c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n",
+                j4c54 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n",
+                j4c56 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n",
+                j4c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n",
+                j4c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n",
+                j4c62 => "--:-:4:-:1  \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n",
+
+                j5c48 => "10:-:-:-:1  \@P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>];\n",
+                j5c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>];\n",
+                j5c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>];\n",
+                j5c54 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>];\n",
+                j5c56 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>];\n",
+                j5c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>];\n",
+                j5c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>];\n",
+                j5c62 => "--:-:5:-:1  \@P6 LDG.E.CI.U16 load5B3, [track5B + 2x<3>];\n",
+
+                j6c48 => "20:-:-:-:1  \@P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>];\n",
+                j6c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load6B1, [track6B + 2x<1>];\n",
+                j6c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>];\n",
+                j6c54 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>];\n",
+                j6c56 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>];\n",
+                j6c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>];\n",
+                j6c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>];\n",
+                j6c62 => "--:-:6:-:1  \@P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>];\n",
+            )
+        ),
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out = '';
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*16 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*16 + 08>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4
+--:-:-:-:1      LOP.AND tid15, tid, 15;
+--:-:-:-:1      SHR.U32 tid16, tid, 4;
+--:-:-:-:1      SHL     tid15, tid15, 2;
+--:-:-:-:1      ISCADD readCs, tid16, tid15, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*64 + tid15;
+--:-:-:-:1      ISCADD cx, blkB, tid15, 6;
+--:-:-:-:1      IADD   cx1, cx, 1;
+--:-:-:-:1      IADD   cx2, cx, 2;
+--:-:-:-:1      IADD   cx3, cx, 3;
+
+// cy = blkA*16 + tid16
+--:-:-:-:1      ISCADD cy, blkA, tid16, 4;
+
+// C += (blkZ*ldcz + cy*ldc + cx) * 2;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      SHL  ldc8, ldc, 4;
+
+--:-:-:-:1      XMAD.LO  ci, cy, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C1,    ci, param_C[1], RZ, 1;
+
+// P0..P3 = (cx + 0..3) < n
+--:-:-:-:1      ISETP.LT.AND P0, PT, cx,  param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cx1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cx2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cx3, param_n, PT;
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+
+// P4 = cy < m
+--:-:-:-:1      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+// P5 = beta != 0 && P4
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P4;
+
+// P6 = Apply relu
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+
+// Init beta preds
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:0      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:0      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_C;
+
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha;
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:0      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:0      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_C;
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+[+
+    our $vec;  # true: emit one 64-bit load of C under P0; false: four U16 loads under P0-P3, each register preset to RZ when its predicate is off
+    return $vec ? q{
+--:-:1:-:1  @P0 LDG.E.64 loadC, [C];
+    } : q{
+--:-:-:-:0 @!P0 MOV loadC0, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>];
+--:-:-:-:0 @!P1 MOV loadC1, RZ;
+--:-:-:-:1  @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>];
+--:-:-:-:0 @!P2 MOV loadC2, RZ;
+--:-:-:-:1  @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>];
+--:-:-:-:0 @!P3 MOV loadC3, RZ;
+--:-:1:-:1  @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>];
+    };
++]
+
+// Restore output preds
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,    0x0f;
+
+--:-:-:-:1      LDS.U.128 part0C, [readCs + 4x<0*8*64>];
+--:-:2:-:1      LDS.U.128 part1C, [readCs + 4x<1*8*64>];
+--:-:-:-:1      LDS.U.128 part2C, [readCs + 4x<2*8*64>];
+--:-:3:-:1      LDS.U.128 part3C, [readCs + 4x<3*8*64>];
+--:-:-:-:1      LDS.U.128 part4C, [readCs + 4x<4*8*64>];
+--:-:4:-:1      LDS.U.128 part5C, [readCs + 4x<5*8*64>];
+--:-:-:-:1      LDS.U.128 part6C, [readCs + 4x<6*8*64>];
+--:-:5:-:1      LDS.U.128 part7C, [readCs + 4x<7*8*64>];
+
+<SCHEDULE_BLOCK>
+02:-:-:-:1  @P0 FADD part0C0, part0C0, part1C0;
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part1C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part1C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part1C3;
+
+04:-:-:-:1  @P0 FADD part2C0, part2C0, part3C0;
+--:-:-:-:1  @P1 FADD part2C1, part2C1, part3C1;
+--:-:-:-:1  @P2 FADD part2C2, part2C2, part3C2;
+--:-:-:-:1  @P3 FADD part2C3, part2C3, part3C3;
+
+08:-:-:-:1  @P0 FADD part4C0, part4C0, part5C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part5C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part5C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part5C3;
+
+10:-:-:-:1  @P0 FADD part6C0, part6C0, part7C0;
+--:-:-:-:1  @P1 FADD part6C1, part6C1, part7C1;
+--:-:-:-:1  @P2 FADD part6C2, part6C2, part7C2;
+--:-:-:-:1  @P3 FADD part6C3, part6C3, part7C3;
+
+--:-:-:-:1  @P0 FADD part0C0, part0C0, part2C0;
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part2C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part2C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part2C3;
+
+--:-:-:-:1  @P0 FADD part4C0, part4C0, part6C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part6C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part6C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part6C3;
+
+--:-:-:-:1  @P0 FADD c0, part0C0, part4C0;
+--:-:-:-:1  @P1 FADD c1, part0C1, part4C1;
+--:-:-:-:1  @P2 FADD c2, part0C2, part4C2;
+--:-:-:-:1  @P3 FADD c3, part0C3, part4C3;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:0      IADD cy, cy, 8;
+
+[+
+    our $vec;  # true: convert the .H0/.H1 halves of loadC0 and loadC1 into b0-b3; false: convert loadC0-loadC3 one register each
+    return $vec ? q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0.H0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC0.H1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC1.H0;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC1.H1;
+    } : q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC2;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC3;
+    };
++]
+
+01:-:-:-:1  @P5 FFMA c0, b0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, b1, beta, c1;
+04:-:-:-:1  @P5 FFMA c2, b2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, b3, beta, c3;
+
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:0      ISETP.LT.AND P5, PT, cy, param_m, P5;
+
+--:-:1:-:1  @P0 F2F.F16.F32 c0, c0;
+--:-:2:-:1  @P1 F2F.F16.F32 c1, c1;
+
+--:-:-:-:0      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+--:-:3:-:1  @P2 F2F.F16.F32 c2, c2;
+--:-:4:-:1  @P3 F2F.F16.F32 c3, c3;
+
+[+
+    our $vec;  # true: merge c0-c3 pairwise with BFI and emit one 64-bit store under P0; false: four U16 stores under P0-P3
+    return $vec ? q{
+03:-:-:-:2  @P0 BFI c0, c1, 0x1010, c0;
+0c:-:-:-:2  @P0 BFI c1, c3, 0x1010, c2;
+
+--:1:-:-:1  @P0 STG.E.CG.64 [C], c;
+    } : q{
+01:-:-:-:1  @P0 STG.E.U16 [C + 2x<0>], c0;
+02:-:-:-:1  @P1 STG.E.U16 [C + 2x<1>], c1;
+04:-:-:-:1  @P2 STG.E.U16 [C + 2x<2>], c2;
+08:1:-:-:1  @P3 STG.E.U16 [C + 2x<3>], c3;
+    };
++]
+
+// Restore beta preds
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+01:-:-:-:6      IADD   C0.CC, C0, ldc8;
+--:-:-:-:0      IADD.X C1,    C1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_32x128.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_32x128.sass
new file mode 100644
index 0000000..8c4510d
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_nn_32x128.sass
@@ -0,0 +1,562 @@
+# Kernel: hgemm_nn_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*16*2 + (32*16 + 32)*2>
+    szShareA : (32*16 + 32)
+    szShareB : (128*16)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    32-79 ~ tidAX, tidBX, lda, ldb, ldb4, ldaz, ldbz, tid1, tid3, tid96, ta, tb0, tb1, tb2, tb3, xmad_ta, xmad_tb, shiftAX, tidAY<1-3>, tidBY<1-3>, txb<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadA<0-3>
+      84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3>
+
+    100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+
+    110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txa, txb
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkB, SR_CTAID.Z;
+--:-:3:-:1      S2R blkA, SR_CTAID.Y;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 4;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb4,  ldb, 2;
+--:-:-:-:1      SHL ldb16, ldb, 5;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;  # emits eight LDS.U.128 lines targeting czero00, czero04, ..., czero28
+</CODE>
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 2
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,  2;
+01:-:-:-:1      LOP.AND tid3,    tid,  3;
+--:-:-:-:1      SHL     tidAY,   tid3, 2;
+--:-:-:-:1      SHL     shiftAX, tid3, 3;
+
+// tidBX = (tid & 31) << 2
+// tidBY = (tid >> 5)
+01:-:-:-:1      LOP.AND tidBX, tid,   31;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   5;
+
+// trackA += (blkZ*ldaz + (blkA*32 + tidAX) * lda + tidAY) * 2
+04:-:-:-:1      ISCADD   txa, blkA, tidAX, 5;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,  tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 1;
+
+// trackB += (blkZ*ldbz + blkB*128 + tidBX + ldb*tidBY) * 2
+02:-:-:-:1      ISCADD   txb, blkB, tidBX, 7;
+--:-:-:-:1      XMAD.LO2 tb0, ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb0, ldbz, blkZ,  tb0;
+--:-:-:-:1      IADD     tb1, tb0, ldb4;
+--:-:-:-:1      IADD     tb2, tb1, ldb4;
+--:-:-:-:1      IADD     tb3, tb2, ldb4;
+
+--:-:-:-:1      LEA      track0B0.CC, tb0, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb0, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track1B0.CC, tb1, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb1, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track2B0.CC, tb2, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track2B1,    tb2, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track3B0.CC, tb3, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track3B1,    tb3, param_B[1], RZ, 1;
+
+// writeAs = (tidAY*32 + tidAX + shiftAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*128 + tidBX) * 4
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 7;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    16;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4
+01:-:-:-:1      LOP.AND tid96,  tid,    96;
+--:-:-:-:1      SHR.U32 tid96,  tid96,  2;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readBs, readBs, tid96;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      IADD tidBY1, tidBY, 4;
+--:-:-:-:1      IADD tidBY2, tidBY, 8;
+--:-:-:-:1      IADD tidBY3, tidBY, 12;
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidBY3, k, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY,  k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.64 load0B, [track0B];
+--:-:2:-:1  @P1 LDG.E.CI.64 load1B, [track1B];
+--:-:3:-:1  @P2 LDG.E.CI.64 load2B, [track2B];
+--:-:4:-:1  @P3 LDG.E.CI.64 load3B, [track3B];
+--:-:5:-:1  @P4 LDG.E.CI.64 loadA,  [trackA];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P0 LDS.U.64 load0B, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.64 load1B, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.64 load2B, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.64 load3B, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.64 loadA,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:1:-:1  @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:1:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:1:-:1  @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY1, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY2, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P6;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:3:-:1  @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B0, RZ;
+--:-:-:-:1 @!P1 MOV load2B1, RZ;
+--:-:-:-:1 @!P2 MOV load2B2, RZ;
+--:-:-:-:1 @!P3 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY3, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:4:-:1  @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:4:-:1  @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B0, RZ;
+--:-:-:-:1 @!P1 MOV load3B1, RZ;
+--:-:-:-:1 @!P2 MOV load3B2, RZ;
+--:-:-:-:1 @!P3 MOV load3B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];
+--:-:5:-:1  @P1 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];
+--:-:5:-:1  @P2 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P0, RZ, k, 15;
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 16, P0;
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+21:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:1:-:1      F2F.F32.F16 load0B0, load0B0.H0;
+
+02:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+
+04:-:-:-:1      F2F.F32.F16 load2B3, load2B1.H1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B1.H0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B0.H1;
+--:-:3:-:1      F2F.F32.F16 load2B0, load2B0.H0;
+
+08:-:-:-:1      F2F.F32.F16 load3B3, load3B1.H1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B1.H0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B0.H1;
+--:-:4:-:1      F2F.F32.F16 load3B0, load3B0.H0;
+
+10:-:-:-:1      F2F.F32.F16 loadA3, loadA1.H1;
+--:-:-:-:1      F2F.F32.F16 loadA2, loadA1.H0;
+--:-:-:-:1      F2F.F32.F16 loadA1, loadA0.H1;
+--:-:5:-:1      F2F.F32.F16 loadA0, loadA0.H0;
+    } : q{
+21:-:-:-:1      F2F.F32.F16 load0B0, load0B0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:1:-:1      F2F.F32.F16 load0B3, load0B3;
+
+02:-:-:-:1      F2F.F32.F16 load1B0, load1B0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:2:-:1      F2F.F32.F16 load1B3, load1B3;
+
+04:-:-:-:1      F2F.F32.F16 load2B0, load2B0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B2;
+--:-:3:-:1      F2F.F32.F16 load2B3, load2B3;
+
+08:-:-:-:1      F2F.F32.F16 load3B0, load3B0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B2;
+--:-:4:-:1      F2F.F32.F16 load3B3, load3B3;
+
+10:-:-:-:1      F2F.F32.F16 loadA0, loadA0;
+--:-:-:-:1      F2F.F32.F16 loadA1, loadA1;
+--:-:-:-:1      F2F.F32.F16 loadA2, loadA2;
+--:-:5:-:1      F2F.F32.F16 loadA3, loadA3;
+    };
+</CODE>
+
+01:-:-:-:1      STS.128 [writeBs + 4x<0*128>], load0B;
+--:-:-:-:6      IADD   track0B0.CC, track0B0, ldb16;
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ;
+
+02:-:-:-:1      STS.128 [writeBs + 4x<4*128>], load1B;
+--:-:-:-:6      IADD   track1B0.CC, track1B0, ldb16;
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;
+
+04:-:-:-:1      STS.128 [writeBs + 4x<8*128>], load2B;
+--:-:-:-:6      IADD   track2B0.CC, track2B0, ldb16;
+--:-:-:-:0      IADD.X track2B1,    track2B1, RZ;
+
+08:-:-:-:1      STS.128 [writeBs + 4x<12*128>], load3B;
+--:-:-:-:6      IADD   track3B0.CC, track3B0, ldb16;
+--:-:-:-:0      IADD.X track3B1,    track3B1, RZ;
+
+10:-:-:-:1      STS [writeAs + 4x<0*32>], loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*32>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*32>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*32>], loadA3;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD.X trackA1,    trackA1, RZ;
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.CI.64 load0B, [track0B];
+--:-:4:-:1  @P3 LDG.E.CI.64 load1B, [track1B];
+--:-:5:-:1  @P5 LDG.E.CI.64 load2B, [track2B];
+--:-:5:-:1  @P5 LDG.E.CI.64 load3B, [track3B];
+--:-:6:-:1  @P6 LDG.E.CI.64 loadA,  [trackA];
+    } : q{
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec;
+    our $shiftAX = 1;
+    our $shiftBX = 0;
+    our %insert =
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        j3c6   => "04:3:-:-:1  \@P0 STS.128 [writeBs + 4x< 0*128>], load0B;\n",
+        j5c6   => "08:4:-:-:1  \@P0 STS.128 [writeBs + 4x< 4*128>], load1B;\n",
+        j7c6   => "10:-:-:-:1  \@P0 STS.128 [writeBs + 4x< 8*128>], load2B;\n",
+        j9c6   => "10:5:-:-:1  \@P0 STS.128 [writeBs + 4x<12*128>], load3B;\n",
+        j11c6  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<3*32>], loadA3;\n",
+        j11c8  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], loadA2;\n",
+        j11c10 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], loadA1;\n",
+        j11c12 => "--:6:-:-:1  \@P0 STS [writeAs + 4x<0*32>], loadA0;\n",
+
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0B0.CC, track0B0, ldb16;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0B1,    track0B1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, ldb16;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P5 IADD   track2B0.CC, track2B0, ldb16;\n",
+        j7c13  => "--:-:-:-:1  \@P5 IADD.X track2B1,    track2B1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3B0.CC, track3B0, ldb16;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3B1,    track3B1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackA0.CC,  trackA0, 2x<16>;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackA1,     trackA1, RZ;\n",
+
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.64 load0B, [track0B];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.64 load1B, [track1B];\n",
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.64 load2B, [track2B];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.64 load3B, [track3B];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.64 loadA,  [trackA];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0B0, load0B0.H0;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j6c13  => "10:-:-:-:1  \@P5 F2F.F32.F16 load2B3, load2B1.H1;\n",
+                j6c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2B2, load2B1.H0;\n",
+                j6c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2B1, load2B0.H1;\n",
+                j6c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load2B0, load2B0.H0;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B3, load3B1.H1;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B2, load3B1.H0;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B1, load3B0.H1;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3B0, load3B0.H0;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadA3, loadA1.H1;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA2, loadA1.H0;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA1, loadA0.H1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadA0, loadA0.H0;\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0B0, load0B0;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B1, load0B1;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B2, load0B2;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0B3, load0B3;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1B0, load1B0;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B1, load1B1;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B2, load1B2;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1B3, load1B3;\n",
+
+                j6c13  => "10:-:-:-:1  \@P5 F2F.F32.F16 load2B0, load2B0;\n",
+                j6c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2B1, load2B1;\n",
+                j6c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2B2, load2B2;\n",
+                j6c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load2B3, load2B3;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B0, load3B0;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B1, load3B1;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B2, load3B2;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3B3, load3B3;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadA3, loadA3;\n",
+                j10c17 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadA2, loadA2;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA1, loadA1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadA0, loadA0;\n",
+            )
+        ),
+
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="hgemm_common_32x128.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/hgemm_nn_32x64.sass b/Kernel/SGEMM/Maxwell/hgemm_nn_32x64.sass
new file mode 100644
index 0000000..56b813f
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_nn_32x64.sass
@@ -0,0 +1,913 @@
+# Kernel: hgemm_nn_32x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<32*33*2 + 64*32*2>
+    szShareA   : (32*33)
+    szShareB   : (64*32)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+      64-79 : j0Ay<0-7>, j0Bx<0-7>
+      80-95 : j1Ay<0-7>, j1Bx<0-7>
+
+      64-95 ~ lda, ldb, ldb8, tidAX, tidAY, tidBX, tidBY, tidAY<1-3>, tidBY<8|16|24>, tid1, tid32, tb, shiftAX, partialK, partialB, ldaz, ldbz, ta, txa, txb, txb<1-3>, xmad_ta, xmad_tb
+
+     96-119 :  load0A<0-7>,  load0B<0-3>,  load1B<0-3>,  load2B<0-3>,  load3B<0-3>
+    120-129 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+
+    130-137 ~ swapBuf, readAs, readBs, writeAs, writeBs, k, ldb32
+    138-144 ~ tid, blkA, blkB, blkZ, writeCs, preds
+
+       0-15 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+      96-99 : loadC<0-3>
+    100-103 : b<0-3>
+    104-107 : c<0-3>
+    108-109 : C<0-1>
+    110-137 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15, tid16
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb8,  ldb, 3;
+--:-:-:-:1      SHL ldb32, ldb, 6;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 3
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,   2;
+--:-:-:-:1      LOP.AND tidAY,   tid,   3;
+--:-:-:-:1      SHL     shiftAX, tidAY, 3;
+--:-:-:-:1      SHL     tidAY,   tidAY, 3;
+
+// tidBX   = (tid & 15) << 2
+// tidBY   = tid >> 4
+01:-:-:-:1      LOP.AND tidBX, tid,  15;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   4;
+
+--:-:-:-:1      IADD tidBY8,  tidBY, 8;
+--:-:-:-:1      IADD tidBY16, tidBY, 16;
+--:-:-:-:1      IADD tidBY24, tidBY, 24;
+
+// trackA += ((blkA*32 + tidAX) * lda + tidAY + blkZ*ldaz) * 2
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 5;
+--:-:-:-:1      XMAD.LO  ta, lda,  txa, tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ, ta;
+--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 1;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa, param_m, PT;
+
+// trackB += (blkB*64 + tidBX + ldb*tidBY + blkZ*ldbz) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 6;
+--:-:-:-:1      XMAD.LO2 tb, ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb, ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      track0B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb,  tb,  ldb8;
+--:-:-:-:1      LEA      track1B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb,  tb,  ldb8;
+--:-:-:-:1      LEA      track2B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track2B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb,  tb,  ldb8;
+--:-:-:-:1      LEA      track3B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track3B1,    tb, param_B[1], RZ, 1;
+
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb, param_n, PT;
+[+
+    our $vec;
+    return $vec ? '' : q{
+--:-:-:-:1      IADD txb1, txb, 1;
+--:-:-:-:1      IADD txb2, txb, 2;
+--:-:-:-:1      IADD txb3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb3, param_n, PT;
+    };
++]
+--:-:-:-:1      P2R preds, PR, RZ, 0x7c;
+
+// writeAs = (tidAY*32 + tidAX + shiftAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      SHL    writeAs, writeAs, 2;
+
+// writeBs = (tidBY*64 + tidBX) * 4 + 4x<szShareA>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA>, 2;
+
+// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,   16;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((tid >> 1) & 7) << 4
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 0x301 = 3 bits at position 1
+--:-:-:-:1      SHL     readBs, readBs, 4;
+
+// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5)
+// tid32 = tid & -32
+--:-:-:-:1      LOP.AND tid32, tid, -32;
+
+// Write out the 4 groups of 32 rows 16 at a time
+// writeCs = (readAs + tid32/2*4) * 64 + readBs
+--:-:-:-:1      ISCADD writeCs, tid32,   readAs, 1;
+--:-:-:-:1      ISCADD writeCs, writeCs, readBs, 6;
+
+// Each block of 32 threads works on 8 lines,
+// readAs is also shifted over by 8 for each group of 32 threads
+// readAs += tid32/4 * 32 * 4 + tid32/4 * 4
+// readBs += tid32/4 * 64 * 4 + 4x<szShareA>
+--:-:-:-:1      ISCADD readAs, tid32,  readAs, 5;
+--:-:-:-:1      ISCADD readBs, tid32,  readBs, 6;
+--:-:-:-:1      IADD   readAs, tid32,  readAs;
+--:-:-:-:1      IADD   readBs, readBs, 4x<szShareA>;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareA + szShareB>;
+
+// If k is not a multiple of 32 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 32 then make a full 32 line fetch.
+--:-:-:-:1      LOP.AND.Z P0, partialK, k, 31;
+--:-:-:-:1  @P0 MOV partialK, 32;
+--:-:-:-:1      IADD k, k, -partialK;
+[+
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY,   partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY8,  partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY16, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY24, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidBY,   partialK, P3;
+<ORDERED>
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.64  load0B, [track0B];
+--:-:4:-:1  @P4 LDG.E.CI.64  load1B, [track1B];
+--:-:5:-:1  @P5 LDG.E.CI.64  load2B, [track2B];
+--:-:6:-:1  @P6 LDG.E.CI.64  load3B, [track3B];
+</ORDERED>
+<ORDERED>
+--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.64  load0B, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.64  load1B, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.64  load2B, [addr_zero];
+--:-:1:-:1 @!P6 LDS.U.64  load3B, [addr_zero];
+</ORDERED>
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY,  partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY1, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY2, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY3, partialK, P2;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:2:-:1  @P6 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0A0, RZ;
+--:-:-:-:1 @!P4 MOV load0A1, RZ;
+--:-:-:-:1 @!P5 MOV load0A2, RZ;
+--:-:-:-:1 @!P6 MOV load0A3, RZ;
+
+--:-:-:-:1      IADD tidAY,  tidAY,  4;
+--:-:-:-:1      IADD tidAY1, tidAY1, 4;
+--:-:-:-:1      IADD tidAY2, tidAY2, 4;
+--:-:-:-:1      IADD tidAY3, tidAY3, 4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY,  partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY1, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY2, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY3, partialK, P2;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P6 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0A4, RZ;
+--:-:-:-:1 @!P4 MOV load0A5, RZ;
+--:-:-:-:1 @!P5 MOV load0A6, RZ;
+--:-:-:-:1 @!P6 MOV load0A7, RZ;
+
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, partialK, PT;
+--:-:-:-:1  @P0 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0B0, RZ;
+--:-:-:-:1 @!P4 MOV load0B1, RZ;
+--:-:-:-:1 @!P5 MOV load0B2, RZ;
+--:-:-:-:1 @!P6 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY8, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P1 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load1B0, RZ;
+--:-:-:-:1 @!P4 MOV load1B1, RZ;
+--:-:-:-:1 @!P5 MOV load1B2, RZ;
+--:-:-:-:1 @!P6 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY16, partialK, PT;
+--:-:-:-:1  @P2 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P2 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load2B0, RZ;
+--:-:-:-:1 @!P4 MOV load2B1, RZ;
+--:-:-:-:1 @!P5 MOV load2B2, RZ;
+--:-:-:-:1 @!P6 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY24, partialK, PT;
+--:-:-:-:1  @P0 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load3B0, RZ;
+--:-:-:-:1 @!P4 MOV load3B1, RZ;
+--:-:-:-:1 @!P5 MOV load3B2, RZ;
+--:-:-:-:1 @!P6 MOV load3B3, RZ;
+
+    };
++]
+// partialB = partialK * ldb
+--:-:-:-:1      XMAD.LO2 partialB, ldb, partialK, RZ;
+
+--:-:-:-:1      ISETP.GE.AND P1, PT, k, 32, PT; // P1 = (k >= 32)
+--:-:-:-:1      IADD k, k, -32;
+--:-:-:-:1  @P1 R2P PR, preds, 0x7c; // mask 0x7c = bits 2-6: reload P2-P6 from preds (saved by the P2R above)
+--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c; // otherwise clear P2-P6
+</SCHEDULE_BLOCK>
+
+[+
+    our $vec;
+    return $vec ? q{
+03:-:-:-:1      F2F.F32.F16 load0A7, load0A3.H1;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A3.H0;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A2.H1;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A2.H0;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0.H0;
+    } : q{
+02:-:-:-:1      F2F.F32.F16 load0A7, load0A7;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A6;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A5;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A4;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A3;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0;
+    };
++]
+--:-:-:-:0      LEA track0A0.CC, partialK, track0A0, 1;
+01:-:-:-:1      STS [writeAs + 4x<7*32>], load0A7;
+--:-:-:-:1      STS [writeAs + 4x<6*32>], load0A6;
+--:-:-:-:1      STS [writeAs + 4x<5*32>], load0A5;
+--:-:-:-:1      STS [writeAs + 4x<4*32>], load0A4;
+02:-:-:-:1      STS [writeAs + 4x<3*32>], load0A3;
+--:-:-:-:1      STS [writeAs + 4x<2*32>], load0A2;
+--:-:-:-:1      STS [writeAs + 4x<1*32>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<0*32>], load0A0;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+[+
+    our $vec;
+    return $vec ? q{
+04:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:3:-:1      F2F.F32.F16 load0B0, load0B0.H0;
+
+08:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:4:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+
+10:-:-:-:1      F2F.F32.F16 load2B3, load2B1.H1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B1.H0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B0.H1;
+--:-:5:-:1      F2F.F32.F16 load2B0, load2B0.H0;
+
+20:-:-:-:1      F2F.F32.F16 load3B3, load3B1.H1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B1.H0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B0.H1;
+--:-:6:-:1      F2F.F32.F16 load3B0, load3B0.H0;
+    } : q{
+04:-:-:-:1      F2F.F32.F16 load0B0, load0B0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:3:-:1      F2F.F32.F16 load0B3, load0B3;
+
+08:-:-:-:1      F2F.F32.F16 load1B0, load1B0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:4:-:1      F2F.F32.F16 load1B3, load1B3;
+
+10:-:-:-:1      F2F.F32.F16 load2B0, load2B0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B2;
+--:-:5:-:1      F2F.F32.F16 load2B3, load2B3;
+
+20:-:-:-:1      F2F.F32.F16 load3B0, load3B0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B2;
+--:-:6:-:1      F2F.F32.F16 load3B3, load3B3;
+    };
++]
+
+--:-:-:-:0      LEA track0B0.CC, partialB, track0B0, 1;
+04:-:-:-:6      STS.128 [writeBs + 4x<0*64>], load0B;
+--:-:-:-:1      IADD.X track0B1, track0B1, RZ;
+
+--:-:-:-:0      LEA track1B0.CC, partialB, track1B0, 1;
+08:-:-:-:6      STS.128 [writeBs + 4x<8*64>], load1B;
+--:-:-:-:1      IADD.X track1B1, track1B1, RZ;
+
+--:-:-:-:0      LEA track2B0.CC, partialB, track2B0, 1;
+10:-:-:-:6      STS.128 [writeBs + 4x<16*64>], load2B;
+--:-:-:-:1      IADD.X track2B1, track2B1, RZ;
+
+--:-:-:-:0      LEA track3B0.CC, partialB, track3B0, 1;
+20:-:-:-:6      STS.128 [writeBs + 4x<24*64>], load3B;
+--:-:-:-:0      IADD.X track3B1, track3B1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>];
+
+[+
+    our $vec;
+    return $vec ? q{
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.64  load0B, [track0B];
+--:-:4:-:1  @P3 LDG.E.CI.64  load1B, [track1B];
+--:-:5:-:1  @P3 LDG.E.CI.64  load2B, [track2B];
+--:-:6:-:1  @P3 LDG.E.CI.64  load3B, [track3B];
+    } : q{
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+    };
++]
+
+LOOP:
+
+[+
+    our $vec;
+    our %insert =
+    (
+        j0c8   => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, RZ, PT;\n",
+        j0c10  => "--:-:-:-:1      ISETP.GE.AND P1, PT, k, 32, PT;\n" .
+                  "--:-:-:-:1      IADD k, k, -32;\n",
+
+        j0c23  => "--:-:-:-:1  \@P1 R2P PR, preds, 0x7c;\n",
+        j0c24  => "--:-:-:-:1 \@!P1 R2P PR, RZ,    0x7c;\n",
+
+        j2c32  => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 2x<32>;\n",
+        j2c37  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j3c32  => "--:-:-:-:1  \@P3 IADD   track0B0.CC, track0B0, ldb32;\n",
+        j3c37  => "--:-:-:-:1  \@P3 IADD.X track0B1,    track0B1, RZ;\n",
+        j4c32  => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, ldb32;\n",
+        j4c37  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+        j5c32  => "--:-:-:-:1  \@P3 IADD   track2B0.CC, track2B0, ldb32;\n",
+        j5c37  => "--:-:-:-:1  \@P3 IADD.X track2B1,    track2B1, RZ;\n",
+        j6c32  => "--:-:-:-:1  \@P3 IADD   track3B0.CC, track3B0, ldb32;\n",
+        j6c37  => "--:-:-:-:1  \@P3 IADD.X track3B1,    track3B1, RZ;\n",
+
+        j6c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j2c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<7*32>], load0A7;\n",
+        j2c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*32>], load0A6;\n",
+        j2c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*32>], load0A5;\n",
+        j2c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*32>], load0A4;\n",
+        j2c24  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*32>], load0A3;\n",
+        j2c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], load0A2;\n",
+        j2c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], load0A1;\n",
+        j2c30  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<0*32>], load0A0;\n",
+
+        j3c16  => "04:3:-:-:1  \@P0 STS.128 [writeBs + 4x< 0*64>], load0B;\n",
+        j4c16  => "08:4:-:-:1  \@P0 STS.128 [writeBs + 4x< 8*64>], load1B;\n",
+        j5c16  => "10:5:-:-:1  \@P0 STS.128 [writeBs + 4x<16*64>], load2B;\n",
+        j6c16  => "20:6:-:-:1  \@P0 STS.128 [writeBs + 4x<24*64>], load3B;\n",
+
+        ($vec ?
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j1c63 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j2c51 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n",
+
+                j3c51 => "08:-:-:-:1  \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j4c51 => "10:-:-:-:1  \@P0 F2F.F32.F16 load2B3, load2B1.H1;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B0.H1;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n",
+
+                j5c51 => "20:-:-:-:1  \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n",
+
+                j2c61 => "02:-:2:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j3c61 => "04:-:3:-:1  \@P3 LDG.E.CI.64  load0B, [track0B];\n",
+                j4c61 => "08:-:4:-:1  \@P3 LDG.E.CI.64  load1B, [track1B];\n",
+                j5c61 => "10:-:5:-:1  \@P3 LDG.E.CI.64  load2B, [track2B];\n",
+                j6c61 => "20:-:6:-:1  \@P3 LDG.E.CI.64  load3B, [track3B];\n",
+            ) :
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A0, load0A0;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A1;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A2;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A3;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A4;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A5;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A6;\n",
+                j1c63 => "--:2:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A7;\n",
+
+                j2c51 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B0, load0B0;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B1;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B2;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load0B3, load0B3;\n",
+
+                j3c51 => "08:-:-:-:1  \@P0 F2F.F32.F16 load1B0, load1B0;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B1;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B2;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load1B3, load1B3;\n",
+
+                j4c51 => "10:-:-:-:1  \@P0 F2F.F32.F16 load2B0, load2B0;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B1;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B2;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load2B3, load2B3;\n",
+
+                j5c51 => "20:-:-:-:1  \@P0 F2F.F32.F16 load3B0, load3B0;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B1;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B2;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load3B3, load3B3;\n",
+
+                j2c48 => "02:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j2c50 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j2c52 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j2c54 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+                j2c56 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n",
+                j2c58 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n",
+                j2c60 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n",
+                j2c62 => "--:-:2:-:1  \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n",
+
+                j3c56 => "04:-:-:-:1  \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
+                j3c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j3c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j3c62 => "--:-:3:-:1  \@P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+
+                j4c56 => "08:-:-:-:1  \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j4c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j4c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j4c62 => "--:-:4:-:1  \@P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+
+                j5c56 => "10:-:-:-:1  \@P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n",
+                j5c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n",
+                j5c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n",
+                j5c62 => "--:-:5:-:1  \@P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n",
+
+                j6c56 => "20:-:-:-:1  \@P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n",
+                j6c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n",
+                j6c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n",
+                j6c62 => "--:-:6:-:1  \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n",
+            )
+        ),
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out = '';
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4
+--:-:-:-:1      LOP.AND tid15, tid,   15;
+--:-:-:-:1      SHR.U32 tid16, tid,    4;
+--:-:-:-:1      SHL     tid15, tid15,  2;
+--:-:-:-:1      ISCADD readCs, tid16,  tid15, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*64 + tid15 (tid15 already holds (tid & 15) * 4); cx1..cx3 = cx + 1..3
+--:-:-:-:1      ISCADD cx, blkB, tid15, 6;
+--:-:-:-:1      IADD   cx1, cx, 1;
+--:-:-:-:1      IADD   cx2, cx, 2;
+--:-:-:-:1      IADD   cx3, cx, 3;
+
+// cy = blkA*32 + tid16
+--:-:-:-:1      ISCADD cy, blkA, tid16, 5;
+
+// ci = cy*ldc + cx + blkZ*ldcz;  C = param_C + ci * 2;  ldc8 = ldc * 16 (the step STORE_C adds to C)
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      SHL  ldc8, ldc, 4;
+
+--:-:-:-:1      XMAD.LO  ci, cy, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C1,    ci, param_C[1], RZ, 1;
+
+// P0-P3 = cx, cx1, cx2, cx3 < n; P2R then saves them in preds (mask 0x0f)
+--:-:-:-:1      ISETP.LT.AND P0, PT, cx,  param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cx1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cx2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cx3, param_n, PT;
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+
+// P4 = cy < m
+--:-:-:-:1      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+// P5 = beta != 0 && P4
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P4;
+
+// P6 = Apply relu (flags & 2)
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+
+// Init beta preds: P0-P3 = preds when P5, otherwise all four are cleared
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:0      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:0      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_C;
+--:-:-:-:5      CAL STORE_C;
+
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha;
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:0      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:0      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_C;
+--:-:-:-:5      CAL STORE_C;
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Reached via CAL: loads C under the current P0-P3, sums four partials, applies beta/relu, stores fp16, then steps cy and C.
+[+
+    our $vec;
+    return $vec ? q{
+--:-:1:-:1  @P0 LDG.E.64 loadC, [C];
+    } : q{
+--:-:-:-:0 @!P0 MOV loadC0, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>];
+--:-:-:-:0 @!P1 MOV loadC1, RZ;
+--:-:-:-:1  @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>];
+--:-:-:-:0 @!P2 MOV loadC2, RZ;
+--:-:-:-:1  @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>];
+--:-:-:-:0 @!P3 MOV loadC3, RZ;
+--:-:1:-:1  @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>];
+    };
++]
+// The C loads above use P0-P3 as left by the "beta preds" R2P pair; the stores below use the output preds.
+// Restore output preds
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,    0x0f;
+// Fetch the four partial results part<0-3>C from readCs.
+--:-:-:-:1      LDS.U.128 part0C, [readCs + 4x< 0*64>];
+--:-:2:-:1      LDS.U.128 part1C, [readCs + 4x<16*64>];
+--:-:-:-:1      LDS.U.128 part2C, [readCs + 4x<32*64>];
+--:-:3:-:1      LDS.U.128 part3C, [readCs + 4x<48*64>];
+// c<0-3> = part0C + part1C + part2C + part3C, element by element under P0-P3.
+<SCHEDULE_BLOCK>
+02:-:-:-:1  @P0 FADD part0C0, part0C0, part1C0;
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part1C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part1C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part1C3;
+
+04:-:-:-:1  @P0 FADD part2C0, part2C0, part3C0;
+--:-:-:-:1  @P1 FADD part2C1, part2C1, part3C1;
+--:-:-:-:1  @P2 FADD part2C2, part2C2, part3C2;
+--:-:-:-:1  @P3 FADD part2C3, part2C3, part3C3;
+
+--:-:-:-:1  @P0 FADD c0, part0C0, part2C0;
+--:-:-:-:1  @P1 FADD c1, part0C1, part2C1;
+--:-:-:-:1  @P2 FADD c2, part0C2, part2C2;
+--:-:-:-:1  @P3 FADD c3, part0C3, part2C3;
+</SCHEDULE_BLOCK>
+// Advance cy by 8 so the P5/P4 compares further down describe the next call.
+--:-:-:-:0      IADD cy, cy, 8;
+// Convert the loaded C values to fp32 (only when P5).
+[+
+    our $vec;
+    return $vec ? q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0.H0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC0.H1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC1.H0;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC1.H1;
+    } : q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC2;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC3;
+    };
++]
+// c += b * beta (only when P5).
+01:-:-:-:1  @P5 FFMA c0, b0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, b1, beta, c1;
+04:-:-:-:1  @P5 FFMA c2, b2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, b3, beta, c3;
+// Apply relu when P6 is set.
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+// Refresh P5 and P4 against the advanced cy, interleaved with the fp32 -> fp16 conversions.
+--:-:-:-:0      ISETP.LT.AND P5, PT, cy, param_m, P5;
+
+--:-:1:-:1  @P0 F2F.F16.F32 c0, c0;
+--:-:2:-:1  @P1 F2F.F16.F32 c1, c1;
+
+--:-:-:-:0      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+--:-:3:-:1  @P2 F2F.F16.F32 c2, c2;
+// Toggle readCs by 4x<8*64> for the next call.
+--:-:-:-:0      LOP.XOR readCs, readCs, 4x<8*64>;
+
+--:-:4:-:1  @P3 F2F.F16.F32 c3, c3;
+
+[+
+    our $vec;
+    return $vec ? q{
+03:-:-:-:2  @P0 BFI c0, c1, 0x1010, c0;
+0c:-:-:-:2  @P0 BFI c1, c3, 0x1010, c2;
+
+--:1:-:-:1  @P0 STG.E.CG.64 [C], c;
+    } : q{
+01:-:-:-:1  @P0 STG.E.U16 [C + 2x<0>], c0;
+02:-:-:-:1  @P1 STG.E.U16 [C + 2x<1>], c1;
+04:-:-:-:1  @P2 STG.E.U16 [C + 2x<2>], c2;
+08:1:-:-:1  @P3 STG.E.U16 [C + 2x<3>], c3;
+    };
++]
+
+// Restore beta preds
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+// C += ldc8 (low word with carry into the high word).
+01:-:-:-:6      IADD   C0.CC, C0, ldc8;
+--:-:-:-:0      IADD.X C1,    C1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/hgemm_nt_128x128.sass b/Kernel/SGEMM/Maxwell/hgemm_nt_128x128.sass
new file mode 100644
index 0000000..29a50f0
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_nt_128x128.sass
@@ -0,0 +1,400 @@
+# Kernel: hgemm_nt_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+# Perl helpers for this kernel template; $int16 is declared here and assigned elsewhere (not visible in this file).
+our $int16;
+our $convert = 'F2F.F32.F16';
+$convert = 'I2F.F32.S16' if $int16;
+# Accessor for the input-conversion opcode selected above.
+sub convert_in { return $convert; }
+# Extra constant mappings, produced only when $int16 is true; otherwise the empty string.
+sub int16_params {
+    return "" unless $int16;
+    return q{
+param_Stats[0]  : c[0x0][0x190]
+param_Stats[1]  : c[0x0][0x194]
+param_scale     : c[0x0][0x198]
+    };
+}
+-]
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+    [+ int16_params() +]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-95   ~ tidX, blk, lda, ldb, ldaz, ldbz, tid1, tid7, tid128, tid127, txa, txb, xmad_ta, xmad_tb, k1, k2, k3
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-107  : loadA<0-5>, loadB<0-5>
+
+    108-111 : trackA<0-1>, trackB<0-1>
+
+    112-118 ~ writeS, k, tidY, ta, tb, loop
+    119-127 ~ readAs, readBs, tid, blkA, blkB, blkZ
+
+    64-75   ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128
+
+    64-79   : c<0-7>, d3, d2, d1, d0, cs<0-3>
+    64-65   : Stats<0-1>
+    80-89   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    90-118  ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,  param_k;
+--:-:-:-:1      LOP.AND tid1, tid,  1;
+--:-:-:-:1      LOP.AND tid128, tid,  128;
+--:-:-:-:1      MOV loop, RZ;
+// Write 16 zero bytes at addr_zero, then load them back 16 times to clear czero00..czero63.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);
++]
+
+--:-:-:-:1      MOV lda, param_lda;
+--:-:-:-:1      MOV ldb, param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// tidY = tid1 << 2
+--:-:-:-:1      SHL tidY, tid1, 2;
+
+// tidX = tid >> 1
+01:-:-:-:1      SHR.U32 tidX,  tid,   1;
+
+// txa = blkA*128 + tidX;  trackA = param_A + 2 * (txa * lda + tidY + blkZ*ldaz)
+02:-:-:-:1      ISCADD  txa, blkA, tidX, 7;
+--:-:-:-:1      XMAD.LO  ta, lda,  txa,  tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x1;
+
+// txb = blkB*128 + tidX;  trackB = param_B + 2 * (txb * ldb + tidY + blkZ*ldbz)
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 7;
+--:-:-:-:1      XMAD.LO  tb,  ldb,  txb,  tidY, xmad_tb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x1;
+// P5 = txa < m, P6 = txb < n
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeS = 4 * (128 * tidY + tidX) + 4x<128*8*2>
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      ISCADD  writeS, writeS, 4x<128*8*2>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4x<128*8>   (i.e. + 4096)
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+[+
+    our $vec;
+    return $vec ? q{
+// k must be multiple of 8
+--:-:-:-:0      PSETP.AND.AND P1, PT, PT, PT, PT;
+
+--:-:2:-:1  @P5 LDG.E.CI.64 loadA0, [trackA + 2x<0>];
+--:-:2:-:1  @P5 LDG.E.CI.64 loadA4, [trackA + 2x<8>];
+--:-:4:-:1  @P6 LDG.E.CI.64 loadB0, [trackB + 2x<0>];
+--:5:6:-:1  @P6 LDG.E.CI.64 loadB4, [trackB + 2x<8>];
+
+--:-:3:-:1 @!P5 LDS.U.64    loadA0, [addr_zero];
+--:-:3:-:1 @!P5 LDS.U.64    loadA4, [addr_zero];
+--:-:3:-:1 @!P6 LDS.U.64    loadB0, [addr_zero];
+--:-:3:-:2 @!P6 LDS.U.64    loadB4, [addr_zero];
+
+    // Vec 4 and scalar loads
+    } : q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD k1, tidY, 1;
+--:-:-:-:1      IADD k2, tidY, 2;
+--:-:-:-:1      IADD k3, tidY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:2:-:1  @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P6;
+
+--:-:4:-:1  @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];
+--:-:4:-:1  @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];
+--:-:4:-:1  @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:1      LOP.AND.NZ P4, RZ, k, 7;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 8, P4;
+</SCHEDULE_BLOCK>
+    };
++]
+
+[+
+    our $vec;
+    our $convert;
+    return $vec ? qq{
+
+06:-:1:-:4      $convert loadA3, loadA1.H1;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<16>;
+--:-:2:-:4      $convert loadA2, loadA1.H0;
+--:-:-:-:4      $convert loadA1, loadA0.H1;
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+--:-:3:-:1      $convert loadA0, loadA0.H0;
+
+01:-:-:-:1      STS [writeS + 4x<3*128>], loadA3;
+02:-:-:-:1      STS [writeS + 4x<2*128>], loadA2;
+04:-:-:-:1      STS [writeS + 4x<1*128>], loadA1;
+--:-:-:-:1      STS [writeS + 4x<0*128>], loadA0;
+
+08:-:1:-:4      $convert loadB3, loadB1.H1;
+10:-:-:-:0      IADD   trackB0.CC, trackB0, 2x<16>;
+--:-:2:-:4      $convert loadB2, loadB1.H0;
+--:-:3:-:4      $convert loadB1, loadB0.H1;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+--:-:4:-:1      $convert loadB0, loadB0.H0;
+
+01:-:-:-:1      STS [writeS + 4x<11*128>], loadB3;
+02:-:-:-:1      STS [writeS + 4x<10*128>], loadB2;
+04:-:-:-:1      STS [writeS + 4x< 9*128>], loadB1;
+08:-:-:-:1      STS [writeS + 4x< 8*128>], loadB0;
+
+    // scalar loads
+    } : qq{
+
+02:-:-:-:4      $convert loadA0, loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<8>;
+--:-:2:-:4      $convert loadA1, loadA1;
+--:-:-:-:4      $convert loadA2, loadA2;
+--:-:3:-:1      $convert loadA3, loadA3;
+
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+02:-:-:-:1      STS [writeS + 4x<0*128>], loadA0;
+--:-:-:-:1      STS [writeS + 4x<1*128>], loadA1;
+04:-:-:-:1      STS [writeS + 4x<2*128>], loadA2;
+--:-:-:-:1      STS [writeS + 4x<3*128>], loadA3;
+
+08:-:-:-:4      $convert loadB0, loadB0;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, 2x<8>;
+--:-:2:-:4      $convert loadB1, loadB1;
+--:-:-:-:4      $convert loadB2, loadB2;
+--:-:3:-:1      $convert loadB3, loadB3;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+02:-:-:-:1      STS [writeS + 4x< 8*128>], loadB0;
+--:-:-:-:1      STS [writeS + 4x< 9*128>], loadB1;
+04:-:-:-:1      STS [writeS + 4x<10*128>], loadB2;
+--:-:-:-:1      STS [writeS + 4x<11*128>], loadB3;
+    };
++]
+
+
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+# Defines @top and %insert (keys "j<J>c<C>") and emits nothing itself; presumably consumed by the included hgemm_common_128x128.sass - confirm.
+[+
+    our $vec;
+    our $convert;
+    our @top = $vec ?
+        ("--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n") :
+        ("--:-:-:-:1      ISETP.GE.AND P2, PT, k, 16, P5;\n");
+    our %insert =
+    (
+        ($vec ?
+            (
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P1, PT, !P1, PT, PT;\n",
+        j0c13 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P5;\n",
+        j0c15 => "--:-:-:-:1      PSETP.AND.AND P3, PT, P0, P1, P6;\n",
+        # Vector path: 64-bit global loads, A under P2 and B under P3.
+        j0c27 => "--:-:2:-:1  \@P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>];\n",
+        j0c29 => "--:-:2:-:1  \@P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>];\n",
+        j0c31 => "--:-:4:-:1  \@P3 LDG.E.CI.64 loadB0, [trackB + 2x<0>];\n",
+        j0c33 => "20:5:6:-:1  \@P3 LDG.E.CI.64 loadB4, [trackB + 2x<8>];\n",
+        # P1 clear: convert A from the second loaded pair (loadA4/loadA5).
+        j3c5  => "--:-:-:-:1 \@!P1 $convert loadA3, loadA5.H1;\n",
+        j3c9  => "--:-:-:-:1 \@!P1 $convert loadA2, loadA5.H0;\n",
+        j3c13 => "--:-:-:-:1 \@!P1 $convert loadA1, loadA4.H1;\n",
+        j3c17 => "--:-:-:-:1 \@!P1 $convert loadA0, loadA4.H0;\n",
+        # P1 clear: convert B from the second loaded pair (loadB4/loadB5).
+        j4c5  => "--:-:-:-:1 \@!P1 $convert loadB3, loadB5.H1;\n",
+        j4c9  => "--:-:-:-:1 \@!P1 $convert loadB2, loadB5.H0;\n",
+        j4c13 => "--:-:-:-:1 \@!P1 $convert loadB1, loadB4.H1;\n",
+        j4c17 => "--:-:-:-:1 \@!P1 $convert loadB0, loadB4.H0;\n",
+        # P1 set: convert A from the first loaded pair (loadA0/loadA1).
+        j5c5  => "02:-:-:-:1  \@P1 $convert loadA3, loadA1.H1;\n",
+        j5c9  => "--:-:2:-:1  \@P1 $convert loadA2, loadA1.H0;\n",
+        j5c13 => "--:-:-:-:1  \@P1 $convert loadA1, loadA0.H1;\n",
+        j5c17 => "--:-:3:-:1  \@P1 $convert loadA0, loadA0.H0;\n",
+        # Store the converted A values at writeS + 4x<0..3 * 128> (only when P0).
+        j5c29 => "02:-:-:-:1  \@P0 STS [writeS + 4x<3*128>], loadA3;\n",
+        j5c31 => "--:-:-:-:1  \@P0 STS [writeS + 4x<2*128>], loadA2;\n",
+        j5c33 => "04:-:-:-:1  \@P0 STS [writeS + 4x<1*128>], loadA1;\n",
+        j5c35 => "--:-:-:-:1  \@P0 STS [writeS + 4x<0*128>], loadA0;\n",
+        # P1 set: convert B from the first loaded pair (loadB0/loadB1).
+        j6c5  => "08:-:-:-:1  \@P1 $convert loadB3, loadB1.H1;\n",
+        j6c9  => "--:-:2:-:1  \@P1 $convert loadB2, loadB1.H0;\n",
+        j6c13 => "--:-:3:-:1  \@P1 $convert loadB1, loadB0.H1;\n",
+        j6c17 => "--:-:4:-:1  \@P1 $convert loadB0, loadB0.H0;\n",
+        # Store the converted B values at writeS + 4x<8..11 * 128> (only when P0).
+        j6c29 => "02:-:-:-:1  \@P0 STS [writeS + 4x<11*128>], loadB3;\n",
+        j6c31 => "--:-:-:-:1  \@P0 STS [writeS + 4x<10*128>], loadB2;\n",
+        j6c33 => "04:-:-:-:1  \@P0 STS [writeS + 4x< 9*128>], loadB1;\n",
+        j6c35 => "08:-:-:-:1  \@P0 STS [writeS + 4x< 8*128>], loadB0;\n",
+        # Advance the A pointer by 2x<16> when P2 (carry into trackA1).
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 2x<16>;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+        # Advance the B pointer by 2x<16> when P3 (carry into trackB1).
+        j6c46 => "10:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, 2x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+        # Branch back to LOOP while P0 holds.
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+            ) :
+            (
+        j0c7  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 16, P6;\n",
+        j0c8  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+        # Scalar path: four 16-bit global loads of A under P2.
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n",
+        # Four 16-bit global loads of B under P3.
+        j0c29 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n",
+        j0c31 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n",
+        j0c33 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n",
+        j0c35 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n",
+        # Convert the loaded A values in place (only when P2).
+        j5c5  => "02:-:2:-:1  \@P2 $convert loadA0, loadA0;\n",
+        j5c9  => "--:-:3:-:1  \@P2 $convert loadA1, loadA1;\n",
+        j5c13 => "--:-:4:-:1  \@P2 $convert loadA2, loadA2;\n",
+        j5c17 => "--:-:5:-:1  \@P2 $convert loadA3, loadA3;\n",
+        # Store the A values at writeS + 4x<0..3 * 128> (only when P0).
+        j5c29 => "02:-:-:-:1  \@P0 STS [writeS + 4x<0*128>], loadA0;\n",
+        j5c31 => "04:-:-:-:1  \@P0 STS [writeS + 4x<1*128>], loadA1;\n",
+        j5c33 => "08:-:-:-:1  \@P0 STS [writeS + 4x<2*128>], loadA2;\n",
+        j5c35 => "10:-:-:-:1  \@P0 STS [writeS + 4x<3*128>], loadA3;\n",
+        # Convert the loaded B values in place (only when P3).
+        j6c5  => "20:-:2:-:1  \@P3 $convert loadB0, loadB0;\n",
+        j6c9  => "--:-:3:-:1  \@P3 $convert loadB1, loadB1;\n",
+        j6c13 => "--:-:4:-:1  \@P3 $convert loadB2, loadB2;\n",
+        j6c17 => "--:-:5:-:1  \@P3 $convert loadB3, loadB3;\n",
+        # Store the B values at writeS + 4x<8..11 * 128> (only when P0).
+        j6c29 => "02:-:-:-:1  \@P0 STS [writeS + 4x< 8*128>], loadB0;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS [writeS + 4x< 9*128>], loadB1;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS [writeS + 4x<10*128>], loadB2;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS [writeS + 4x<11*128>], loadB3;\n",
+        # Advance the A pointer by 2x<8> when P2 (carry into trackA1).
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 2x<8>;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+        # Advance the B pointer by 2x<8> when P3 (carry into trackB1).
+        j6c46 => "10:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, 2x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+        # Branch back to LOOP while P0 holds; otherwise P1 selects a branch to REMAINDER.
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+            )
+        ),
+        # Shared by both paths: barrier and offset toggles by 4x<128*8*2> under P0, then k -= 8 unconditionally.
+        j6c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1      IADD32I k, k, -8;\n",
+    );
+    return;
++]
+
+<INCLUDE file="hgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/hgemm_nt_16x64.sass b/Kernel/SGEMM/Maxwell/hgemm_nt_16x64.sass
new file mode 100644
index 0000000..ce5e6ef
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_nt_16x64.sass
@@ -0,0 +1,1185 @@
+# Kernel: hgemm_nt_16x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<(16*64 + 32)*2 + (64*64 + 32)*2>
+    szShareA   : (16*64 + 32)
+    szShareB   : (64*64 + 32)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+      64-79 : j0Ay<0-7>, j0Bx<0-7>
+      80-95 : j1Ay<0-7>, j1Bx<0-7>
+
+      64-95 ~ tidX, tidY, tidY<1-3>, lda, ldb, ldaz, ldbz, ldb16, tid16_8, ta, txa, tb<00|16|32|48>, txb<00|16|32|48>, xmad_ta, xmad_tb, shiftX, predsY0, predsY4, partialK
+
+     96-135 :  load0A<0-7>,  load0B<0-7>,  load1B<0-7>,  load2B<0-7>,  load3B<0-7>
+    136-145 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+
+    146-152 ~ swapBuf, readAs, readBs, writeAs, writeBs, k
+    153-159 ~ tid, blkA, blkB, blkZ, writeCs, preds, tid16
+
+       0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3>
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+      96-99 : loadC<0-3>
+    100-103 : b<0-3>
+    104-107 : c<0-3>
+    108-109 : C<0-1>
+    110-152 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb16, ldb, 4;
+// Zero the accumulators: store RZ as a 128-bit word at addr_zero, then reload it with 16 LDS.U.128 into czero00, czero04, ... czero60 (registers 0-63, shared with cx<0-7>y<0-7>).
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// tidX   = tid >> 3
+// tidY   = (tid & 7) << 3
+// shiftX = (tid & 7) << 2
+01:-:-:-:1      SHR.U32 tidX, tid,  3;
+--:-:-:-:1      LOP.AND tidY, tid,  7;
+--:-:-:-:1      SHL     shiftX, tidY, 2;
+--:-:-:-:1      SHL     tidY,   tidY, 3;
+
+// track0A = param_A + ((blkA*16 + tidX) * lda + tidY + blkZ*ldaz) * 2, with txa = blkA*16 + tidX
+02:-:-:-:1      ISCADD   txa, blkA, tidX, 4;
+--:-:-:-:1      XMAD.LO  ta, lda,  txa, tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ, ta;
+--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 1;
+// P2 = (txa < param_m); it is saved in preds below and guards the load0A global loads.
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa, param_m, PT;
+
+// track<n>B = param_B + ((blkB*64 + tidX + 16*n) * ldb + tidY + blkZ*ldbz) * 2 for n = 0..3 (ldb16 = 16*ldb steps between them)
+04:-:-:-:1      ISCADD   txb00, blkB, tidX, 6;
+--:-:-:-:1      IADD     txb16, txb00, 16;
+--:-:-:-:1      IADD     txb32, txb00, 32;
+--:-:-:-:1      IADD     txb48, txb00, 48;
+--:-:-:-:1      XMAD.LO  tb00, ldb,  txb00, tidY, xmad_tb;
+08:-:-:-:1      XMAD.LO2 tb00, ldbz, blkZ, tb00;
+--:-:-:-:1      IADD     tb16, tb00, ldb16;
+--:-:-:-:1      IADD     tb32, tb16, ldb16;
+--:-:-:-:1      IADD     tb48, tb32, ldb16;
+--:-:-:-:1      LEA      track0B0.CC, tb00, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb00, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track1B0.CC, tb16, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb16, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track2B0.CC, tb32, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track2B1,    tb32, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track3B0.CC, tb48, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track3B1,    tb48, param_B[1], RZ, 1;
+
+// P3 = txb00 < param_n, P4 = txb16 < param_n, P5 = txb32 < param_n, P6 = txb48 < param_n (one flag per track<n>B pointer).
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb16, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb32, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb48, param_n, PT;
+// Pack P2..P6 (mask 0x7c) into preds so they can be restored with R2P after the predicate registers are reused.
+--:-:-:-:1      P2R preds, PR, RZ, 0x7c;
+
+// writeAs = (tidY*16 + tidX + shiftX) * 4
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 4;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftX;
+--:-:-:-:1      SHL    writeAs, writeAs, 2;
+
+// writeBs = (tidY*64 + tidX + shiftX) * 4 + 4x<szShareA>   (the B tile is placed after the A tile)
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 6;
+--:-:-:-:1      IADD   writeBs, writeBs, shiftX;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA>, 2;
+
+// readAs = (tid & 1) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((tid >> 1) & 7) << 4
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHL     readBs, readBs, 4;
+
+// tid16 = tid & -16
+// tid16_8 = tid16 / 2 * 4
+--:-:-:-:1      LOP.AND tid16, tid, -16;
+--:-:-:-:1      SHL     tid16_8, tid16, 1;
+
+// writeCs = (readAs + tid16*2) * 64 + readBs;
+--:-:-:-:1      ISCADD writeCs, tid16,   readAs, 1;
+--:-:-:-:1      ISCADD writeCs, writeCs, readBs, 6;
+
+// Each block of 16 threads works on 8 lines, shifted over by 4
+// readAs += tid16_8 * 16 + tid16
+// readBs += tid16_8 * 64 + tid16 + 4x<szShareA>
+--:-:-:-:1      ISCADD readAs, tid16_8, readAs, 4;
+--:-:-:-:1      ISCADD readBs, tid16_8, readBs, 6;
+--:-:-:-:1      IADD   readAs, tid16, readAs;
+--:-:-:-:1      IADD3  readBs, tid16, 4x<szShareA>, readBs;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareA + szShareB>;
+
+// If k is not a multiple of 64 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 64 then make a full 64 line fetch.
+--:-:-:-:1      LOP.AND.Z P0, partialK, k, 63;
+--:-:-:-:1  @P0 MOV partialK, 64;
+--:-:-:-:1      IADD k, k, -partialK;
+[+
+    our $vec;
+    return $vec ? q{
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x7c;
+--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c;
+
+<ORDERED>
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.128 load0B, [track0B];
+--:-:4:-:1  @P4 LDG.E.CI.128 load1B, [track1B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load2B, [track2B];
+--:-:6:-:1  @P6 LDG.E.CI.128 load3B, [track3B];
+</ORDERED>
+
+<ORDERED>
+--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.128 load0B, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128 load1B, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.128 load2B, [addr_zero];
+--:-:1:-:1 @!P6 LDS.U.128 load3B, [addr_zero];
+</ORDERED>
+
+    } : q{
+--:-:-:-:1      IADD tidY1, tidY, 1;
+--:-:-:-:1      IADD tidY2, tidY, 2;
+--:-:-:-:1      IADD tidY3, tidY, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, partialK, PT;
+--:-:-:-:1      P2R predsY0, PR, RZ, 0x0f;
+
+--:-:-:-:1      IADD tidY,  tidY,  4;
+--:-:-:-:1      IADD tidY1, tidY1, 4;
+--:-:-:-:1      IADD tidY2, tidY2, 4;
+--:-:-:-:1      IADD tidY3, tidY3, 4;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, partialK, PT;
+--:-:-:-:1      P2R predsY4, PR, RZ, 0x0f;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa, param_m, PT;
+--:-:-:-:1  @P4 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1  @P4 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A4, RZ;
+--:-:-:-:1 @!P1 MOV load0A5, RZ;
+--:-:-:-:1 @!P2 MOV load0A6, RZ;
+--:-:-:-:1 @!P3 MOV load0A7, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb00, param_n, PT;
+--:-:-:-:1  @P5 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1  @P5 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B4, RZ;
+--:-:-:-:1 @!P1 MOV load0B5, RZ;
+--:-:-:-:1 @!P2 MOV load0B6, RZ;
+--:-:-:-:1 @!P3 MOV load0B7, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb16, param_n, PT;
+--:-:-:-:1  @P6 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1  @P6 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B4, RZ;
+--:-:-:-:1 @!P1 MOV load1B5, RZ;
+--:-:-:-:1 @!P2 MOV load1B6, RZ;
+--:-:-:-:1 @!P3 MOV load1B7, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb32, param_n, PT;
+--:-:-:-:1  @P4 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B0, RZ;
+--:-:-:-:1 @!P1 MOV load2B1, RZ;
+--:-:-:-:1 @!P2 MOV load2B2, RZ;
+--:-:-:-:1 @!P3 MOV load2B3, RZ;
+
+--:-:-:-:1  @P4 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load2B4, [track2B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load2B5, [track2B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load2B6, [track2B + 2x<6>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 load2B7, [track2B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B4, RZ;
+--:-:-:-:1 @!P1 MOV load2B5, RZ;
+--:-:-:-:1 @!P2 MOV load2B6, RZ;
+--:-:-:-:1 @!P3 MOV load2B7, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb48, param_n, PT;
+--:-:-:-:1  @P6 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:6:-:1  @P3 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B0, RZ;
+--:-:-:-:1 @!P1 MOV load3B1, RZ;
+--:-:-:-:1 @!P2 MOV load3B2, RZ;
+--:-:-:-:1 @!P3 MOV load3B3, RZ;
+
+--:-:-:-:1  @P6 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load3B4, [track3B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load3B5, [track3B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load3B6, [track3B + 2x<6>];
+--:-:6:-:1  @P3 LDG.E.CI.U16 load3B7, [track3B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B4, RZ;
+--:-:-:-:1 @!P1 MOV load3B5, RZ;
+--:-:-:-:1 @!P2 MOV load3B6, RZ;
+--:-:-:-:1 @!P3 MOV load3B7, RZ;
+    };
++]
+--:-:-:-:1      SHL partialK, partialK, 1;
+
+--:-:-:-:1      ISETP.GE.AND P0, PT, k, 64, PT;
+--:-:-:-:1      IADD k, k, -64;
+--:-:-:-:1  @P0 R2P PR, preds, 0x7c;
+--:-:-:-:1 @!P0 R2P PR, RZ, 0x7c;
+</SCHEDULE_BLOCK>
+// Prologue, A tile: widen the fp16 values in load0A to fp32, store them at writeAs, and advance track0A by partialK. The vector path converts load0A7 down to load0A0 so each packed half (.H1/.H0 of load0A3..0) is read before its register is overwritten. The load<n>B groups that follow use the same pattern with writeBs.
+[+
+    our $vec;
+    return $vec ? q{
+03:-:-:-:1      F2F.F32.F16 load0A7, load0A3.H1;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A3.H0;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A2.H1;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A2.H0;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0.H0;
+    } : q{
+02:-:-:-:1      F2F.F32.F16 load0A7, load0A7;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A6;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A5;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A4;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A3;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0;
+    };
++]
+--:-:-:-:0      IADD   track0A0.CC, track0A0, partialK;
+01:-:-:-:1      STS [writeAs + 4x<7*16>], load0A7;
+--:-:-:-:1      STS [writeAs + 4x<6*16>], load0A6;
+--:-:-:-:1      STS [writeAs + 4x<5*16>], load0A5;
+--:-:-:-:1      STS [writeAs + 4x<4*16>], load0A4;
+02:-:-:-:1      STS [writeAs + 4x<3*16>], load0A3;
+--:-:-:-:1      STS [writeAs + 4x<2*16>], load0A2;
+--:-:-:-:1      STS [writeAs + 4x<1*16>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<0*16>], load0A0;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+[+
+    our $vec;
+    return $vec ? q{
+04:-:-:-:1      F2F.F32.F16 load0B7, load0B3.H1;
+--:-:-:-:1      F2F.F32.F16 load0B6, load0B3.H0;
+--:-:-:-:1      F2F.F32.F16 load0B5, load0B2.H1;
+--:-:1:-:1      F2F.F32.F16 load0B4, load0B2.H0;
+--:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:2:-:1      F2F.F32.F16 load0B0, load0B0.H0;
+    } : q{
+04:-:-:-:1      F2F.F32.F16 load0B7, load0B7;
+--:-:-:-:1      F2F.F32.F16 load0B6, load0B6;
+--:-:-:-:1      F2F.F32.F16 load0B5, load0B5;
+--:-:1:-:1      F2F.F32.F16 load0B4, load0B4;
+--:-:-:-:1      F2F.F32.F16 load0B3, load0B3;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:2:-:1      F2F.F32.F16 load0B0, load0B0;
+    };
++]
+--:-:-:-:0      IADD   track0B0.CC, track0B0, partialK;
+01:-:-:-:1      STS [writeBs + 4x<7*64 + 0*16>], load0B7;
+--:-:-:-:1      STS [writeBs + 4x<6*64 + 0*16>], load0B6;
+--:-:-:-:1      STS [writeBs + 4x<5*64 + 0*16>], load0B5;
+--:-:-:-:1      STS [writeBs + 4x<4*64 + 0*16>], load0B4;
+02:-:-:-:1      STS [writeBs + 4x<3*64 + 0*16>], load0B3;
+--:-:-:-:1      STS [writeBs + 4x<2*64 + 0*16>], load0B2;
+--:-:-:-:1      STS [writeBs + 4x<1*64 + 0*16>], load0B1;
+--:-:-:-:1      STS [writeBs + 4x<0*64 + 0*16>], load0B0;
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ;
+
+[+
+    our $vec;
+    return $vec ? q{
+08:-:-:-:1      F2F.F32.F16 load1B7, load1B3.H1;
+--:-:-:-:1      F2F.F32.F16 load1B6, load1B3.H0;
+--:-:-:-:1      F2F.F32.F16 load1B5, load1B2.H1;
+--:-:1:-:1      F2F.F32.F16 load1B4, load1B2.H0;
+--:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+    } : q{
+08:-:-:-:1      F2F.F32.F16 load1B7, load1B7;
+--:-:-:-:1      F2F.F32.F16 load1B6, load1B6;
+--:-:-:-:1      F2F.F32.F16 load1B5, load1B5;
+--:-:1:-:1      F2F.F32.F16 load1B4, load1B4;
+--:-:-:-:1      F2F.F32.F16 load1B3, load1B3;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0;
+    };
++]
+--:-:-:-:0      IADD   track1B0.CC, track1B0, partialK;
+01:-:-:-:1      STS [writeBs + 4x<7*64 + 1*16>], load1B7;
+--:-:-:-:1      STS [writeBs + 4x<6*64 + 1*16>], load1B6;
+--:-:-:-:1      STS [writeBs + 4x<5*64 + 1*16>], load1B5;
+--:-:-:-:1      STS [writeBs + 4x<4*64 + 1*16>], load1B4;
+02:-:-:-:1      STS [writeBs + 4x<3*64 + 1*16>], load1B3;
+--:-:-:-:1      STS [writeBs + 4x<2*64 + 1*16>], load1B2;
+--:-:-:-:1      STS [writeBs + 4x<1*64 + 1*16>], load1B1;
+--:-:-:-:1      STS [writeBs + 4x<0*64 + 1*16>], load1B0;
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;
+
+[+
+    our $vec;
+    return $vec ? q{
+10:-:-:-:1      F2F.F32.F16 load2B7, load2B3.H1;
+--:-:-:-:1      F2F.F32.F16 load2B6, load2B3.H0;
+--:-:-:-:1      F2F.F32.F16 load2B5, load2B2.H1;
+--:-:1:-:1      F2F.F32.F16 load2B4, load2B2.H0;
+--:-:-:-:1      F2F.F32.F16 load2B3, load2B1.H1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B1.H0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B0.H1;
+--:-:2:-:1      F2F.F32.F16 load2B0, load2B0.H0;
+    } : q{
+10:-:-:-:1      F2F.F32.F16 load2B7, load2B7;
+--:-:-:-:1      F2F.F32.F16 load2B6, load2B6;
+--:-:-:-:1      F2F.F32.F16 load2B5, load2B5;
+--:-:1:-:1      F2F.F32.F16 load2B4, load2B4;
+--:-:-:-:1      F2F.F32.F16 load2B3, load2B3;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B2;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B1;
+--:-:2:-:1      F2F.F32.F16 load2B0, load2B0;
+    };
++]
+--:-:-:-:0      IADD   track2B0.CC, track2B0, partialK;
+01:-:-:-:1      STS [writeBs + 4x<7*64 + 2*16>], load2B7;
+--:-:-:-:1      STS [writeBs + 4x<6*64 + 2*16>], load2B6;
+--:-:-:-:1      STS [writeBs + 4x<5*64 + 2*16>], load2B5;
+--:-:-:-:1      STS [writeBs + 4x<4*64 + 2*16>], load2B4;
+02:-:-:-:1      STS [writeBs + 4x<3*64 + 2*16>], load2B3;
+--:-:-:-:1      STS [writeBs + 4x<2*64 + 2*16>], load2B2;
+--:-:-:-:1      STS [writeBs + 4x<1*64 + 2*16>], load2B1;
+--:-:-:-:1      STS [writeBs + 4x<0*64 + 2*16>], load2B0;
+--:-:-:-:0      IADD.X track2B1,    track2B1, RZ;
+
+[+
+    our $vec;
+    return $vec ? q{
+20:-:-:-:1      F2F.F32.F16 load3B7, load3B3.H1;
+--:-:-:-:1      F2F.F32.F16 load3B6, load3B3.H0;
+--:-:-:-:1      F2F.F32.F16 load3B5, load3B2.H1;
+--:-:1:-:1      F2F.F32.F16 load3B4, load3B2.H0;
+--:-:-:-:1      F2F.F32.F16 load3B3, load3B1.H1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B1.H0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B0.H1;
+--:-:2:-:1      F2F.F32.F16 load3B0, load3B0.H0;
+    } : q{
+20:-:-:-:1      F2F.F32.F16 load3B7, load3B7;
+--:-:-:-:1      F2F.F32.F16 load3B6, load3B6;
+--:-:-:-:1      F2F.F32.F16 load3B5, load3B5;
+--:-:1:-:1      F2F.F32.F16 load3B4, load3B4;
+--:-:-:-:1      F2F.F32.F16 load3B3, load3B3;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B2;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B1;
+--:-:2:-:1      F2F.F32.F16 load3B0, load3B0;
+    };
++]
+--:-:-:-:0      IADD   track3B0.CC, track3B0, partialK;
+01:-:-:-:1      STS [writeBs + 4x<7*64 + 3*16>], load3B7;
+--:-:-:-:1      STS [writeBs + 4x<6*64 + 3*16>], load3B6;
+--:-:-:-:1      STS [writeBs + 4x<5*64 + 3*16>], load3B5;
+--:-:-:-:1      STS [writeBs + 4x<4*64 + 3*16>], load3B4;
+02:-:-:-:1      STS [writeBs + 4x<3*64 + 3*16>], load3B3;
+--:-:-:-:1      STS [writeBs + 4x<2*64 + 3*16>], load3B2;
+--:-:-:-:1      STS [writeBs + 4x<1*64 + 3*16>], load3B1;
+--:-:-:-:1      STS [writeBs + 4x<0*64 + 3*16>], load3B0;
+--:-:-:-:0      IADD.X track3B1,    track3B1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*16 + 00>];
+--:-:-:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*16 + 08>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>];
+
+[+
+    our $vec;
+    return $vec ? q{
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.128 load0B, [track0B];
+--:-:4:-:1  @P4 LDG.E.CI.128 load1B, [track1B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load2B, [track2B];
+--:-:6:-:1  @P6 LDG.E.CI.128 load3B, [track3B];
+    } : q{
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];
+
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];
+--:-:4:-:1  @P4 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];
+
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B4, [track2B + 2x<4>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B5, [track2B + 2x<5>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B6, [track2B + 2x<6>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2B7, [track2B + 2x<7>];
+
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B4, [track3B + 2x<4>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B5, [track3B + 2x<5>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B6, [track3B + 2x<6>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load3B7, [track3B + 2x<7>];
+    };
++]
+
+LOOP:
+
+[+
+    our $vec;
+    our %insert =
+    (
+        j0c8   => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, RZ, PT;\n",
+        j0c10  => "--:-:-:-:1      ISETP.GE.AND P1, PT, k, 64, PT;\n" .
+                  "--:-:-:-:1      IADD k, k, -64;\n",
+
+        j0c23  => "--:-:-:-:1  \@P1 R2P PR, preds, 0x7c;\n",
+        j0c24  => "--:-:-:-:1 \@!P1 R2P PR, RZ,    0x7c;\n",
+
+        j2c32  => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 2x<64>;\n",
+        j2c37  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j3c32  => "--:-:-:-:1  \@P3 IADD   track0B0.CC, track0B0, 2x<64>;\n",
+        j3c37  => "--:-:-:-:1  \@P3 IADD.X track0B1,    track0B1, RZ;\n",
+        j4c32  => "--:-:-:-:1  \@P4 IADD   track1B0.CC, track1B0, 2x<64>;\n",
+        j4c37  => "--:-:-:-:1  \@P4 IADD.X track1B1,    track1B1, RZ;\n",
+        j5c32  => "--:-:-:-:1  \@P5 IADD   track2B0.CC, track2B0, 2x<64>;\n",
+        j5c37  => "--:-:-:-:1  \@P5 IADD.X track2B1,    track2B1, RZ;\n",
+        j6c32  => "--:-:-:-:1  \@P6 IADD   track3B0.CC, track3B0, 2x<64>;\n",
+        j6c37  => "--:-:-:-:1  \@P6 IADD.X track3B1,    track3B1, RZ;\n",
+
+        j6c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j1c63 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j2c36 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B7, load0B3.H1;\n",
+                j2c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B6, load0B3.H0;\n",
+                j2c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B5, load0B2.H1;\n",
+                j2c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B4, load0B2.H0;\n",
+                j2c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n",
+
+                j3c36 => "08:-:-:-:1  \@P0 F2F.F32.F16 load1B7, load1B3.H1;\n",
+                j3c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B6, load1B3.H0;\n",
+                j3c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B5, load1B2.H1;\n",
+                j3c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B4, load1B2.H0;\n",
+                j3c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j4c36 => "10:-:-:-:1  \@P0 F2F.F32.F16 load2B7, load2B3.H1;\n",
+                j4c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B6, load2B3.H0;\n",
+                j4c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B5, load2B2.H1;\n",
+                j4c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B4, load2B2.H0;\n",
+                j4c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B3, load2B1.H1;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B0.H1;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n",
+
+                j5c36 => "20:-:-:-:1  \@P0 F2F.F32.F16 load3B7, load3B3.H1;\n",
+                j5c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B6, load3B3.H0;\n",
+                j5c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B5, load3B2.H1;\n",
+                j5c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B4, load3B2.H0;\n",
+                j5c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n",
+
+                j2c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<7*16>], load0A7;\n",
+                j2c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*16>], load0A6;\n",
+                j2c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*16>], load0A5;\n",
+                j2c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*16>], load0A4;\n",
+                j2c24  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*16>], load0A3;\n",
+                j2c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*16>], load0A2;\n",
+                j2c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*16>], load0A1;\n",
+                j2c30  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<0*16>], load0A0;\n",
+
+                j3c16  => "04:-:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 0*16>], load0B7;\n",
+                j3c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 0*16>], load0B6;\n",
+                j3c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 0*16>], load0B5;\n",
+                j3c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 0*16>], load0B4;\n",
+                j3c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 0*16>], load0B3;\n",
+                j3c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 0*16>], load0B2;\n",
+                j3c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 0*16>], load0B1;\n",
+                j3c30  => "--:3:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 0*16>], load0B0;\n",
+
+                j4c16  => "08:-:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 1*16>], load1B7;\n",
+                j4c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 1*16>], load1B6;\n",
+                j4c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 1*16>], load1B5;\n",
+                j4c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 1*16>], load1B4;\n",
+                j4c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 1*16>], load1B3;\n",
+                j4c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 1*16>], load1B2;\n",
+                j4c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 1*16>], load1B1;\n",
+                j4c30  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 1*16>], load1B0;\n",
+
+                j5c16  => "10:-:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 2*16>], load2B7;\n",
+                j5c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 2*16>], load2B6;\n",
+                j5c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 2*16>], load2B5;\n",
+                j5c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 2*16>], load2B4;\n",
+                j5c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 2*16>], load2B3;\n",
+                j5c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 2*16>], load2B2;\n",
+                j5c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 2*16>], load2B1;\n",
+                j5c30  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 2*16>], load2B0;\n",
+
+                j6c16  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 3*16>], load3B7;\n",
+                j6c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 3*16>], load3B6;\n",
+                j6c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 3*16>], load3B5;\n",
+                j6c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 3*16>], load3B4;\n",
+                j6c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 3*16>], load3B3;\n",
+                j6c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 3*16>], load3B2;\n",
+                j6c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 3*16>], load3B1;\n",
+                j6c30  => "--:6:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 3*16>], load3B0;\n",
+
+                j2c61 => "02:-:2:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j3c61 => "04:-:3:-:1  \@P3 LDG.E.CI.128 load0B, [track0B];\n",
+                j4c61 => "08:-:4:-:1  \@P4 LDG.E.CI.128 load1B, [track1B];\n",
+                j5c61 => "10:-:5:-:1  \@P5 LDG.E.CI.128 load2B, [track2B];\n",
+                j6c61 => "20:-:6:-:1  \@P6 LDG.E.CI.128 load3B, [track3B];\n",
+            ) :
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A0, load0A0;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A1;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A2;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A3;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A4;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A5;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A6;\n",
+                j1c63 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A7, load0A7;\n",
+
+                j2c36 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B0, load0B0;\n",
+                j2c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B1;\n",
+                j2c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B2;\n",
+                j2c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B3;\n",
+                j2c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B4, load0B4;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B5, load0B5;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B6, load0B6;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load0B7, load0B7;\n",
+
+                j3c36 => "08:-:-:-:1  \@P0 F2F.F32.F16 load1B0, load1B0;\n",
+                j3c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B1;\n",
+                j3c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B2;\n",
+                j3c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B3, load1B3;\n",
+                j3c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B4, load1B4;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B5, load1B5;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B6, load1B6;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load1B7, load1B7;\n",
+
+                j4c36 => "10:-:-:-:1  \@P0 F2F.F32.F16 load2B0, load2B0;\n",
+                j4c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B1;\n",
+                j4c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B2;\n",
+                j4c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B3, load2B3;\n",
+                j4c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B4, load2B4;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B5, load2B5;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B6, load2B6;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load2B7, load2B7;\n",
+
+                j5c36 => "20:-:-:-:1  \@P0 F2F.F32.F16 load3B0, load3B0;\n",
+                j5c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B1;\n",
+                j5c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B2;\n",
+                j5c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B3, load3B3;\n",
+                j5c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B4, load3B4;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B5, load3B5;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B6, load3B6;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load3B7, load3B7;\n",
+
+                j2c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<0*16>], load0A0;\n",
+                j2c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*16>], load0A1;\n",
+                j2c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*16>], load0A2;\n",
+                j2c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*16>], load0A3;\n",
+                j2c24  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*16>], load0A4;\n",
+                j2c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*16>], load0A5;\n",
+                j2c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*16>], load0A6;\n",
+                j2c30  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<7*16>], load0A7;\n",
+
+                j3c16  => "04:-:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 0*16>], load0B0;\n",
+                j3c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 0*16>], load0B1;\n",
+                j3c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 0*16>], load0B2;\n",
+                j3c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 0*16>], load0B3;\n",
+                j3c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 0*16>], load0B4;\n",
+                j3c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 0*16>], load0B5;\n",
+                j3c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 0*16>], load0B6;\n",
+                j3c30  => "--:3:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 0*16>], load0B7;\n",
+
+                j4c16  => "08:-:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 1*16>], load1B0;\n",
+                j4c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 1*16>], load1B1;\n",
+                j4c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 1*16>], load1B2;\n",
+                j4c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 1*16>], load1B3;\n",
+                j4c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 1*16>], load1B4;\n",
+                j4c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 1*16>], load1B5;\n",
+                j4c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 1*16>], load1B6;\n",
+                j4c30  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 1*16>], load1B7;\n",
+
+                j5c16  => "10:-:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 2*16>], load2B0;\n",
+                j5c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 2*16>], load2B1;\n",
+                j5c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 2*16>], load2B2;\n",
+                j5c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 2*16>], load2B3;\n",
+                j5c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 2*16>], load2B4;\n",
+                j5c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 2*16>], load2B5;\n",
+                j5c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 2*16>], load2B6;\n",
+                j5c30  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 2*16>], load2B7;\n",
+
+                j6c16  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 3*16>], load3B0;\n",
+                j6c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 3*16>], load3B1;\n",
+                j6c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 3*16>], load3B2;\n",
+                j6c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 3*16>], load3B3;\n",
+                j6c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 3*16>], load3B4;\n",
+                j6c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 3*16>], load3B5;\n",
+                j6c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 3*16>], load3B6;\n",
+                j6c30  => "--:6:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 3*16>], load3B7;\n",
+
+                j2c48 => "02:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j2c50 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j2c52 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j2c54 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+                j2c56 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n",
+                j2c58 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n",
+                j2c60 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n",
+                j2c62 => "--:-:2:-:1  \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n",
+
+                j3c48 => "04:-:-:-:1  \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
+                j3c50 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j3c52 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j3c54 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+                j3c56 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];\n",
+                j3c58 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];\n",
+                j3c60 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];\n",
+                j3c62 => "--:-:3:-:1  \@P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];\n",
+
+                j4c48 => "08:-:-:-:1  \@P4 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j4c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j4c52 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j4c54 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+                j4c56 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];\n",
+                j4c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];\n",
+                j4c60 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];\n",
+                j4c62 => "--:-:4:-:1  \@P4 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];\n",
+
+                j5c48 => "10:-:-:-:1  \@P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n",
+                j5c50 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n",
+                j5c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n",
+                j5c54 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n",
+                j5c56 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B4, [track2B + 2x<4>];\n",
+                j5c58 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B5, [track2B + 2x<5>];\n",
+                j5c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B6, [track2B + 2x<6>];\n",
+                j5c62 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load2B7, [track2B + 2x<7>];\n",
+
+                j6c48 => "20:-:-:-:1  \@P6 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n",
+                j6c50 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n",
+                j6c52 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n",
+                j6c54 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n",
+                j6c56 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B4, [track3B + 2x<4>];\n",
+                j6c58 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B5, [track3B + 2x<5>];\n",
+                j6c60 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B6, [track3B + 2x<6>];\n",
+                j6c62 => "--:-:6:-:1  \@P6 LDG.E.CI.U16 load3B7, [track3B + 2x<7>];\n",
+            )
+        ),
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out = '';
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*16 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*16 + 08>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha, param_alpha; // copy alpha, beta and flags from the param_* constants into registers
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4
+--:-:-:-:1      LOP.AND tid15, tid, 15;
+--:-:-:-:1      SHR.U32 tid16, tid, 4;
+--:-:-:-:1      SHL     tid15, tid15, 2; // tid15 now holds (tid & 15) * 4 and is reused for cx below
+--:-:-:-:1      ISCADD readCs, tid16, tid15, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*64 + tid15, where tid15 is already (tid & 15) * 4; cx1..cx3 = cx + 1..3
+--:-:-:-:1      ISCADD cx, blkB, tid15, 6;
+--:-:-:-:1      IADD   cx1, cx, 1;
+--:-:-:-:1      IADD   cx2, cx, 2;
+--:-:-:-:1      IADD   cx3, cx, 3;
+
+// cy = blkA*16 + tid16
+--:-:-:-:1      ISCADD cy, blkA, tid16, 4;
+
+// C = param_C + (cy*ldc + cx + ldcz*blkZ) * 2; ldc8 = ldc * 16 is the step STORE_C later adds to C
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      SHL  ldc8, ldc, 4;
+
+--:-:-:-:1      XMAD.LO  ci, cy, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C1,    ci, param_C[1], RZ, 1;
+
+// P0..P3 = (cx, cx1, cx2, cx3) < n; P2R then saves these four predicate bits in preds
+--:-:-:-:1      ISETP.LT.AND P0, PT, cx,  param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cx1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cx2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cx3, param_n, PT;
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+
+// P4 = cy < m
+--:-:-:-:1      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+// P5 = beta != 0 && P4
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P4;
+
+// P6 = Apply relu, i.e. (flags & 2) != 0
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+
+// Init beta preds: P0-P3 = the saved bits in preds when P5 holds, otherwise all cleared from RZ
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha; // first half: scale the y0-y3 accumulators by alpha into shuffle_* registers
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0; // 128-bit shared-memory stores of the scaled values, interleaved with the next FMULs
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:0      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:0      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3;
+--:-:-:-:5      BAR.SYNC 0; // barrier 0 before STORE_C reads the values back through readCs
+
+--:-:-:-:5      CAL STORE_C; // first store pass
+
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha; // second half: same sequence for the y4-y7 accumulators
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:0      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:5      BAR.SYNC 0; // NOTE(review): presumably ensures every thread finished the STORE_C shared reads before writeCs is overwritten - confirm
+--:-:-:-:0      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:0      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7;
+--:-:-:-:5      BAR.SYNC 0; // barrier 0 before the second STORE_C pass reads through readCs
+
+--:-:-:-:5      CAL STORE_C; // second store pass (the first pass advanced cy by 8 and C by ldc8)
+
+--:-:-:-:5      EXIT; // end of the kernel body, STORE_C below is reached only via CAL
+
+STORE_C:
+// Subroutine (entered via CAL twice, left via RET): optionally loads C, sums the eight partials read at readCs, applies beta and relu, converts to fp16, stores up to four columns, then advances cy and C.
+[+
+    our $vec;
+    return $vec ? q{
+--:-:1:-:1  @P0 LDG.E.64 loadC, [C];
+    } : q{
+--:-:-:-:0 @!P0 MOV loadC0, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>];
+--:-:-:-:0 @!P1 MOV loadC1, RZ;
+--:-:-:-:1  @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>];
+--:-:-:-:0 @!P2 MOV loadC2, RZ;
+--:-:-:-:1  @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>];
+--:-:-:-:0 @!P3 MOV loadC3, RZ;
+--:-:1:-:1  @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>];
+    };
++]
+
+// Restore output preds: P0-P3 = the saved bits in preds when P4 (cy < m) holds, otherwise all cleared
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,    0x0f;
+
+--:-:-:-:1      LDS.U.128 part0C, [readCs + 4x<0*8*64>]; // eight 128-bit shared loads, each 8*64*4 bytes after the previous one
+--:-:2:-:1      LDS.U.128 part1C, [readCs + 4x<1*8*64>];
+--:-:-:-:1      LDS.U.128 part2C, [readCs + 4x<2*8*64>];
+--:-:3:-:1      LDS.U.128 part3C, [readCs + 4x<3*8*64>];
+--:-:-:-:1      LDS.U.128 part4C, [readCs + 4x<4*8*64>];
+--:-:4:-:1      LDS.U.128 part5C, [readCs + 4x<5*8*64>];
+--:-:-:-:1      LDS.U.128 part6C, [readCs + 4x<6*8*64>];
+--:-:5:-:1      LDS.U.128 part7C, [readCs + 4x<7*8*64>];
+
+<SCHEDULE_BLOCK>
+02:-:-:-:1  @P0 FADD part0C0, part0C0, part1C0; // pairwise sums: (0+1), (2+3), (4+5), (6+7), then (0+2), (4+6), then c = 0+4
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part1C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part1C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part1C3;
+
+04:-:-:-:1  @P0 FADD part2C0, part2C0, part3C0;
+--:-:-:-:1  @P1 FADD part2C1, part2C1, part3C1;
+--:-:-:-:1  @P2 FADD part2C2, part2C2, part3C2;
+--:-:-:-:1  @P3 FADD part2C3, part2C3, part3C3;
+
+08:-:-:-:1  @P0 FADD part4C0, part4C0, part5C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part5C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part5C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part5C3;
+
+10:-:-:-:1  @P0 FADD part6C0, part6C0, part7C0;
+--:-:-:-:1  @P1 FADD part6C1, part6C1, part7C1;
+--:-:-:-:1  @P2 FADD part6C2, part6C2, part7C2;
+--:-:-:-:1  @P3 FADD part6C3, part6C3, part7C3;
+
+--:-:-:-:1  @P0 FADD part0C0, part0C0, part2C0;
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part2C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part2C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part2C3;
+
+--:-:-:-:1  @P0 FADD part4C0, part4C0, part6C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part6C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part6C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part6C3;
+
+--:-:-:-:1  @P0 FADD c0, part0C0, part4C0;
+--:-:-:-:1  @P1 FADD c1, part0C1, part4C1;
+--:-:-:-:1  @P2 FADD c2, part0C2, part4C2;
+--:-:-:-:1  @P3 FADD c3, part0C3, part4C3;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:0      IADD cy, cy, 8; // advance cy by 8 for the next STORE_C call
+
+[+
+    our $vec;
+    return $vec ? q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0.H0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC0.H1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC1.H0;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC1.H1;
+    } : q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC2;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC3;
+    };
++]
+
+01:-:-:-:1  @P5 FFMA c0, b0, beta, c0; // when P5: c = b * beta + c, where b is the loaded C value converted to fp32
+02:-:-:-:1  @P5 FFMA c1, b1, beta, c1;
+04:-:-:-:1  @P5 FFMA c2, b2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, b3, beta, c3;
+
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT; // when P6 (relu flag): FMNMX with !PT keeps the larger of c and zero
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:0      ISETP.LT.AND P5, PT, cy, param_m, P5; // P5 = (cy < m) && P5, using the already advanced cy
+
+--:-:1:-:1  @P0 F2F.F16.F32 c0, c0; // convert the results to fp16 in place
+--:-:2:-:1  @P1 F2F.F16.F32 c1, c1;
+
+--:-:-:-:0      ISETP.LT.AND P4, PT, cy, param_m, PT; // P4 = cy < m, using the already advanced cy
+
+--:-:3:-:1  @P2 F2F.F16.F32 c2, c2;
+--:-:4:-:1  @P3 F2F.F16.F32 c3, c3;
+
+[+
+    our $vec;
+    return $vec ? q{
+03:-:-:-:2  @P0 BFI c0, c1, 0x1010, c0;
+0c:-:-:-:2  @P0 BFI c1, c3, 0x1010, c2;
+
+--:1:-:-:1  @P0 STG.E.CG.64 [C], c;
+    } : q{
+01:-:-:-:1  @P0 STG.E.U16 [C + 2x<0>], c0;
+02:-:-:-:1  @P1 STG.E.U16 [C + 2x<1>], c1;
+04:-:-:-:1  @P2 STG.E.U16 [C + 2x<2>], c2;
+08:1:-:-:1  @P3 STG.E.U16 [C + 2x<3>], c3;
+    };
++]
+
+// Restore beta preds for the next call: P0-P3 = the saved bits in preds when the updated P5 holds, otherwise all cleared
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+01:-:-:-:6      IADD   C0.CC, C0, ldc8; // C += ldc8 as a two-register add: low word sets the carry, IADD.X adds it into the high word
+--:-:-:-:0      IADD.X C1,    C1, RZ;
+
+--:-:-:-:5      RET; // return to the instruction after the CAL
diff --git a/Kernel/SGEMM/Maxwell/hgemm_nt_32x128.sass b/Kernel/SGEMM/Maxwell/hgemm_nt_32x128.sass
new file mode 100644
index 0000000..eef6e5e
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_nt_32x128.sass
@@ -0,0 +1,588 @@
+# Kernel: hgemm_nt_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<(128*16 + 32)*2 + (32*16 + 32)*2>
+    szShareA : (32*16 + 32)
+    szShareB : (128*16 + 32)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    32-79 ~ tidX, lda, ldb, ldaz, ldbz, ldb32, tid1, tid3, tid96, ta, tb00, tb32, tb64, tb96, xmad_ta, xmad_tb, shiftX, tidY<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadA<0-3>
+      84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3>
+
+    100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+
+    110-120 ~ writeAs, writeBs, k, tidY, txa, txb00, txb32, txb64, txb96
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X; // each S2R sets one of barriers 1-4, consumers below wait on them with masks 01, 02, 04, 08
+--:-:2:-:1      S2R blkB, SR_CTAID.Z;
+--:-:3:-:1      S2R blkA, SR_CTAID.Y;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb32, ldb, 5; // ldb32 = ldb * 32
+
+--:-:-:-:1      STS.128 [addr_zero], RZ; // store zeros at addr_zero, the generated loads below read them back into czero00..czero28 (registers 0-31, shared with the cx accumulators)
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidX   = tid >> 2
+// tidY   = (tid & 3) << 2
+// shiftX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidX, tid,  2;
+01:-:-:-:1      LOP.AND tid3, tid,  3;
+--:-:-:-:1      SHL     tidY, tid3, 2;
+--:-:-:-:1      SHL     shiftX, tid3, 3;
+
+// trackA = param_A + ((blkA*32 + tidX) * lda + tidY + ldaz*blkZ) * 2
+04:-:-:-:1      ISCADD   txa, blkA, tidX, 5;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,  tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 1;
+
+// track0B = param_B + ((blkB*128 + tidX) * ldb + tidY + ldbz*blkZ) * 2; track1B..track3B use offsets 32, 64, 96 rows further (ldb32 steps)
+02:-:-:-:1      ISCADD txb00, blkB, tidX, 7;
+--:-:-:-:1      IADD   txb32, txb00, 32;
+--:-:-:-:1      IADD   txb64, txb00, 64;
+--:-:-:-:1      IADD   txb96, txb00, 96;
+
+--:-:-:-:1      XMAD.LO  tb00, ldb,  txb00, tidY, xmad_tb;
+08:-:-:-:1      XMAD.LO2 tb00, ldbz, blkZ, tb00;
+--:-:-:-:1      IADD     tb32, tb00, ldb32;
+--:-:-:-:1      IADD     tb64, tb32, ldb32;
+--:-:-:-:1      IADD     tb96, tb64, ldb32;
+
+--:-:-:-:1      LEA      track0B0.CC, tb00, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb00, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track1B0.CC, tb32, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb32, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track2B0.CC, tb64, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track2B1,    tb64, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track3B0.CC, tb96, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track3B1,    tb96, param_B[1], RZ, 1;
+
+// writeAs = (tidY*32 + tidX + shiftX) * 4 + 4x<szShareA + szShareB>
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidY*128 + tidX + shiftX) * 4 + 4x<szShareA*2 + szShareB>
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeBs, writeBs, shiftX;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    16;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4) + 4x<szShareA>
+01:-:-:-:1      LOP.AND tid96,  tid,    96;
+--:-:-:-:1      SHR.U32 tid96,  tid96,  2;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readBs, readBs, tid96;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>; // NOTE(review): negative size of one A+B buffer, presumably used by the loop to flip the shared-memory pointers between buffers - confirm
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb32, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb64, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb96, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa,   param_m, PT;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY, k, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY, k, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidY, k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P2 LDG.E.CI.64 load0B, [track0B];
+--:-:2:-:1  @P3 LDG.E.CI.64 load1B, [track1B];
+--:-:3:-:1  @P4 LDG.E.CI.64 load2B, [track2B];
+--:-:4:-:1  @P5 LDG.E.CI.64 load3B, [track3B];
+--:-:5:-:1  @P6 LDG.E.CI.64 loadA,  [trackA];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P2 LDS.U.64 load0B, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.64 load1B, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.64 load2B, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.64 load3B, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.64 loadA,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidY1, tidY, 1;
+--:-:-:-:1      IADD tidY2, tidY, 2;
+--:-:-:-:1      IADD tidY3, tidY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:1:-:1  @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:1:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:1:-:1  @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb32, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb64, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P4;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:3:-:1  @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B0, RZ;
+--:-:-:-:1 @!P1 MOV load2B1, RZ;
+--:-:-:-:1 @!P2 MOV load2B2, RZ;
+--:-:-:-:1 @!P3 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb96, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:4:-:1  @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:4:-:1  @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B0, RZ;
+--:-:-:-:1 @!P1 MOV load3B1, RZ;
+--:-:-:-:1 @!P2 MOV load3B2, RZ;
+--:-:-:-:1 @!P3 MOV load3B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];
+--:-:5:-:1  @P1 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];
+--:-:5:-:1  @P2 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb32, param_n, PT;
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P0, RZ, k, 15;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P0;
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec; # package variable set outside this block; selects which of the two templates below is returned
+    return $vec ? q{
+21:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1; // vec: each source register holds two fp16 halves (.H0/.H1); load0B1 is read here before it is overwritten two lines down
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:1:-:1      F2F.F32.F16 load0B0, load0B0.H0; // last convert of each group carries that group's barrier number in its third control field
+
+02:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+
+04:-:-:-:1      F2F.F32.F16 load2B3, load2B1.H1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B1.H0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B0.H1;
+--:-:3:-:1      F2F.F32.F16 load2B0, load2B0.H0;
+
+08:-:-:-:1      F2F.F32.F16 load3B3, load3B1.H1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B1.H0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B0.H1;
+--:-:4:-:1      F2F.F32.F16 load3B0, load3B0.H0;
+
+10:-:-:-:1      F2F.F32.F16 loadA3, loadA1.H1;
+--:-:-:-:1      F2F.F32.F16 loadA2, loadA1.H0;
+--:-:-:-:1      F2F.F32.F16 loadA1, loadA0.H1;
+--:-:5:-:1      F2F.F32.F16 loadA0, loadA0.H0;
+    } : q{
+21:-:-:-:1      F2F.F32.F16 load0B0, load0B0; // scalar: every register already holds a single fp16 value and is converted in place
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:1:-:1      F2F.F32.F16 load0B3, load0B3;
+
+02:-:-:-:1      F2F.F32.F16 load1B0, load1B0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:2:-:1      F2F.F32.F16 load1B3, load1B3;
+
+04:-:-:-:1      F2F.F32.F16 load2B0, load2B0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B2;
+--:-:3:-:1      F2F.F32.F16 load2B3, load2B3;
+
+08:-:-:-:1      F2F.F32.F16 load3B0, load3B0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B2;
+--:-:4:-:1      F2F.F32.F16 load3B3, load3B3;
+
+10:-:-:-:1      F2F.F32.F16 loadA0, loadA0;
+--:-:-:-:1      F2F.F32.F16 loadA1, loadA1;
+--:-:-:-:1      F2F.F32.F16 loadA2, loadA2;
+--:-:5:-:1      F2F.F32.F16 loadA3, loadA3;
+    };
+</CODE>
+
+01:-:-:-:1      STS [writeBs + 4x<0*128 + 0*32>], load0B0; // leading 01 pairs with barrier 1, flagged by the last load0B F2F above (maxas wait-mask notation)
+--:-:-:-:0      IADD   track0B0.CC, track0B0, 2x<16>; // advance the track0B low word by 2x<16>; .CC keeps the carry for the IADD.X on track0B1 below
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 0*32>], load0B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 0*32>], load0B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 0*32>], load0B3;
+
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ; // high word of the track0B advance: adds only the carry
+
+02:-:-:-:1      STS [writeBs + 4x<0*128 + 1*32>], load1B0;
+--:-:-:-:0      IADD   track1B0.CC, track1B0, 2x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 1*32>], load1B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 1*32>], load1B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 1*32>], load1B3;
+
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;
+
+04:-:-:-:1      STS [writeBs + 4x<0*128 + 2*32>], load2B0;
+--:-:-:-:0      IADD   track2B0.CC, track2B0, 2x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 2*32>], load2B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 2*32>], load2B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 2*32>], load2B3;
+
+--:-:-:-:0      IADD.X track2B1,    track2B1, RZ;
+
+08:-:-:-:1      STS [writeBs + 4x<0*128 + 3*32>], load3B0;
+--:-:-:-:0      IADD   track3B0.CC, track3B0, 2x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 3*32>], load3B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 3*32>], load3B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 3*32>], load3B3;
+
+--:-:-:-:0      IADD.X track3B1,    track3B1, RZ;
+
+10:-:-:-:1      STS [writeAs + 4x<0*32>], loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<16>; // the matching IADD.X on trackA1 is deferred to the end of this sequence
+--:-:-:-:1      STS [writeAs + 4x<1*32>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*32>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*32>], loadA3;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf; // buffer swap: read pointers step by -swapBuf, write pointers by +swapBuf, then swapBuf is negated
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD.X trackA1,    trackA1, RZ; // high word of the trackA advance started by the trackA0.CC add above; no instruction in between writes CC
+
+<CODE>
+    our $vec; # package variable set outside this block; selects the 64-bit or the per-element U16 load template
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.CI.64 load0B, [track0B]; // each load is gated by its own predicate (P2-P6); these were last combined with k >= 32 in the preceding schedule block
+--:-:4:-:1  @P3 LDG.E.CI.64 load1B, [track1B];
+--:-:5:-:1  @P4 LDG.E.CI.64 load2B, [track2B]; // load2B and load3B carry the same barrier number (5)
+--:-:5:-:1  @P5 LDG.E.CI.64 load3B, [track3B];
+--:-:6:-:1  @P6 LDG.E.CI.64 loadA,  [trackA];
+    } : q{
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>]; // scalar: four 16-bit loads at consecutive 2x<i> offsets replace each 64-bit load
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec; # package variable set outside this block; selects the vec or scalar load/convert entries below
+    our $shiftAX = 1; # NOTE(review): not read in this block; presumably consumed by the INCLUDE'd common file - confirm
+    our $shiftBX = 1; # NOTE(review): as above
+    our %insert = # keys are jNcM tags, values are instruction text; presumably spliced into the main loop of the INCLUDE'd common file at step N, slot M - confirm
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n", # k counts down by 16 per pass
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n", # P0 = (k >= 16): gates the stores, the barrier/swap and the LOOP branch below
+        # Stores of the converted tile values into writeBs / writeAs, all predicated on P0.
+        j3c6   => "04:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 0*32>], load0B0;\n",
+        j3c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 0*32>], load0B1;\n",
+        j3c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 0*32>], load0B2;\n",
+        j3c12  => "--:3:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 0*32>], load0B3;\n",
+
+        j5c6   => "08:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 1*32>], load1B0;\n",
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 1*32>], load1B1;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 1*32>], load1B2;\n",
+        j5c12  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 1*32>], load1B3;\n",
+
+        j7c6   => "10:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 2*32>], load2B0;\n",
+        j7c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 2*32>], load2B1;\n",
+        j7c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 2*32>], load2B2;\n",
+        j7c12  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 2*32>], load2B3;\n", # no barrier field here, unlike the other groups; load2B and load3B share barrier 5, which the load3B group sets at j9c12
+
+        j9c6   => "10:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 3*32>], load3B0;\n",
+        j9c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 3*32>], load3B1;\n",
+        j9c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 3*32>], load3B2;\n",
+        j9c12  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 3*32>], load3B3;\n",
+
+        j11c6  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<0*32>], loadA0;\n",
+        j11c8  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], loadA1;\n",
+        j11c10 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], loadA2;\n",
+        j11c12 => "--:6:-:-:1  \@P0 STS [writeAs + 4x<3*32>], loadA3;\n",
+        # Advance each track pointer (low word with .CC, then high word with .X) under that tile's own load predicate.
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0B0.CC, track0B0, 2x<16>;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0B1,    track0B1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, 2x<16>;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P4 IADD   track2B0.CC, track2B0, 2x<16>;\n",
+        j7c13  => "--:-:-:-:1  \@P4 IADD.X track2B1,    track2B1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3B0.CC, track3B0, 2x<16>;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3B1,    track3B1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackA0.CC, trackA0, 2x<16>;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackA1,    trackA1, RZ;\n",
+        # Each load predicate stays set only while k >= 32 (k was already reduced by 16 at j0c6).
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j7c14  => "--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+        # Buffer swap: read pointers step by -swapBuf, write pointers by +swapBuf, then swapBuf is negated.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.64 load0B, [track0B];\n", # vec: one 64-bit global load per tile
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.64 load1B, [track1B];\n",
+                j9c29  => "10:-:5:-:1  \@P4 LDG.E.CI.64 load2B, [track2B];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.64 load3B, [track3B];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.64 loadA,  [trackA];\n",
+                # fp16 -> fp32 conversion of the packed halves (.H0/.H1), highest destination first so sources are read before being overwritten.
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0B0, load0B0.H0;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j6c13  => "10:-:-:-:1  \@P4 F2F.F32.F16 load2B3, load2B1.H1;\n",
+                j6c17  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2B2, load2B1.H0;\n",
+                j6c21  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2B1, load2B0.H1;\n",
+                j6c25  => "--:-:5:-:1  \@P4 F2F.F32.F16 load2B0, load2B0.H0;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B3, load3B1.H1;\n", # no wait mask of its own: load3B shares barrier 5 with load2B, already waited on at j6c13
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B2, load3B1.H0;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B1, load3B0.H1;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3B0, load3B0.H0;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadA3, loadA1.H1;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA2, loadA1.H0;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA1, loadA0.H1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadA0, loadA0.H0;\n",
+            ) :
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n", # scalar: four 16-bit loads per tile, spread over more slots than the vec variant
+                j3c31  => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j4c1   => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j5c31  => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j6c1   => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+
+                j9c29  => "10:-:5:-:1  \@P4 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n",
+                j9c31  => "--:-:5:-:1  \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n",
+                j10c1  => "--:-:5:-:1  \@P4 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n",
+                j10c3  => "--:-:5:-:1  \@P4 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n",
+
+                j10c8  => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n",
+                j10c10 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n",
+                j10c12 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n",
+
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];\n",
+                j11c31 => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];\n",
+                j12c1  => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];\n",
+                # In-place fp16 -> fp32 conversion of each scalar register.
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0B0, load0B0;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B1, load0B1;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B2, load0B2;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0B3, load0B3;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1B0, load1B0;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B1, load1B1;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B2, load1B2;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1B3, load1B3;\n",
+
+                j6c13  => "10:-:-:-:1  \@P4 F2F.F32.F16 load2B0, load2B0;\n",
+                j6c17  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2B1, load2B1;\n",
+                j6c21  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2B2, load2B2;\n",
+                j6c25  => "--:-:5:-:1  \@P4 F2F.F32.F16 load2B3, load2B3;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B0, load3B0;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B1, load3B1;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B2, load3B2;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3B3, load3B3;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadA0, loadA0;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA1, loadA1;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA2, loadA2;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadA3, loadA3;\n",
+            )
+        ),
+
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" . # P0 (k >= 16 from j0c14) continues the loop; otherwise P1 selects the REMAINDER path
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return ''; # this block only defines package variables; it returns an empty string
+</CODE>
+
+<INCLUDE file="hgemm_common_32x128.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/hgemm_nt_32x32.sass b/Kernel/SGEMM/Maxwell/hgemm_nt_32x32.sass
new file mode 100644
index 0000000..1225d7d
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_nt_32x32.sass
@@ -0,0 +1,1067 @@
+# Kernel: hgemm_nt_32x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 16x<32*65>
+    szShareA   : (32*65)
+    szShareB   : (32*65)
+    // NOTE(review): reading Nx<expr> as N*expr, addr_zero (16*32*65) equals twice 4x<szShareA + szShareB>, the value loaded into swapBuf - confirm.
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+    // Kernel parameters: consecutive 4-byte slots from c[0x0][0x140]; the C/A/B addresses take a [0]/[1] pair each.
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // cx<0-7>y<0-7>: 64 names laid out over registers 0-63 in an explicit, permuted order (presumably the result tile accumulators - confirm in the common file).
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+    // czero<00-63> is a second set of names for registers 0-63; the setup code clears them through these names.
+       0-63 : czero<00-63>
+      64-79 : j0Ay<0-7>, j0Bx<0-7>
+      80-95 : j1Ay<0-7>, j1Bx<0-7>
+    // Setup-time temporaries share 64-95 with the j0/j1 names above ('~' presumably lets the assembler choose the slot within the range - confirm).
+      64-95 ~ tidX, tidY, tidY<1-3>, lda, ldb, ldaz, ldbz, lda16, ldb16, tid1, tid16, tid16_8, ta<00|16>, txa<00|16>, tb<00|16>, txb<00|16>, xmad_ta, xmad_tb, shiftX, predsY0, predsY4, partialK
+    // Staging registers for the global loads, and the lo/hi register pairs holding each track address.
+     96-127 :  load0A<0-7>,  load1A<0-7>,  load0B<0-7>,  load1B<0-7>
+    128-135 : track0A<0-1>, track1A<0-1>, track0B<0-1>, track1B<0-1>
+    // Loop state and thread/block identifiers.
+    136-142 ~ swapBuf, readAs, readBs, writeAs, writeBs, k
+    143-149 ~ tid, blkA, blkB, blkZ, writeCs, preds
+    // The names below reuse register ranges already named above (0-31, 64-95, 96-142); NOTE(review): presumably for the output stage - confirm.
+       0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3>
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+      96-99 : loadC<0-3>
+    100-103 : b<0-3>
+    104-107 : c<0-3>
+    108-109 : C<0-1>
+    110-142 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc16, readCs, alpha, beta, flags, tid7, tid8
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X; // each S2R carries its own barrier number (1-4); the setup block below waits with the matching mask before first use
+--:-:2:-:1      S2R blkA, SR_CTAID.Y; // barrier 2, paired with the 02 mask on the txa00 ISCADD
+--:-:3:-:1      S2R blkB, SR_CTAID.Z; // barrier 3, paired with the 04 mask on the txb00 ISCADD
+--:-:4:-:1      S2R blkZ, SR_CTAID.X; // barrier 4, paired with the 08 mask on the ta00 XMAD.LO2
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL lda16, lda, 4;
+--:-:-:-:1      SHL ldb16, ldb, 4;
+// Store 128 bits of zero at addr_zero, then read them back 16 times to clear czero00..czero63, four registers per load.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// tidX   = tid >> 3
+// tidY   = (tid & 7) << 3
+// shiftX = (tid & 7) << 2
+01:-:-:-:1      SHR.U32 tidX, tid,  3;
+--:-:-:-:1      LOP.AND tidY, tid,  7;
+--:-:-:-:1      SHL     shiftX, tidY, 2;
+--:-:-:-:1      SHL     tidY,   tidY, 3;
+
+// track0A = param_A + ((blkA*32 + tidX) * lda + tidY + blkZ*ldaz) * 2; track1A is 16 rows (lda16) further on
+02:-:-:-:1      ISCADD   txa00, blkA, tidX, 5;
+--:-:-:-:1      IADD     txa16, txa00, 16;
+--:-:-:-:1      XMAD.LO  ta00, lda,  txa00, tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta00, ldaz, blkZ, ta00;
+--:-:-:-:1      IADD     ta16, ta00, lda16;
+--:-:-:-:1      LEA      track0A0.CC, ta00, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta00, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track1A0.CC, ta16, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track1A1,    ta16, param_A[1], RZ, 1;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa16, param_m, PT;
+
+// track0B = param_B + ((blkB*32 + tidX) * ldb + tidY + blkZ*ldbz) * 2; track1B is 16 rows (ldb16) further on
+04:-:-:-:1      ISCADD   txb00, blkB, tidX, 5;
+--:-:-:-:1      IADD     txb16, txb00, 16;
+--:-:-:-:1      XMAD.LO  tb00, ldb,  txb00, tidY, xmad_tb;
+--:-:-:-:1      XMAD.LO2 tb00, ldbz, blkZ, tb00;
+--:-:-:-:1      IADD     tb16, tb00, ldb16;
+--:-:-:-:1      LEA      track0B0.CC, tb00, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb00, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track1B0.CC, tb16, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb16, param_B[1], RZ, 1;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb16, param_n, PT;
+
+--:-:-:-:1      P2R preds, PR, RZ, 0x3c; // keep the row-in-range predicates P2-P5 (mask 0x3c) so later R2P instructions can restore them
+
+// writeAs = (tidY*32 + tidX + shiftX) * 4
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftX;
+--:-:-:-:1      SHL    writeAs, writeAs, 2;
+
+// writeBs = (tidY*32 + tidX + shiftX) * 4 + 4x<szShareA>
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 5;
+--:-:-:-:1      IADD   writeBs, writeBs, shiftX;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA>, 2;
+
+
+// readAs = (((tid & 8) >> 2) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    8;
+--:-:-:-:1      SHR.U32 readAs, readAs, 2;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((tid >> 1) & 3) << 4
+--:-:-:-:1      BFE.U32 readBs, tid,    0x201; // 2 bits at position 1
+--:-:-:-:1      SHL     readBs, readBs, 4;
+
+// tid16 = tid & -16
+// tid16_8 = tid16 / 2 * 4   (i.e. tid16 << 1)
+--:-:-:-:1      LOP.AND tid16, tid, -16;
+--:-:-:-:1      SHL     tid16_8, tid16, 1;
+
+// writeCs = (readAs + tid16*4) * 32 + readBs;
+--:-:-:-:1      ISCADD writeCs, tid16,   readAs, 2;
+--:-:-:-:1      ISCADD writeCs, writeCs, readBs, 5;
+
+// Each block of 16 threads works on 8 lines, shifted over by 4
+// readAs += tid16_8 * 32 + tid16
+// readBs += tid16_8 * 32 + tid16 + 4x<szShareA>
+--:-:-:-:1      ISCADD readAs, tid16_8, readAs, 5;
+--:-:-:-:1      ISCADD readBs, tid16_8, readBs, 5;
+--:-:-:-:1      IADD   readAs, tid16, readAs;
+--:-:-:-:1      IADD3  readBs, tid16, 4x<szShareA>, readBs;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareA + szShareB>;
+
+// If k is not a multiple of 64 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 64 then make a full 64 line fetch.
+--:-:-:-:1      LOP.AND.Z P0, partialK, k, 63;
+--:-:-:-:1  @P0 MOV partialK, 64;
+--:-:-:-:1      IADD k, k, -partialK;
+[+
+    our $vec; # package variable set outside this block; selects the 128-bit or the per-element U16 first fetch
+    return $vec ? q{
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x3c;
+--:-:-:-:1 @!P1 R2P PR, RZ, 0x3c;
+
+<ORDERED>
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.128 load1A, [track1A];
+--:-:4:-:1  @P4 LDG.E.CI.128 load0B, [track0B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load1B, [track1B];
+</ORDERED>
+
+<ORDERED>
+--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.128 load1A, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128 load0B, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.128 load1B, [addr_zero];
+</ORDERED>
+
+    } : q{
+--:-:-:-:1      IADD tidY1, tidY, 1;
+--:-:-:-:1      IADD tidY2, tidY, 2;
+--:-:-:-:1      IADD tidY3, tidY, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, partialK, PT;
+--:-:-:-:1      P2R predsY0, PR, RZ, 0x0f; // predsY0 = k-in-range flags (P0-P3) for offsets tidY+0 .. tidY+3
+// Same range test for offsets tidY+4 .. tidY+7; tidY and tidY1-3 are advanced in place.
+--:-:-:-:1      IADD tidY,  tidY,  4;
+--:-:-:-:1      IADD tidY1, tidY1, 4;
+--:-:-:-:1      IADD tidY2, tidY2, 4;
+--:-:-:-:1      IADD tidY3, tidY3, 4;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, partialK, PT;
+--:-:-:-:1      P2R predsY4, PR, RZ, 0x0f;
+
+// Each tile below: a row-in-range test picks predsY0/predsY4 or all-false into P0-P3, then 8 scalar fp16 loads; lanes that are masked off get RZ.
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa00, param_m, PT;
+--:-:-:-:1  @P4 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1  @P4 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A4, RZ;
+--:-:-:-:1 @!P1 MOV load0A5, RZ;
+--:-:-:-:1 @!P2 MOV load0A6, RZ;
+--:-:-:-:1 @!P3 MOV load0A7, RZ;
+
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa16, param_m, PT;
+--:-:-:-:1  @P5 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1  @P5 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1A4, [track1A + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1A5, [track1A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1A6, [track1A + 2x<6>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A4, RZ;
+--:-:-:-:1 @!P1 MOV load1A5, RZ;
+--:-:-:-:1 @!P2 MOV load1A6, RZ;
+--:-:-:-:1 @!P3 MOV load1A7, RZ;
+
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb00, param_n, PT;
+--:-:-:-:1  @P6 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1  @P6 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B4, RZ;
+--:-:-:-:1 @!P1 MOV load0B5, RZ;
+--:-:-:-:1 @!P2 MOV load0B6, RZ;
+--:-:-:-:1 @!P3 MOV load0B7, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb16, param_n, PT;
+--:-:-:-:1  @P4 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1  @P4 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B4, RZ;
+--:-:-:-:1 @!P1 MOV load1B5, RZ;
+--:-:-:-:1 @!P2 MOV load1B6, RZ;
+--:-:-:-:1 @!P3 MOV load1B7, RZ;
+    };
++]
+--:-:-:-:1      SHL partialK, partialK, 1; // doubled before it is added to the track addresses after this block (the scalar loads step 2x<1> per fp16 element)
+
+--:-:-:-:1      ISETP.GE.AND P0, PT, k, 64, PT; // P0 = another full 64-wide fetch remains: restore P2-P5 from preds, otherwise clear them
+--:-:-:-:1      IADD k, k, -64;
+--:-:-:-:1  @P0 R2P PR, preds, 0x3c;
+--:-:-:-:1 @!P0 R2P PR, RZ, 0x3c;
+</SCHEDULE_BLOCK>
+
+[+
+    our $vec; # package variable set outside this block; selects the packed (.H0/.H1) or in-place conversion template
+    return $vec ? q{
+22:-:-:-:1      F2F.F32.F16 load0A7, load0A3.H1; // vec mask 22 adds barrier 6 (the LDS zero-fill) to barrier 2 (the load0A LDG); the scalar variant below uses 02 only
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A3.H0;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A2.H1;
+--:-:6:-:1      F2F.F32.F16 load0A4, load0A2.H0; // upper four results flag barrier 6, paired with the 20 mask on the first STS below
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0.H0; // lower four results flag barrier 2, paired with the 02 mask on the load0A3 STS
+    } : q{
+02:-:-:-:1      F2F.F32.F16 load0A7, load0A7;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A6;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A5;
+--:-:6:-:1      F2F.F32.F16 load0A4, load0A4;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A3;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0;
+    };
++]
+--:-:-:-:0      IADD   track0A0.CC, track0A0, partialK; // first advance uses partialK (already doubled above) rather than a fixed step
+20:-:-:-:1      STS [writeAs + 4x<7*32 + 0*16>], load0A7;
+--:-:-:-:1      STS [writeAs + 4x<6*32 + 0*16>], load0A6;
+--:-:-:-:1      STS [writeAs + 4x<5*32 + 0*16>], load0A5;
+--:-:-:-:1      STS [writeAs + 4x<4*32 + 0*16>], load0A4;
+02:-:-:-:1      STS [writeAs + 4x<3*32 + 0*16>], load0A3;
+--:-:-:-:1      STS [writeAs + 4x<2*32 + 0*16>], load0A2;
+--:-:-:-:1      STS [writeAs + 4x<1*32 + 0*16>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<0*32 + 0*16>], load0A0;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ; // high word of the track0A advance: adds only the carry from the .CC add above
+
+[+
+    our $vec; # package variable set outside this block; selects the packed (.H0/.H1) or in-place conversion template
+    return $vec ? q{
+04:-:-:-:1      F2F.F32.F16 load1A7, load1A3.H1; // mask 04 pairs with barrier 3, set by the load1A LDG; highest destination first so sources are read before being overwritten
+--:-:-:-:1      F2F.F32.F16 load1A6, load1A3.H0;
+--:-:-:-:1      F2F.F32.F16 load1A5, load1A2.H1;
+--:-:6:-:1      F2F.F32.F16 load1A4, load1A2.H0;
+--:-:-:-:1      F2F.F32.F16 load1A3, load1A1.H1;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A1.H0;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A0.H1;
+--:-:2:-:1      F2F.F32.F16 load1A0, load1A0.H0;
+    } : q{
+04:-:-:-:1      F2F.F32.F16 load1A7, load1A7;
+--:-:-:-:1      F2F.F32.F16 load1A6, load1A6;
+--:-:-:-:1      F2F.F32.F16 load1A5, load1A5;
+--:-:6:-:1      F2F.F32.F16 load1A4, load1A4;
+--:-:-:-:1      F2F.F32.F16 load1A3, load1A3;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A2;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A1;
+--:-:2:-:1      F2F.F32.F16 load1A0, load1A0;
+    };
++]
+--:-:-:-:0      IADD   track1A0.CC, track1A0, partialK;
+20:-:-:-:1      STS [writeAs + 4x<7*32 + 1*16>], load1A7; // the 1*16 term offsets these stores by 16 words from the load0A stores
+--:-:-:-:1      STS [writeAs + 4x<6*32 + 1*16>], load1A6;
+--:-:-:-:1      STS [writeAs + 4x<5*32 + 1*16>], load1A5;
+--:-:-:-:1      STS [writeAs + 4x<4*32 + 1*16>], load1A4;
+02:-:-:-:1      STS [writeAs + 4x<3*32 + 1*16>], load1A3;
+--:-:-:-:1      STS [writeAs + 4x<2*32 + 1*16>], load1A2;
+--:-:-:-:1      STS [writeAs + 4x<1*32 + 1*16>], load1A1;
+--:-:-:-:1      STS [writeAs + 4x<0*32 + 1*16>], load1A0;
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ; // high word of the track1A advance: adds only the carry from the .CC add above
+// load0B: wait for its LDG (barrier 4, wait mask 08), widen eight F16 values to F32 in descending register order (packed sources are read before they are overwritten), store them via writeBs once F2F barriers 6/2 (masks 20/02) clear, and add partialK to the 64-bit track0B pointer.
+[+
+    our $vec;
+    return $vec ? q{
+08:-:-:-:1      F2F.F32.F16 load0B7, load0B3.H1;
+--:-:-:-:1      F2F.F32.F16 load0B6, load0B3.H0;
+--:-:-:-:1      F2F.F32.F16 load0B5, load0B2.H1;
+--:-:6:-:1      F2F.F32.F16 load0B4, load0B2.H0;
+--:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:2:-:1      F2F.F32.F16 load0B0, load0B0.H0;
+    } : q{
+08:-:-:-:1      F2F.F32.F16 load0B7, load0B7;
+--:-:-:-:1      F2F.F32.F16 load0B6, load0B6;
+--:-:-:-:1      F2F.F32.F16 load0B5, load0B5;
+--:-:6:-:1      F2F.F32.F16 load0B4, load0B4;
+--:-:-:-:1      F2F.F32.F16 load0B3, load0B3;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:2:-:1      F2F.F32.F16 load0B0, load0B0;
+    };
++]
+--:-:-:-:0      IADD   track0B0.CC, track0B0, partialK;
+20:-:-:-:1      STS [writeBs + 4x<7*32 + 0*16>], load0B7;
+--:-:-:-:1      STS [writeBs + 4x<6*32 + 0*16>], load0B6;
+--:-:-:-:1      STS [writeBs + 4x<5*32 + 0*16>], load0B5;
+--:-:-:-:1      STS [writeBs + 4x<4*32 + 0*16>], load0B4;
+02:-:-:-:1      STS [writeBs + 4x<3*32 + 0*16>], load0B3;
+--:-:-:-:1      STS [writeBs + 4x<2*32 + 0*16>], load0B2;
+--:-:-:-:1      STS [writeBs + 4x<1*32 + 0*16>], load0B1;
+--:-:-:-:1      STS [writeBs + 4x<0*32 + 0*16>], load0B0;
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ;
+// load1B: wait for its LDG (barrier 5, wait mask 10), widen eight F16 values to F32 in descending register order (packed sources are read before they are overwritten), store them via writeBs once F2F barriers 6/2 (masks 20/02) clear, and add partialK to the 64-bit track1B pointer.
+[+
+    our $vec;
+    return $vec ? q{
+10:-:-:-:1      F2F.F32.F16 load1B7, load1B3.H1;
+--:-:-:-:1      F2F.F32.F16 load1B6, load1B3.H0;
+--:-:-:-:1      F2F.F32.F16 load1B5, load1B2.H1;
+--:-:6:-:1      F2F.F32.F16 load1B4, load1B2.H0;
+--:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+    } : q{
+10:-:-:-:1      F2F.F32.F16 load1B7, load1B7;
+--:-:-:-:1      F2F.F32.F16 load1B6, load1B6;
+--:-:-:-:1      F2F.F32.F16 load1B5, load1B5;
+--:-:6:-:1      F2F.F32.F16 load1B4, load1B4;
+--:-:-:-:1      F2F.F32.F16 load1B3, load1B3;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0;
+    };
++]
+--:-:-:-:0      IADD   track1B0.CC, track1B0, partialK;
+20:-:-:-:1      STS [writeBs + 4x<7*32 + 1*16>], load1B7;
+--:-:-:-:1      STS [writeBs + 4x<6*32 + 1*16>], load1B6;
+--:-:-:-:1      STS [writeBs + 4x<5*32 + 1*16>], load1B5;
+--:-:-:-:1      STS [writeBs + 4x<4*32 + 1*16>], load1B4;
+02:-:-:-:1      STS [writeBs + 4x<3*32 + 1*16>], load1B3;
+--:-:-:-:1      STS [writeBs + 4x<2*32 + 1*16>], load1B2;
+--:-:-:-:1      STS [writeBs + 4x<1*32 + 1*16>], load1B1;
+--:-:-:-:1      STS [writeBs + 4x<0*32 + 1*16>], load1B0;
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;
+// Barrier, then move writeAs/writeBs by swapBuf and negate swapBuf; the four LDS fetch the step-0 operands (j0Ay*, j0Bx*) and the last one signals barrier 1.
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*32 + 16>];
+// Issue the global loads for streams 0A/1A/0B/1B under P2-P5; the last LDG of each stream signals barrier 2/3/4/5 (one 128-bit load per stream when $vec is set, eight U16 loads otherwise).
+[+
+    our $vec;
+    return $vec ? q{
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.128 load1A, [track1A];
+--:-:4:-:1  @P4 LDG.E.CI.128 load0B, [track0B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load1B, [track1B];
+    } : q{
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A4, [track1A + 2x<4>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A5, [track1A + 2x<5>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A6, [track1A + 2x<6>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>];
+
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];
+--:-:4:-:1  @P4 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];
+
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];
+    };
++]
+
+LOOP:
+
+[+
+    our $vec;
+    our %insert =   # "jNcM" => text emitted right after FFMA number M of unrolled step N
+    (
+        j0c8   => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, RZ, PT;\n",
+        j0c10  => "--:-:-:-:1      ISETP.GE.AND P1, PT, k, 64, PT;\n" .
+                  "--:-:-:-:1      IADD k, k, -64;\n",
+        # Above: P0 = (k >= 0), P1 = (k >= 64), then k -= 64.  Below: P2-P5 are taken from preds (mask 0x3c) when P1 holds, otherwise cleared.
+        j0c23  => "--:-:-:-:1  \@P1 R2P PR, preds, 0x3c;\n",
+        j0c24  => "--:-:-:-:1 \@!P1 R2P PR, RZ,    0x3c;\n",
+        # Advance each 64-bit track pointer (add with carry) while its stream predicate P2-P5 is set.
+        j3c32  => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 2x<64>;\n",
+        j3c37  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j4c32  => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, 2x<64>;\n",
+        j4c37  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j5c32  => "--:-:-:-:1  \@P4 IADD   track0B0.CC, track0B0, 2x<64>;\n",
+        j5c37  => "--:-:-:-:1  \@P4 IADD.X track0B1,    track0B1, RZ;\n",
+        j6c32  => "--:-:-:-:1  \@P5 IADD   track1B0.CC, track1B0, 2x<64>;\n",
+        j6c37  => "--:-:-:-:1  \@P5 IADD.X track1B1,    track1B1, RZ;\n",
+        # End of step 6: barrier, then (while P0) move readAs/readBs and writeAs/writeBs by swapBuf in opposite directions and negate swapBuf.
+        j6c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Variant-specific slots for the four streams 0A, 1A, 0B, 1B: F2F widening, STS through the write pointers, and the LDG prefetch.
+        ($vec ?
+            (
+                j2c45 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n",
+                j2c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n",
+                j2c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n",
+                j2c57 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n",
+                j2c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j3c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j3c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j3c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j3c45 => "04:-:-:-:1  \@P0 F2F.F32.F16 load1A7, load1A3.H1;\n",
+                j3c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A6, load1A3.H0;\n",
+                j3c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A5, load1A2.H1;\n",
+                j3c57 => "--:-:3:-:1  \@P0 F2F.F32.F16 load1A4, load1A2.H0;\n",
+                j3c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A3, load1A1.H1;\n",
+                j4c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A2, load1A1.H0;\n",
+                j4c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A1, load1A0.H1;\n",
+                j4c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load1A0, load1A0.H0;\n",
+
+                j4c45 => "08:-:-:-:1  \@P0 F2F.F32.F16 load0B7, load0B3.H1;\n",
+                j4c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B6, load0B3.H0;\n",
+                j4c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B5, load0B2.H1;\n",
+                j4c57 => "--:-:4:-:1  \@P0 F2F.F32.F16 load0B4, load0B2.H0;\n",
+                j4c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j5c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j5c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j5c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n",
+
+                j5c45 => "10:-:-:-:1  \@P0 F2F.F32.F16 load1B7, load1B3.H1;\n",
+                j5c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B6, load1B3.H0;\n",
+                j5c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B5, load1B2.H1;\n",
+                j5c57 => "--:-:5:-:1  \@P0 F2F.F32.F16 load1B4, load1B2.H0;\n",
+                j5c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j6c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j6c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j6c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j3c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<7*32 + 0*16>], load0A7;\n",
+                j3c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*32 + 0*16>], load0A6;\n",
+                j3c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*32 + 0*16>], load0A5;\n",
+                j3c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*32 + 0*16>], load0A4;\n",
+                j3c24  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<3*32 + 0*16>], load0A3;\n",
+                j3c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32 + 0*16>], load0A2;\n",
+                j3c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32 + 0*16>], load0A1;\n",
+                j3c30  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<0*32 + 0*16>], load0A0;\n",
+
+                j4c16  => "04:-:-:-:1  \@P0 STS [writeAs + 4x<7*32 + 1*16>], load1A7;\n",
+                j4c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*32 + 1*16>], load1A6;\n",
+                j4c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*32 + 1*16>], load1A5;\n",
+                j4c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*32 + 1*16>], load1A4;\n",
+                j4c24  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<3*32 + 1*16>], load1A3;\n",
+                j4c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32 + 1*16>], load1A2;\n",
+                j4c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32 + 1*16>], load1A1;\n",
+                j4c30  => "--:3:-:-:1  \@P0 STS [writeAs + 4x<0*32 + 1*16>], load1A0;\n",
+
+                j5c16  => "08:-:-:-:1  \@P0 STS [writeBs + 4x<7*32 + 0*16>], load0B7;\n",
+                j5c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*32 + 0*16>], load0B6;\n",
+                j5c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*32 + 0*16>], load0B5;\n",
+                j5c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*32 + 0*16>], load0B4;\n",
+                j5c24  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<3*32 + 0*16>], load0B3;\n",
+                j5c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*32 + 0*16>], load0B2;\n",
+                j5c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*32 + 0*16>], load0B1;\n",
+                j5c30  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<0*32 + 0*16>], load0B0;\n",
+
+                j6c16  => "10:-:-:-:1  \@P0 STS [writeBs + 4x<7*32 + 1*16>], load1B7;\n",
+                j6c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*32 + 1*16>], load1B6;\n",
+                j6c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*32 + 1*16>], load1B5;\n",
+                j6c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*32 + 1*16>], load1B4;\n",
+                j6c24  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<3*32 + 1*16>], load1B3;\n",
+                j6c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*32 + 1*16>], load1B2;\n",
+                j6c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*32 + 1*16>], load1B1;\n",
+                j6c30  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<0*32 + 1*16>], load1B0;\n",
+
+                j3c62 => "02:-:2:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j4c62 => "04:-:3:-:1  \@P3 LDG.E.CI.128 load1A, [track1A];\n",
+                j5c62 => "08:-:4:-:1  \@P4 LDG.E.CI.128 load0B, [track0B];\n",
+                j6c62 => "10:-:5:-:1  \@P5 LDG.E.CI.128 load1B, [track1B];\n",
+            ) :
+            (
+                j2c45 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A0, load0A0;\n",
+                j2c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A1;\n",
+                j2c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A2;\n",
+                j2c57 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A3, load0A3;\n",
+                j2c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A4;\n",
+                j3c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A5;\n",
+                j3c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A6;\n",
+                j3c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load0A7, load0A7;\n",
+
+                j3c45 => "04:-:-:-:1  \@P0 F2F.F32.F16 load1A0, load1A0;\n",
+                j3c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A1, load1A1;\n",
+                j3c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A2, load1A2;\n",
+                j3c57 => "--:-:3:-:1  \@P0 F2F.F32.F16 load1A3, load1A3;\n",
+                j3c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A4, load1A4;\n",
+                j4c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A5, load1A5;\n",
+                j4c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A6, load1A6;\n",
+                j4c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load1A7, load1A7;\n",
+
+                j4c45 => "08:-:-:-:1  \@P0 F2F.F32.F16 load0B0, load0B0;\n",
+                j4c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B1;\n",
+                j4c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B2;\n",
+                j4c57 => "--:-:4:-:1  \@P0 F2F.F32.F16 load0B3, load0B3;\n",
+                j4c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B4, load0B4;\n",
+                j5c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B5, load0B5;\n",
+                j5c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B6, load0B6;\n",
+                j5c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load0B7, load0B7;\n",
+
+                j5c45 => "10:-:-:-:1  \@P0 F2F.F32.F16 load1B0, load1B0;\n",
+                j5c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B1;\n",
+                j5c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B2;\n",
+                j5c57 => "--:-:5:-:1  \@P0 F2F.F32.F16 load1B3, load1B3;\n",
+                j5c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B4, load1B4;\n",
+                j6c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B5, load1B5;\n",
+                j6c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B6, load1B6;\n",
+                j6c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load1B7, load1B7;\n",
+
+                j3c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<0*32 + 0*16>], load0A0;\n",
+                j3c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32 + 0*16>], load0A1;\n",
+                j3c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32 + 0*16>], load0A2;\n",
+                j3c22  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<3*32 + 0*16>], load0A3;\n",
+                j3c24  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<4*32 + 0*16>], load0A4;\n",
+                j3c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*32 + 0*16>], load0A5;\n",
+                j3c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*32 + 0*16>], load0A6;\n",
+                j3c30  => "--:6:-:-:1  \@P0 STS [writeAs + 4x<7*32 + 0*16>], load0A7;\n",
+
+                j4c16  => "04:-:-:-:1  \@P0 STS [writeAs + 4x<0*32 + 1*16>], load1A0;\n",
+                j4c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32 + 1*16>], load1A1;\n",
+                j4c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32 + 1*16>], load1A2;\n",
+                j4c22  => "--:3:-:-:1  \@P0 STS [writeAs + 4x<3*32 + 1*16>], load1A3;\n",
+                j4c24  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<4*32 + 1*16>], load1A4;\n",
+                j4c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*32 + 1*16>], load1A5;\n",
+                j4c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*32 + 1*16>], load1A6;\n",
+                j4c30  => "--:6:-:-:1  \@P0 STS [writeAs + 4x<7*32 + 1*16>], load1A7;\n",
+
+                j5c16  => "08:-:-:-:1  \@P0 STS [writeBs + 4x<0*32 + 0*16>], load0B0;\n",
+                j5c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*32 + 0*16>], load0B1;\n",
+                j5c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*32 + 0*16>], load0B2;\n",
+                j5c22  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<3*32 + 0*16>], load0B3;\n",
+                j5c24  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<4*32 + 0*16>], load0B4;\n",
+                j5c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*32 + 0*16>], load0B5;\n",
+                j5c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*32 + 0*16>], load0B6;\n",
+                j5c30  => "--:6:-:-:1  \@P0 STS [writeBs + 4x<7*32 + 0*16>], load0B7;\n",
+
+                j6c16  => "10:-:-:-:1  \@P0 STS [writeBs + 4x<0*32 + 1*16>], load1B0;\n",
+                j6c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*32 + 1*16>], load1B1;\n",
+                j6c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*32 + 1*16>], load1B2;\n",
+                j6c22  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<3*32 + 1*16>], load1B3;\n",
+                j6c24  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<4*32 + 1*16>], load1B4;\n",
+                j6c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*32 + 1*16>], load1B5;\n",
+                j6c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*32 + 1*16>], load1B6;\n",
+                j6c30  => "--:6:-:-:1  \@P0 STS [writeBs + 4x<7*32 + 1*16>], load1B7;\n",
+
+                j3c48 => "02:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j3c50 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j3c52 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j3c54 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+                j3c56 => "20:-:-:-:1  \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n",
+                j3c58 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n",
+                j3c60 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n",
+                j3c62 => "--:-:2:-:1  \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n",
+
+                j4c48 => "04:-:-:-:1  \@P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];\n",
+                j4c50 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n",
+                j4c52 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n",
+                j4c54 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n",
+                j4c56 => "20:-:-:-:1  \@P3 LDG.E.CI.U16 load1A4, [track1A + 2x<4>];\n",
+                j4c58 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A5, [track1A + 2x<5>];\n",
+                j4c60 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A6, [track1A + 2x<6>];\n",
+                j4c62 => "--:-:3:-:1  \@P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>];\n",
+
+                j5c48 => "08:-:-:-:1  \@P4 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
+                j5c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j5c52 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j5c54 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+                j5c56 => "20:-:-:-:1  \@P4 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];\n",
+                j5c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];\n",
+                j5c60 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];\n",
+                j5c62 => "--:-:4:-:1  \@P4 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];\n",
+
+                j6c48 => "10:-:-:-:1  \@P5 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j6c50 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j6c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j6c54 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+                j6c56 => "20:-:-:-:1  \@P5 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];\n",
+                j6c58 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];\n",
+                j6c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];\n",
+                j6c62 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];\n",
+            )
+        ),
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+    my @cOrder;   # the 64 (x, y) accumulator coordinates, in FFMA issue order
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);   # (dx, dy) offsets applied to every base (x, y) below
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out = '';
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+        # Slots c0/c2/c4/c6 fetch the operands of step (j + 1) % 8 into the other register set (j0/j1 alternate); on step 7 only while P0.
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # The FFMA gets stall 0 when one of the listed opcodes is inserted right after it, otherwise stall 1.
+            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+            # Yield flag only on FFMA 32, and only when that FFMA keeps its stall.
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+            # The first FFMA of a step waits on barrier 1, which the c6 LDS of the previous step signals.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+// Output setup: load alpha/beta/flags, then derive readCs, the C address and the bounds predicates that STORE_C relies on.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// readCs = ((tid & 7) * 4 + (tid / 8) * 32) * 4
+--:-:-:-:1      LOP.AND tid7, tid, 7;
+--:-:-:-:1      SHR.U32 tid8, tid, 3;
+--:-:-:-:1      SHL     tid7, tid7, 2;
+--:-:-:-:1      ISCADD readCs, tid8, tid7, 5;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*32 + tid7 (tid7 was scaled to (tid & 7) * 4 above); cx1-cx3 are the next three columns
+--:-:-:-:1      ISCADD cx, blkB, tid7, 5;
+--:-:-:-:1      IADD   cx1, cx, 1;
+--:-:-:-:1      IADD   cx2, cx, 2;
+--:-:-:-:1      IADD   cx3, cx, 3;
+
+// cy = blkA*32 + tid8
+--:-:-:-:1      ISCADD cy, blkA, tid8, 5;
+
+// C = param_C + (cy*ldc + cx + ldcz*blkZ) * 2;  ldc16 = ldc * 32
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      SHL  ldc16, ldc, 5;
+
+--:-:-:-:1      XMAD.LO  ci, cy, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C1,    ci, param_C[1], RZ, 1;
+
+// P0-P3 = (cx, cx1, cx2, cx3) < n, then saved into preds (mask 0x0f)
+--:-:-:-:1      ISETP.LT.AND P0, PT, cx,  param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cx1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cx2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cx3, param_n, PT;
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+
+// P4 = cy < m
+--:-:-:-:1      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+// P5 = beta != 0 && P4
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P4;
+
+// P6 = Apply relu (flags & 2 is non-zero)
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+
+// Init beta preds: P0-P3 = P5 ? preds : 0
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+</SCHEDULE_BLOCK>
+// Scale accumulator rows y0-y3 by alpha and stage them at writeCs; after the barrier, STORE_C writes them out. Rows y4-y7 then repeat the sequence before EXIT.
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32 + 00>], shuffle_x0y1;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:0      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32 + 16>], shuffle_x4y1;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:0      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*32 + 00>], shuffle_x0y2;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*32 + 16>], shuffle_x4y2;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*32 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeCs+4x<3*32 + 16>], shuffle_x4y3;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_C;
+// Rows y4-y7: same staging; the BAR.SYNC before the first STS presumably keeps the staged data intact until every thread has finished STORE_C's reads (confirm).
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha;
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:0      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32 + 00>], shuffle_x0y4;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:0      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32 + 16>], shuffle_x4y4;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32 + 00>], shuffle_x0y5;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32 + 16>], shuffle_x4y5;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*32 + 00>], shuffle_x0y6;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*32 + 16>], shuffle_x4y6;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*32 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeCs+4x<3*32 + 16>], shuffle_x4y7;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_C;
+
+--:-:-:-:5      EXIT;
+// STORE_C: sum the eight staged partials for one row of four outputs, add beta*C when P5 is set, apply relu when P6 is set, convert to F16 and store; then step cy by 16 and C by ldc16 and return.
+STORE_C:
+// Fetch the current C values under the beta preds that P0-P3 hold on entry; the last load signals barrier 1.
+[+
+    our $vec;
+    return $vec ? q{
+--:-:1:-:1  @P0 LDG.E.64 loadC, [C];
+    } : q{
+--:-:-:-:0 @!P0 MOV loadC0, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>];
+--:-:-:-:0 @!P1 MOV loadC1, RZ;
+--:-:-:-:1  @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>];
+--:-:-:-:0 @!P2 MOV loadC2, RZ;
+--:-:-:-:1  @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>];
+--:-:-:-:0 @!P3 MOV loadC3, RZ;
+--:-:1:-:1  @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>];
+    };
++]
+
+// Restore output preds: P0-P3 = P4 ? preds : 0
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,    0x0f;
+// Read the eight staged partial results for this thread's four outputs; barriers 2-5 each cover one pair of loads.
+--:-:-:-:1      LDS.U.128 part0C, [readCs + 4x<0*16*32>];
+--:-:2:-:1      LDS.U.128 part1C, [readCs + 4x<1*16*32>];
+--:-:-:-:1      LDS.U.128 part2C, [readCs + 4x<2*16*32>];
+--:-:3:-:1      LDS.U.128 part3C, [readCs + 4x<3*16*32>];
+--:-:-:-:1      LDS.U.128 part4C, [readCs + 4x<4*16*32>];
+--:-:4:-:1      LDS.U.128 part5C, [readCs + 4x<5*16*32>];
+--:-:-:-:1      LDS.U.128 part6C, [readCs + 4x<6*16*32>];
+--:-:5:-:1      LDS.U.128 part7C, [readCs + 4x<7*16*32>];
+
+<SCHEDULE_BLOCK>
+02:-:-:-:1  @P0 FADD part0C0, part0C0, part1C0;
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part1C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part1C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part1C3;
+
+04:-:-:-:1  @P0 FADD part2C0, part2C0, part3C0;
+--:-:-:-:1  @P1 FADD part2C1, part2C1, part3C1;
+--:-:-:-:1  @P2 FADD part2C2, part2C2, part3C2;
+--:-:-:-:1  @P3 FADD part2C3, part2C3, part3C3;
+
+08:-:-:-:1  @P0 FADD part4C0, part4C0, part5C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part5C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part5C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part5C3;
+
+10:-:-:-:1  @P0 FADD part6C0, part6C0, part7C0;
+--:-:-:-:1  @P1 FADD part6C1, part6C1, part7C1;
+--:-:-:-:1  @P2 FADD part6C2, part6C2, part7C2;
+--:-:-:-:1  @P3 FADD part6C3, part6C3, part7C3;
+
+--:-:-:-:1  @P0 FADD part0C0, part0C0, part2C0;
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part2C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part2C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part2C3;
+
+--:-:-:-:1  @P0 FADD part4C0, part4C0, part6C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part6C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part6C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part6C3;
+
+--:-:-:-:1  @P0 FADD c0, part0C0, part4C0;
+--:-:-:-:1  @P1 FADD c1, part0C1, part4C1;
+--:-:-:-:1  @P2 FADD c2, part0C2, part4C2;
+--:-:-:-:1  @P3 FADD c3, part0C3, part4C3;
+</SCHEDULE_BLOCK>
+// Step the row index for the next call; P5 and P4 are re-tested against it further down.
+--:-:-:-:0      IADD cy, cy, 16;
+
+[+
+    our $vec;
+    return $vec ? q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0.H0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC0.H1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC1.H0;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC1.H1;
+    } : q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC2;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC3;
+    };
++]
+// c += b * beta where the beta path is active (P5); each FFMA waits on the barrier signalled by its F2F.
+01:-:-:-:1  @P5 FFMA c0, b0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, b1, beta, c1;
+04:-:-:-:1  @P5 FFMA c2, b2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, b3, beta, c3;
+// relu when P6 is set (FMNMX against RZ).
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:0      ISETP.LT.AND P5, PT, cy, param_m, P5;
+
+--:-:1:-:1  @P0 F2F.F16.F32 c0, c0;
+--:-:2:-:1  @P1 F2F.F16.F32 c1, c1;
+
+--:-:-:-:0      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+--:-:3:-:1  @P2 F2F.F16.F32 c2, c2;
+--:-:4:-:1  @P3 F2F.F16.F32 c3, c3;
+// Store as F16: two BFI merges and one 64-bit STG when $vec is set, four U16 stores otherwise; the (last) store sets barrier 1 in the second control field.
+[+
+    our $vec;
+    return $vec ? q{
+03:-:-:-:2  @P0 BFI c0, c1, 0x1010, c0;
+0c:-:-:-:2  @P0 BFI c1, c3, 0x1010, c2;
+
+--:1:-:-:1  @P0 STG.E.CG.64 [C], c;
+    } : q{
+01:-:-:-:1  @P0 STG.E.U16 [C + 2x<0>], c0;
+02:-:-:-:1  @P1 STG.E.U16 [C + 2x<1>], c1;
+04:-:-:-:1  @P2 STG.E.U16 [C + 2x<2>], c2;
+08:1:-:-:1  @P3 STG.E.U16 [C + 2x<3>], c3;
+    };
++]
+
+// Restore beta preds for the next call: P0-P3 = P5 ? preds : 0
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+// C += ldc16 as a 64-bit add with carry; wait mask 01 holds it back until the store above releases barrier 1.
+01:-:-:-:6      IADD   C0.CC, C0, ldc16;
+--:-:-:-:0      IADD.X C1,    C1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/hgemm_tn_128x128.sass b/Kernel/SGEMM/Maxwell/hgemm_tn_128x128.sass
new file mode 100644
index 0000000..c2beee1
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_tn_128x128.sass
@@ -0,0 +1,360 @@
+# Kernel: hgemm_tn_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+[-
+
+# Variant selector. It is only declared in this block, so its value has to
+# be supplied from outside (not visible in this file).
+our $int16;
+
+# Opcode used to convert the 16-bit inputs to fp32: the I2F form when $int16
+# is true, the F2F form otherwise.
+our $convert = 'F2F.F32.F16';
+$convert = 'I2F.F32.S16' if $int16;
+
+# Accessor returning the input conversion opcode chosen above.
+sub convert_in
+{
+    return $convert;
+}
+
+# Returns the extra constant-mapping entries needed when $int16 is true,
+# and an empty string otherwise.
+sub int16_params
+{
+    return "" unless $int16;
+
+    return q{
+param_Stats[0]  : c[0x0][0x190]
+param_Stats[1]  : c[0x0][0x194]
+param_scale     : c[0x0][0x198]
+    };
+}
+-]
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+    [+ int16_params() +]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-95   ~ lda, ldb, ldaz, ldbz, tid1, tid7, tid31, tid128, tid15, tidX, blk, x<1-3>, y<1-3>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-103  : loadA<0-3>, loadB<0-3>
+
+    104-107 : trackA<0-1>, trackB<0-1>
+
+    108-118 ~ writeS, k, txa, txb, tidY, ta, tb, loop
+    119-127 ~ readAs, readBs, tid, blkA, blkB, blkZ
+
+    64-75   ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128
+
+    64-79   : c<0-7>, d3, d2, d1, d0, cs<0-3>
+    64-65   : Stats<0-1>
+    80-89   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    90-118  ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV loop, RZ;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);
++]
+
+--:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      LOP.AND tid128, tid,  128;
+
+// tidX = (tid & 31) << 2
+// tidY = (tid >> 5) & 7
+01:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      SHL     tidX,   tid31, 2;
+--:-:-:-:1      BFE.U32 tidY,   tid,  0x305; // 3 bits at position 5
+
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 4;
+--:-:-:-:1      SHR.U32 ldb, ldb, 4;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// trackA += (blkA*128 + lda*tidY + tidX) * 2
+02:-:-:-:1      ISCADD   txa, blkA, tidX, 7;
+--:-:-:-:1      XMAD.LO2 ta,  lda,  tidY, txa;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x1;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// trackB += (blkB*128 + ldb*tidY + tidX) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 7;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x1;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeS = (128*tidY + tidX) * 4 + 4*(128*8*2)
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      ISCADD  writeS, writeS, 4x<128*8*2>, 2;
+
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+[+
+    our $vec;
+    return $vec ? q{
+
+// doLoad = tidY < k && txa|txb < n|m
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY, k, P6;
+
+--:-:2:-:1  @P2 LDG.E.CI.64 loadA, [trackA];
+--:-:3:-:1  @P3 LDG.E.CI.64 loadB, [trackB];
+
+--:-:5:-:1 @!P2 LDS.U.64 loadA, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.64 loadB, [addr_zero];
+
+    // Vec 4 and scalar loads
+    } : q{
+
+// doLoadA = tidY < k && txa < m
+// doLoadB = tidY < k && txb < n
+--:-:-:-:1      IADD x1, txa, 1;
+--:-:-:-:1      IADD x2, txa, 2;
+--:-:-:-:1      IADD x3, txa, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_m, P0;
+
+--:-:2:-:1  @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<00 + 0>];
+--:-:2:-:1  @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<00 + 1>];
+--:-:2:-:1  @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<00 + 2>];
+--:-:2:-:1  @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<00 + 3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      IADD y1, txb, 1;
+--:-:-:-:1      IADD y2, txb, 2;
+--:-:-:-:1      IADD y3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, y1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y3, param_n, P0;
+
+--:-:3:-:1  @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>];
+--:-:3:-:1  @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>];
+--:-:3:-:1  @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>];
+--:-:3:-:1  @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+    };
++]
+
+</SCHEDULE_BLOCK>
+
+[+
+    our $vec;
+    our $convert;
+    return $vec ? qq{
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 7;
+
+12:-:-:-:4      $convert loadA3, loadA1.H1;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, param_lda8;
+--:-:-:-:4      $convert loadA2, loadA1.H0;
+--:-:-:-:4      $convert loadA1, loadA0.H1;
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+--:-:2:-:2      $convert loadA0, loadA0.H0;
+
+02:-:-:-:1      STS.128 [writeS + 4x<0*128>], loadA;
+
+24:-:-:-:4      $convert loadB3, loadB1.H1;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      $convert loadB2, loadB1.H0;
+--:-:-:-:4      $convert loadB1, loadB0.H1;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+--:-:3:-:2      $convert loadB0, loadB0.H0;
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, P1;
+
+04:-:-:-:1      STS.128 [writeS + 4x<8*128>], loadB;
+
+    // scalar loads
+    } : qq{
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+02:-:-:-:4      $convert loadA0, loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, param_lda8;
+--:-:-:-:4      $convert loadA1, loadA1;
+--:-:-:-:4      $convert loadA2, loadA2;
+--:-:2:-:2      $convert loadA3, loadA3;
+
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+02:-:-:-:1      STS.128 [writeS + 4x<0*128>], loadA0;
+
+04:-:-:-:4      $convert loadB0, loadB0;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      $convert loadB1, loadB1;
+--:-:-:-:4      $convert loadB2, loadB2;
+--:-:3:-:2      $convert loadB3, loadB3;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+04:-:-:-:1      STS.128 [writeS + 4x<8*128>], loadB0;
+
+    };
++]
+
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+
+[+
+    # Defines the package variables @top and %insert for this kernel and
+    # emits no text itself (bare return at the end).
+    # NOTE(review): these are presumably consumed by the loop generator in
+    # the included hgemm_common_128x128.sass -- confirm against that file.
+    #
+    # %insert keys have the form j<N>c<M>; each value is one or more
+    # complete instruction lines, control codes included.
+    our $vec;
+    our $convert;
+    # k threshold used by the predicates below: 16 with $vec, 24 without.
+    my $k_end = $vec ? 16 : 24;
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P5;\n");
+
+    our %insert =
+    (
+        # Predicates: P3 = k >= $k_end && P6, P0 = k >= $k_end.
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P6;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+
+        ($vec ?
+            (
+        # Vector path: one 64-bit load per matrix, then four conversions
+        # that unpack the .H0/.H1 halves of the two loaded words.
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI.64 loadA0, [trackA];\n",
+        j0c13 => "--:-:3:-:1  \@P3 LDG.E.CI.64 loadB0, [trackB];\n",
+
+        j5c1  => "02:-:-:-:1  \@P2 $convert loadA3, loadA1.H1;\n",
+        j5c5  => "--:-:-:-:1  \@P2 $convert loadA2, loadA1;\n",
+        j5c9  => "--:-:-:-:1  \@P2 $convert loadA1, loadA0.H1;\n",
+        j5c13 => "--:-:2:-:1  \@P2 $convert loadA0, loadA0;\n",
+
+        j6c1  => "04:-:-:-:1  \@P3 $convert loadB3, loadB1.H1;\n",
+        j6c5  => "--:-:-:-:1  \@P3 $convert loadB2, loadB1;\n",
+        j6c9  => "--:-:-:-:1  \@P3 $convert loadB1, loadB0.H1;\n",
+        j6c13 => "--:-:3:-:1  \@P3 $convert loadB0, loadB0;\n",
+            ) :
+            (
+        # Scalar path: four 16-bit loads per matrix, each converted in
+        # place in its own register.
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n",
+
+        j0c29 => "--:-:3:-:1  \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n",
+        j0c31 => "--:-:3:-:1  \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n",
+        j0c33 => "--:-:3:-:1  \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n",
+        j0c35 => "--:-:3:-:1  \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n",
+
+        j5c1  => "02:-:-:-:1  \@P2 $convert loadA0, loadA0;\n",
+        j5c5  => "--:-:-:-:1  \@P2 $convert loadA1, loadA1;\n",
+        j5c9  => "--:-:-:-:1  \@P2 $convert loadA2, loadA2;\n",
+        j5c13 => "--:-:2:-:1  \@P2 $convert loadA3, loadA3;\n",
+
+        j6c1  => "04:-:-:-:1  \@P3 $convert loadB0, loadB0;\n",
+        j6c5  => "--:-:-:-:1  \@P3 $convert loadB1, loadB1;\n",
+        j6c9  => "--:-:-:-:1  \@P3 $convert loadB2, loadB2;\n",
+        j6c13 => "--:-:3:-:1  \@P3 $convert loadB3, loadB3;\n",
+            )
+        ),
+
+        # Common to both paths: store the converted A and B values to shared
+        # memory (under P0) and advance the global track pointers by
+        # param_lda8 / param_ldb8 (under P2 / P3).
+        j5c31 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*128>], loadA;\n",
+
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, param_lda8;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c31 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<8*128>], loadB;\n",
+
+        j6c46 => "--:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        # Under P0: barrier, then XOR the read/write shared addresses with
+        # 4x<128*8*2>. k is decremented by 8 unconditionally.
+        j6c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1      IADD32I k, k, -8;\n",
+
+        # Branch back to LOOP while P0 holds, otherwise to REMAINDER when
+        # P1 holds; with neither set, execution falls through.
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+    );
+    return;
++]
+
+<INCLUDE file="hgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/hgemm_tn_128x16.sass b/Kernel/SGEMM/Maxwell/hgemm_tn_128x16.sass
new file mode 100644
index 0000000..5cd8cce
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_tn_128x16.sass
@@ -0,0 +1,554 @@
+# Kernel: hgemm_tn_128x16
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Named constants for hgemm_tn_128x16: the shared-memory address used to
+# zero the accumulators (addr_zero), the grid dimensions, and the kernel
+# parameters located in constant bank 0.
+# NOTE(review): the code in this file references param_Rand[0] and
+# param_Rand[1] (LEA / LEA.HI.X into Rand0 / Rand1, before the seed load and
+# before the final seed store), but this mapping does not define them.
+# Presumably the assembler cannot resolve those names as written -- confirm
+# the intended constant offsets and add the two entries here.
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*2 + 16*8*2 + 0>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    16-17 : Rand<0-1>
+
+    18-47 ~ lda, ldb, ldaz, ldbz, lda8, ldb8, ta, tb, tid1, tid96, tidAX, tidBX, tidY, txa, txb, dimA, flag
+
+    0-15  : czero<00-15>
+
+    3, 2,11,10 : cx<0-3>y0
+    7, 6,15,14 : cx<0-3>y1
+    1, 0, 9, 8 : cx<0-3>y2
+    5, 4,13,12 : cx<0-3>y3
+
+    16-23   : j0Ay<0-3>, j0Bx<0-3>
+    24-31   : j1Ay<0-3>, j1Bx<0-3>
+    32-39   : j2Ay<0-3>, j2Bx<0-3>
+    40-47   : j3Ay<0-3>, j3Bx<0-3>
+
+    48-55   : load0A<0-7>
+    56-63   : load1A<0-7>
+    64-71   : load2A<0-7>
+    72-79   : load3A<0-7>
+
+    80-83   : load<0-3>B
+
+    84-87   : track0A<0-1>, track0B<0-1>
+    88-91   : track1A<0-1>, track1B<0-1>
+    92-95   : track2A<0-1>, track2B<0-1>
+    96-99   : track3A<0-1>, track3B<0-1>
+
+    100-104 ~ writeAs, writeBs, k, lda32, ldb32
+    105-112 ~ readAs, readBs, tid, blkA, blkB, blkZ, tbid, seed
+
+    16-25   : c<0-3>, b<0-1>, d3, d2, d1, d0
+    26-27   : Cy<0-1>
+    28-104  ~ ldc, ldcz, ldc1, writeCs, readCs, tidCX, tidCY, cx, cy, ci, xmad_c, alpha, beta, flags, tid31, lfsr<0-2>, exp<0-3>, rand<0-3>, lfsr<0-2>_1, lfsr<0-2>_2, clk_shf1, clk_shf2
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k, param_k;
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+
+--:-:-:-:1      LDS.U.128 czero00, [addr_zero];
+--:-:-:-:1      LDS.U.128 czero04, [addr_zero];
+--:-:-:-:1      LDS.U.128 czero08, [addr_zero];
+--:-:-:-:1      LDS.U.128 czero12, [addr_zero];
+
+// Grab a seed for this thread
+// (blkB*gridDimA*256 + blkA*256 + tid) & (2048*32 - 1)
+--:-:-:-:1      MOV flag, param_flags;
+--:-:-:-:1      LOP.AND.NZ P4, RZ, flag, 0x1;
+--:-:-:-:1      MOV dimA, gridDimA;
+03:-:-:-:1      ISCADD tbid, blkA, tid, 8;
+04:-:-:-:1      XMAD.U16.U16 dimA, blkB, dimA, RZ;
+--:-:-:-:1      ISCADD tbid, dimA, tbid, 8;
+--:-:-:-:1      LOP.AND seed, tbid, 1x<2048*32 - 1>;
+--:-:-:-:1      LEA      Rand0.CC, seed, param_Rand[0],     0x2;
+--:-:-:-:1      LEA.HI.X Rand1,    seed, param_Rand[1], RZ, 0x2;
+--:-:-:-:1  @P4 LDG.E.CS seed, [Rand];
+
+// tidBX =  tid & 15
+// tidAX = (tid & 15) << 3
+// tidY = (tid >> 4) & 7
+01:-:-:-:1      LOP.AND tidBX, tid,   15;
+--:-:-:-:1      SHL     tidAX, tidBX, 3;
+--:-:-:-:1      BFE.U32 tidY,  tid,   0x304; // 3 bits at position 4
+
+--:-:-:-:1      MOV lda8,   param_lda8;
+--:-:-:-:1      MOV ldb8,   param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda8, 4;
+--:-:-:-:1      SHR.U32 ldb, ldb8, 4;
+--:-:-:-:1      SHL lda32, lda8, 2;
+--:-:-:-:1      SHL ldb32, ldb8, 2;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+
+// trackA += (blkA*128 + lda*tidY + tidAX) * 2
+02:-:-:-:1      ISCADD   txa, blkA, tidAX,  7;
+--:-:-:-:1      XMAD.LO2 ta,  lda,  tidY, txa;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     0x1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 0x1;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// trackB += (blkB*16 + ldb*tidY + tidBX) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 4;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      track0B0.CC, tb, param_B[0],     0x1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb, param_B[1], RZ, 0x1;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeAs = (128*tidY + tidAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidY, tidAX, 7;
+--:-:-:-:1      SHL    writeAs, writeAs, 2;
+
+// writeBs = (16*tidY + tidBX) * 4 + 4*(128*8)
+--:-:-:-:1      ISCADD writeBs, tidY, tidBX, 4;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<128*8>, 2;
+
+// Start the read buffers low
+// readAs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4
+--:-:-:-:1      LOP.AND tid96,  tid,    96;
+--:-:-:-:1      SHR.U32 tid96,  tid96,  2;
+--:-:-:-:1      BFE.U32 readAs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readAs, readAs, tid96;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = ((((tid & 0x10) >> 3) | (tid & 1)) << 4) + 4*(128*8);
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readBs, tid,    0x10;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+--:-:-:-:1      IADD   track1A0.CC, track0A0, lda8;
+--:-:-:-:1      IADD.X track1A1,    track0A1, RZ;
+--:-:-:-:1      IADD   track1B0.CC, track0B0, ldb8;
+--:-:-:-:1      IADD.X track1B1,    track0B1, RZ;
+
+--:-:-:-:1      IADD   track2A0.CC, track1A0, lda8;
+--:-:-:-:1      IADD.X track2A1,    track1A1, RZ;
+--:-:-:-:1      IADD   track2B0.CC, track1B0, ldb8;
+--:-:-:-:1      IADD.X track2B1,    track1B1, RZ;
+
+--:-:-:-:1      IADD   track3A0.CC, track2A0, lda8;
+--:-:-:-:1      IADD.X track3A1,    track2A1, RZ;
+--:-:-:-:1      IADD   track3B0.CC, track2B0, ldb8;
+--:-:-:-:1      IADD.X track3B1,    track2B1, RZ;
+
+<ORDERED>
+--:-:3:-:1  @P5 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P6 LDG.E.CI.S16 load0B, [track0B];
+
+--:-:4:-:1  @P5 LDG.E.CI.128 load1A, [track1A];
+--:-:4:-:1  @P6 LDG.E.CI.S16 load1B, [track1B];
+
+--:-:5:-:1  @P5 LDG.E.CI.128 load2A, [track2A];
+--:-:5:-:1  @P6 LDG.E.CI.S16 load2B, [track2B];
+
+--:-:6:-:1  @P5 LDG.E.CI.128 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.CI.S16 load3B, [track3B];
+</ORDERED>
+
+--:-:-:-:1      ISETP.GE.AND P0, PT, k, 32, PT;
+--:-:-:-:1      ISETP.GT.AND P3, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GT.AND P4, PT, k, 32, P6;
+--:-:-:-:1      IADD k, k, -32;
+</SCHEDULE_BLOCK>
+
+04:-:-:-:4      F2F.F32.F16 load0A7, load0A3.H1;
+--:-:-:-:4      F2F.F32.F16 load0A6, load0A3.H0;
+--:-:-:-:0      IADD   track0A0.CC, track0A0, lda32;
+--:-:-:-:4      F2F.F32.F16 load0A5, load0A2.H1;
+--:-:1:-:4      F2F.F32.F16 load0A4, load0A2.H0;
+--:-:-:-:0      IADD.X track0A1, track0A1, RZ;
+--:-:-:-:4      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:4      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:0      IADD   track0B0.CC, track0B0, ldb32;
+--:-:-:-:4      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:2:-:4      F2F.F32.F16 load0A0, load0A0.H0;
+--:-:-:-:0      IADD.X track0B1, track0B1, RZ;
+--:-:3:-:1      F2F.F32.F16 load0B, load0B;
+
+01:-:-:-:1      STS.128 [writeAs + 4x<0*(128*8 + 16*8) + 4>], load0A4;
+02:-:-:-:1      STS.128 [writeAs + 4x<0*(128*8 + 16*8) + 0>], load0A0;
+04:-:-:-:1      STS     [writeBs + 4x<0*(128*8 + 16*8) + 0>], load0B;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 0*(128*8 + 16*8)>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*16  + 0*(128*8 + 16*8)>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 0*(128*8 + 16*8)>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*16  + 0*(128*8 + 16*8)>];
+--:-:3:-:1  @P3 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P4 LDG.E.CI.S16 load0B, [track0B];
+
+LOOP:
+
+<CODE>
+
+    # Generates the body of the main loop as text: 4 k-steps x 8 j-steps x
+    # 16 FFMA instructions, with the shared/global loads, conversions,
+    # stores, barrier and loop branch interleaved after specific FFMAs.
+    # The returned string replaces this section.
+
+    our @top;
+    our %insert;
+
+    # Visit order of the 16 accumulators cx<x>y<y> within one j-step:
+    # the four-entry @swirl offset pattern is applied to each (x, y) base,
+    # with the y bases reversed after each x, giving 16 [x, y] pairs.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    # Any lines queued in @top are emitted ahead of the generated loop body.
+    my $out = join '', @top;
+
+
+    foreach my $k (0 .. 3)
+    {
+        # $store   : index of the load<N>A / load<N>B / track<N> register set
+        #            handled during this k-step.
+        # $loadBar : barrier number set by that register set's loads.
+        # $storBar : wait mask (two hex digits) with the bit for $loadBar set.
+        # $shareBuf: which of the two shared-memory buffers is written.
+        my $shareBuf = ($k + 1) & 1;
+        my $store   = ($k + 1) & 3;
+        my $loadBar = $store + 3;
+        my $storBar = sprintf '%02x', 1 << ($store + 2);
+
+        # Instructions to splice in, keyed by "j<j-step>c<FFMA index>"; the
+        # text is appended right after that FFMA in the loop below.
+        %insert =
+        (
+            # Unpack the eight fp16 values of the 128-bit A load and the
+            # single B value to fp32.
+            j0c11 => "$storBar:-:-:-:1  \@P0 F2F.F32.F16 load${store}A7, load${store}A3.H1;\n",
+            j0c15 => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A6, load${store}A3.H0;\n",
+            j1c3  => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A5, load${store}A2.H1;\n",
+            j1c7  => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A4, load${store}A2.H0;\n",
+            j1c11 => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A3, load${store}A1.H1;\n",
+            j1c15 => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A2, load${store}A1.H0;\n",
+            j2c3  => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A1, load${store}A0.H1;\n",
+            j2c7  => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A0, load${store}A0.H0;\n",
+            j2c11 => "--:-:$loadBar:-:1  \@P0 F2F.F32.F16 load${store}B, load${store}B;\n",
+
+            # Advance the 64-bit track pointers by lda32 / ldb32.
+            j2c12 => "--:-:-:-:1  \@P0 IADD   track${store}A0.CC, track${store}A0, lda32;\n",
+            j3c1  => "--:-:-:-:1  \@P0 IADD.X track${store}A1,    track${store}A1, RZ;\n",
+            j3c3  => "--:-:-:-:1  \@P0 IADD   track${store}B0.CC, track${store}B0, ldb32;\n",
+            j3c8  => "--:-:-:-:1  \@P0 IADD.X track${store}B1,    track${store}B1, RZ;\n",
+
+            # Store the converted values into shared buffer $shareBuf.
+            j3c9  => "$storBar:-:-:-:1  \@P0 STS.128 [writeAs + 4x<$shareBuf*(128*8 + 16*8) + 0>], load${store}A0;\n",
+            j4c4 => "--:-:-:-:1  \@P0 STS.128 [writeAs + 4x<$shareBuf*(128*8 + 16*8) + 4>], load${store}A4;\n",
+            j4c6 => "--:-:-:-:1  \@P0 STS     [writeBs + 4x<$shareBuf*(128*8 + 16*8) + 0>], load${store}B;\n",
+
+            j5c15 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n",
+
+            # Issue the next global loads for this register set.
+            j6c1  => "--:-:$loadBar:-:1  \@P3 LDG.E.CI.128 load${store}A, [track${store}A];\n",
+            j6c3  => "--:-:$loadBar:-:1  \@P4 LDG.E.CI.S16 load${store}B, [track${store}B];\n",
+
+            # Only in the last k-step: refresh the loop predicates, subtract
+            # 32 from k and branch back to LOOP while P0 holds.
+            ($k == 3 ?
+                (
+                j0c4  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 32, PT;\n",
+                j0c6  => "--:-:-:-:1      ISETP.GT.AND P3, PT, k, 32, P5;\n",
+                j0c8  => "--:-:-:-:1      ISETP.GT.AND P4, PT, k, 32, P6;\n",
+                j0c10 => "--:-:-:-:1      IADD k, k, -32;\n",
+
+                j7c15 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+                ) : ()
+            ),
+        );
+
+        foreach my $j (0 .. 7)
+        {
+            # $rsPred   : the shared loads of the last two j-steps of the
+            #             last k-step are predicated on P0.
+            # $barrier  : alternates 1, 2 between consecutive j-steps.
+            # $loadReg  : j<N> register set filled by this step's LDS pair.
+            # $compute  : j<N> register set read by this step's FFMAs.
+            # $shareLine: shared-memory line read by this step's LDS pair.
+            my $rsPred    = $j >= 6 && $k == 3 ? '@P0' : '   ';
+            my $barrier   = $j & 1 ? 2 : 1;
+            my $loadReg   = ($j + 2) & 3;
+            my $compute   = $j & 3;
+            my $shareLine = ($j + 2) & 7;
+            # The last two j-steps already read from the other buffer.
+            $shareBuf     = $j >= 6 ? ($k + 1) & 1 : $k & 1;
+
+            # Shared-memory loads go after FFMA 0 (A) and FFMA 2 (B).
+            $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + %d*(128*8 + 16*8)>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shareBuf;
+            $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*16  + %d*(128*8 + 16*8)>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shareBuf;
+
+            foreach my $c (0 .. 15)
+            {
+                my ($x,$y) = @{$cOrder[$c]};
+
+                my $ins    = $insert{"j${j}c$c"} || '';
+
+                # Only the first FFMA of a j-step waits on this step's barrier.
+                my $wait   = $c == 0 ? "0$barrier" : '--';
+
+                # Stall count 0 when the first inserted line is one of the
+                # listed opcodes, 1 otherwise (including no insert at all).
+                my $stall  = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+                # Set the yield flag on FFMA 8 unless its stall count is 0.
+                my $yield  = $c == 8 && $stall ? 'Y' : '-';
+
+                my $ctrl   = "$wait:-:-:$yield:$stall";
+
+                # One FFMA line, immediately followed by any inserted text.
+                $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+            }
+        }
+        $out .= "\n";
+    }
+    return $out;
+
+</CODE>
+
+//<INCLUDE file="hgemm_common_128x16.sass"/>
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 16 + readBs;
+--:-:-:-:1      LOP.AND readAs, readAs, 0x1ff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0x1ff;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 2;
+
+// tidCX = (tid & 3) << 2
+// tidCY = tid >> 2
+--:-:-:-:1      LOP.AND tid31, tid,   31;
+--:-:-:-:1      LOP.AND tidCX, tid,   3;
+--:-:-:-:1      SHL     tidCX, tidCX, 2;
+--:-:-:-:1      SHR.U32 tidCY, tid,   2;
+
+// readCs = (tidCY*16 + tidCX)   << 2;
+--:-:-:-:1      ISCADD readCs, tidCY, tidCX, 4;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*16 + tidCX;
+--:-:-:-:1      ISCADD cx, blkB, tidCX, 4;
+
+// cy = blkA*128 + tidCY*4
+--:-:-:-:1      SHL     cy, tidCY, 2;
+--:-:-:-:1      ISCADD  cy, blkA,  cy, 7;
+
+// C += (cy*ldc + cx) * 2;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, cy, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ,  ci;
+--:-:-:-:1      LEA      Cy0.CC, ci, param_C[0],     1;
+--:-:-:-:0      LEA.HI.X Cy1,    ci, param_C[1], RZ, 1;
+
+// cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+
+// beta != 0
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+// Random Round flag
+--:-:-:-:2      LOP.AND.NZ P4, RZ, flags, 1;
+
+// Apply relu
+--:-:-:-:1      LOP.AND.NZ P3, RZ, flags, 2;
+
+--:-:-:-:1      SHL ldc1, ldc, 1;
+
+// Seed the Tausworthe
+--:-:-:-:1      LOP.XOR lfsr0, seed, tbid;
+--:-:-:-:1      CS2R lfsr1, SR_CLOCKLO;
+--:-:-:-:1      CS2R lfsr2, SR_GLOBALTIMERLO;
+--:-:-:-:1      LOP.AND clk_shf1, lfsr1, 31;
+--:-:-:-:1      LOP.AND clk_shf2, lfsr2, 31;
+--:-:-:-:1      LOP.XOR clk_shf1, clk_shf1, tid31;
+--:-:-:-:1      LOP.XOR clk_shf2, clk_shf2, tid31;
+--:-:-:-:1      SHF.R.U64 lfsr1, lfsr1, clk_shf1, lfsr1;
+--:-:-:-:1      SHF.R.U64 lfsr2, lfsr2, clk_shf2, lfsr2;
+--:-:-:-:1      LOP.AND tbid, tbid, 1x<2048*32 - 1>;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # For each accumulator row y0..y3: scale the four accumulators of that
+    # row by alpha into c0..c3, then call STORE_C. The generated text is
+    # returned as a single string.
+    my @row_text = map {
+        my $row = $_;
+        join '',
+            "--:-:-:-:1      FMUL c0, cx0y$row, alpha;\n",
+            "--:-:-:-:1      FMUL c1, cx1y$row, alpha;\n",
+            "--:-:-:-:1      FMUL c2, cx2y$row, alpha;\n",
+            "--:-:-:-:0      FMUL c3, cx3y$row, alpha;\n",
+            "--:-:-:-:5      CAL STORE_C;\n\n";
+    } 0 .. 3;
+
+    return join '', @row_text;
+
+</CODE>
+
+--:-:-:-:6      LEA      Rand0.CC, tbid, param_Rand[0],     0x2;
+--:-:-:-:1      LEA.HI.X Rand1,    tbid, param_Rand[1], RZ, 0x2;
+--:-:-:-:2      LOP3.LUT seed, lfsr0, lfsr1, lfsr2, 0x96;
+--:-:-:-:1  @P4 STG.E.CS [Rand], seed;
+
+--:-:-:-:5      EXIT;
+
+
+// STORE_C: subroutine invoked with CAL once per accumulator row.
+// In : c0-c3 hold the alpha-scaled results for the row, Cy is the 64-bit
+//      output address, cy the row index.
+// Out: up to four fp16 values written to [Cy]; cy advanced by 1 and Cy by
+//      ldc1 so that the next call handles the following row.
+// Predicates set earlier in the kernel: P6 = cx < n, P5 = P6 && beta != 0,
+// P4 = random-round flag, P3 = relu flag.
+STORE_C:
+
+// P1: existing C must be read (row in range and P5).
+// P0: the result may be stored (row in range and P6).
+--:-:-:-:2      ISETP.LT.AND P1, PT, cy, param_m, P5;
+--:-:-:Y:b      ISETP.LT.AND P0, PT, cy, param_m, P6;
+--:-:-:-:0      IADD cy, cy, 1;
+
+// Fetch the four existing fp16 values of C (two 32-bit words in b0, b1).
+--:-:1:-:1  @P1 LDG.E.64 b0, [Cy];
+
+// Apply relu
+--:-:-:-:1  @P3 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P3 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P3 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:4  @P3 FMNMX c3, c3, RZ, !PT;
+
+// Round-trip the four values through shared memory: written at writeCs,
+// read back at readCs.
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:5:-:1      LDS.U.128 c0, [readCs];
+
+// Widen the loaded C values to fp32 ...
+01:-:1:-:4  @P1 F2F.F32.F16 d3, b1.H1;
+--:-:2:-:4  @P1 F2F.F32.F16 d2, b1.H0;
+--:-:3:-:4  @P1 F2F.F32.F16 d1, b0.H1;
+--:-:4:-:1  @P1 F2F.F32.F16 d0, b0.H0;
+
+// ... and accumulate them: c = d * beta + c.
+11:-:-:-:1  @P1 FFMA c3, d3, beta, c3;
+02:-:-:-:1  @P1 FFMA c2, d2, beta, c2;
+04:-:-:-:1  @P1 FFMA c1, d1, beta, c1;
+08:-:-:-:0  @P1 FFMA c0, d0, beta, c0;
+
+// Random rounding requested: convert in RANDOM_ROUND instead of below.
+--:-:-:-:5  @P4 BRA.U DO_RANDOM1;
+
+// Default path: plain fp32 -> fp16 conversion.
+--:-:1:-:4      F2F.F16.F32 c0, c0;
+--:-:2:-:4      F2F.F16.F32 c1, c1;
+--:-:3:-:4      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+--:-:-:-:5      BRA.U END_ROUND1;
+
+DO_RANDOM1:
+
+--:-:-:-:5      CAL RANDOM_ROUND;
+
+END_ROUND1:
+
+// Pack 2 16 bit values into 32 bit words
+03:-:-:-:2      BFI c0, c1, 0x1010, c0;
+0c:-:-:-:2      BFI c1, c3, 0x1010, c2;
+
+// Write the packed pair of words when the row/column is in range.
+--:1:-:-:2  @P0 STG.E.64 [Cy], c0;
+
+// Advance the 64-bit output pointer by ldc1 (ldc << 1) to the next row.
+01:-:-:-:6      IADD   Cy0.CC, Cy0, ldc1;
+--:-:-:-:0      IADD.X Cy1,    Cy1, RZ;
+
+--:-:-:-:5      RET;
+
+RANDOM_ROUND:
+
+<SCHEDULE_BLOCK>
+
+// Strip mantissa and leave sign+exponent
+--:-:-:-:1      LOP32I.AND exp0, c0, 0xff800000;
+--:-:-:-:1      LOP32I.AND exp1, c1, 0xff800000;
+--:-:-:-:1      LOP32I.AND exp2, c2, 0xff800000;
+--:-:-:-:1      LOP32I.AND exp3, c3, 0xff800000;
+
+// Find the exponent that will shift 32 bits of integer data
+// out past the lsb of this number as an fp16
+// exp *= 2^-10 * 2^-32  (2^-42)
+--:-:-:-:1      FMUL32I exp0, exp0, 0x2a800000;
+--:-:-:-:1      FMUL32I exp1, exp1, 0x2a800000;
+--:-:-:-:1      FMUL32I exp2, exp2, 0x2a800000;
+--:-:-:-:1      FMUL32I exp3, exp3, 0x2a800000;
+
+// lfsr0 = ((lfsr0 & 0xfffffffe) << 12) ^ (((lfsr0 << 13) ^ lfsr0) >> 19);
+--:-:-:-:1      LOP32I.AND lfsr0_1, lfsr0, 0xfffffffe;
+--:-:-:-:1      SHL lfsr0_1, lfsr0_1, 12;
+--:-:-:-:1      SHL lfsr0_2, lfsr0, 13;
+--:-:-:-:1      LOP.XOR lfsr0_2, lfsr0_2, lfsr0;
+--:-:-:-:1      SHR.U32 lfsr0_2, lfsr0_2, 19;
+--:-:-:-:1      LOP.XOR lfsr0, lfsr0_1, lfsr0_2;
+
+// lfsr1 = ((lfsr1 & 0xfffffff8) <<  4) ^ (((lfsr1 << 2)  ^ lfsr1) >> 25);
+--:-:-:-:1      LOP32I.AND lfsr1_1, lfsr1, 0xfffffff8;
+--:-:-:-:1      SHL lfsr1_1, lfsr1_1, 4;
+--:-:-:-:1      SHL lfsr1_2, lfsr1, 2;
+--:-:-:-:1      LOP.XOR lfsr1_2, lfsr1_2, lfsr1;
+--:-:-:-:1      SHR.U32 lfsr1_2, lfsr1_2, 25;
+--:-:-:-:1      LOP.XOR lfsr1, lfsr1_1, lfsr1_2;
+
+// lfsr2 = ((lfsr2 & 0xfffffff0) << 11) ^ (((lfsr2 << 3)  ^ lfsr2) >> 11);
+--:-:-:-:1      LOP32I.AND lfsr2_1, lfsr2, 0xfffffff0;
+--:-:-:-:1      SHL lfsr2_1, lfsr2_1, 11;
+--:-:-:-:1      SHL lfsr2_2, lfsr2, 3;
+--:-:-:-:1      LOP.XOR lfsr2_2, lfsr2_2, lfsr2;
+--:-:-:-:1      SHR.U32 lfsr2_2, lfsr2_2, 11;
+--:-:-:-:1      LOP.XOR lfsr2, lfsr2_1, lfsr2_2;
+
+// rand = lfsr0 ^ lfsr1 ^ lfsr2;
+// generate 3 other rotations of this rand
+--:-:-:-:1      LOP3.LUT  rand0, lfsr0, lfsr1, lfsr2, 0x96;
+--:-:-:-:1      SHF.R.U64 rand1, rand0,  8, rand0;
+--:-:-:-:1      SHF.R.U64 rand2, rand0, 16, rand0;
+--:-:-:-:0      SHF.R.U64 rand3, rand0, 24, rand0;
+//--:-:-:-:1      MOV32I rand0, 0x80000000;
+//--:-:-:-:1      MOV32I rand1, 0x80000000;
+//--:-:-:-:1      MOV32I rand2, 0x80000000;
+//--:-:-:-:1      MOV32I rand3, 0x80000000;
+</SCHEDULE_BLOCK>
+
+// Convert rand to float
+--:-:1:-:4      I2F.F32.U32.RZ rand0, rand0;
+--:-:2:-:4      I2F.F32.U32.RZ rand1, rand1;
+--:-:3:-:4      I2F.F32.U32.RZ rand2, rand2;
+--:-:4:-:1      I2F.F32.U32.RZ rand3, rand3;
+
+// Scale the random number so msb is one below lsb of fp16
+// Add scaled random to number to round
+01:-:-:-:1      FFMA.RZ c0, rand0, exp0, c0;
+02:-:-:-:1      FFMA.RZ c1, rand1, exp1, c1;
+04:-:-:-:1      FFMA.RZ c2, rand2, exp2, c2;
+08:-:-:-:0      FFMA.RZ c3, rand3, exp3, c3;
+
+// Truncate number to fp16
+--:-:1:-:4      F2F.F16.F32.RZ c0, c0;
+--:-:2:-:4      F2F.F16.F32.RZ c1, c1;
+--:-:3:-:4      F2F.F16.F32.RZ c2, c2;
+--:-:4:-:1      F2F.F16.F32.RZ c3, c3;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/hgemm_tn_128x32.sass b/Kernel/SGEMM/Maxwell/hgemm_tn_128x32.sass
new file mode 100644
index 0000000..239d5d3
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_tn_128x32.sass
@@ -0,0 +1,553 @@
+# Kernel: hgemm_tn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+// Shared-memory sizes and offsets, then the kernel parameters as constant bank 0 slots. The addr_zero location is cleared with RZ during setup and read back to zero the czero registers.
+<CONSTANT_MAPPING>
+    addr_zero : 4x<128*16*2 + 32*16*2>
+    szShareA  : 128*16
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+// Register allocation. Several numeric ranges are listed more than once (32-79 for setup temporaries, 32-79 again for the j0-j3 operands, 32-47 for the C-output names), so those names share physical registers. Presumably their live ranges never overlap -- TODO confirm.
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+// Fetch thread and block ids. Each S2R sets its own barrier (1-4) and the first consumer of each value below carries the matching wait mask (01, 02, 04, 08).
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 4; // lda = param_lda8 / 16
+--:-:-:-:1      SHR.U32 ldb, ldb, 4; // ldb = param_ldb8 / 16
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL lda16, lda, 5; // lda * 32, the step added to every A track pointer per tile
+--:-:-:-:1      SHL ldb16, ldb, 5; // ldb * 32, the step added to trackB per tile
+--:-:-:-:1      SHL lda4,  lda, 2; // lda * 4, the spacing between ta0, ta1, ta2 and ta3
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;  # eight 128-bit loads of the zeroed location: czero00, czero04, ... czero28
+</CODE>
+
+// tidAX = (tid & 31) << 2
+// tidAY = (tid >> 5)
+01:-:-:-:1      LOP.AND tidAX, tid,   31;
+--:-:-:-:1      SHL     tidAX, tidAX, 2;
+--:-:-:-:1      SHR.U32 tidAY, tid,   5;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// track<n>A = param_A + (blkA*128 + tidAX + lda*tidAY + ldaz*blkZ + n*lda4) * 2   (LEA shift of 1, n = 0..3)
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO2 ta0, lda,  tidAY, txa;
+08:-:-:-:1      XMAD.LO2 ta0, ldaz, blkZ,  ta0;
+--:-:-:-:1      IADD     ta1, ta0, lda4;
+--:-:-:-:1      IADD     ta2, ta1, lda4;
+--:-:-:-:1      IADD     ta3, ta2, lda4;
+
+--:-:-:-:1      LEA      track0A0.CC, ta0, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta0, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track1A0.CC, ta1, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track1A1,    ta1, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track2A0.CC, ta2, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track2A1,    ta2, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track3A0.CC, ta3, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track3A1,    ta3, param_A[1], RZ, 1;
+
+// trackB = param_B + (blkB*32 + ldb*tidBY + tidBX + ldbz*blkZ) * 2   (LEA shift of 1)
+04:-:-:-:1      ISCADD   txb, blkB, tidBX,  5;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 1;
+
+// writeAs = (tidAY*128 + tidAX) * 4 + 4x<szShareA + szShareB>
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+// Bounds-checked tile load. The loop tail branches back to this label (BRA.U REMAINDER) when P1 is set.
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+// Row indices of the three extra A rows handled by this thread: tidAY + 4, + 8, + 12.
+--:-:-:-:1      IADD tidAY1, tidAY, 4;
+--:-:-:-:1      IADD tidAY2, tidAY, 8;
+--:-:-:-:1      IADD tidAY3, tidAY, 12;
+
+<CODE>
+    our $vec;  # true selects the 64-bit vector loads, false the per-element 16-bit loads
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY,  k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.64 load0A, [track0A];
+--:-:2:-:1  @P1 LDG.E.CI.64 load1A, [track1A];
+--:-:3:-:1  @P2 LDG.E.CI.64 load2A, [track2A];
+--:-:4:-:1  @P3 LDG.E.CI.64 load3A, [track3A];
+--:-:5:-:1  @P4 LDG.E.CI.64 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P0 LDS.U.64 load0A, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.64 load1A, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.64 load2A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.64 load3A, [addr_zero];
+--:-:6:-:2 @!P4 LDS.U.64 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD txa1,  txa,  1;
+--:-:-:-:1      IADD txa2,  txa,  2;
+--:-:-:-:1      IADD txa3,  txa,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:1:-:1  @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:1:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:1:-:1  @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY1, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY2, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P6;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];
+--:-:3:-:1  @P1 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY3, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];
+--:-:4:-:1  @P1 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];
+--:-:4:-:1  @P2 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];
+--:-:5:-:1  @P1 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];
+--:-:5:-:1  @P2 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+    };
+</CODE>
+// Loop predicates: P2, P3, P5 (A tracks) and P6 (B track) stay true only if k >= 32 and the in-range test left in P5 / P6 by either variant above held.
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 15;
+
+</SCHEDULE_BLOCK>
+// Convert the loaded fp16 values to fp32 in place. In the vector variant each loaded register holds two halves (.H0 and .H1), so the highest destination is written first and each source register is overwritten only after both halves were read.
+<CODE>
+    our $vec;  # true selects the packed-half (.H0/.H1) conversions, false the one-value-per-register form
+    return $vec ? q{
+21:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:1:-:1      F2F.F32.F16 load0A0, load0A0.H0;
+
+02:-:-:-:1      F2F.F32.F16 load1A3, load1A1.H1;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A1.H0;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A0.H1;
+--:-:2:-:1      F2F.F32.F16 load1A0, load1A0.H0;
+
+04:-:-:-:1      F2F.F32.F16 load2A3, load2A1.H1;
+--:-:-:-:1      F2F.F32.F16 load2A2, load2A1.H0;
+--:-:-:-:1      F2F.F32.F16 load2A1, load2A0.H1;
+--:-:3:-:1      F2F.F32.F16 load2A0, load2A0.H0;
+
+08:-:-:-:1      F2F.F32.F16 load3A3, load3A1.H1;
+--:-:-:-:1      F2F.F32.F16 load3A2, load3A1.H0;
+--:-:-:-:1      F2F.F32.F16 load3A1, load3A0.H1;
+--:-:4:-:1      F2F.F32.F16 load3A0, load3A0.H0;
+
+10:-:-:-:1      F2F.F32.F16 loadB3, loadB1.H1;
+--:-:-:-:1      F2F.F32.F16 loadB2, loadB1.H0;
+--:-:-:-:1      F2F.F32.F16 loadB1, loadB0.H1;
+--:-:5:-:1      F2F.F32.F16 loadB0, loadB0.H0;
+    } : q{
+21:-:-:-:1      F2F.F32.F16 load0A0, load0A0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:1:-:1      F2F.F32.F16 load0A3, load0A3;
+
+02:-:-:-:1      F2F.F32.F16 load1A0, load1A0;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A1;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A2;
+--:-:2:-:1      F2F.F32.F16 load1A3, load1A3;
+
+04:-:-:-:1      F2F.F32.F16 load2A0, load2A0;
+--:-:-:-:1      F2F.F32.F16 load2A1, load2A1;
+--:-:-:-:1      F2F.F32.F16 load2A2, load2A2;
+--:-:3:-:1      F2F.F32.F16 load2A3, load2A3;
+
+08:-:-:-:1      F2F.F32.F16 load3A0, load3A0;
+--:-:-:-:1      F2F.F32.F16 load3A1, load3A1;
+--:-:-:-:1      F2F.F32.F16 load3A2, load3A2;
+--:-:4:-:1      F2F.F32.F16 load3A3, load3A3;
+
+10:-:-:-:1      F2F.F32.F16 loadB0, loadB0;
+--:-:-:-:1      F2F.F32.F16 loadB1, loadB1;
+--:-:-:-:1      F2F.F32.F16 loadB2, loadB2;
+--:-:5:-:1      F2F.F32.F16 loadB3, loadB3;
+    };
+</CODE>
+// Finish bDoRemainder: P1 = (k & 15) != 0 && k > 16.
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 16, P1;
+// Store each converted group to shared memory, then advance its 64-bit global pointer by lda16 (low word with .CC, high word with .X).
+01:-:-:-:1      STS.128 [writeAs + 4x<0*128>], load0A;
+--:-:-:-:6      IADD   track0A0.CC, track0A0, lda16;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+02:-:-:-:1      STS.128 [writeAs + 4x<4*128>], load1A;
+--:-:-:-:6      IADD   track1A0.CC, track1A0, lda16;
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS.128 [writeAs + 4x<8*128>], load2A;
+--:-:-:-:6      IADD   track2A0.CC, track2A0, lda16;
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS.128 [writeAs + 4x<12*128>], load3A;
+--:-:-:-:6      IADD   track3A0.CC, track3A0, lda16;
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+10:-:-:-:1      STS.128 [writeBs], loadB;
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16;
+// Flip the double buffer: read pointers move by -swapBuf, write pointers by +swapBuf, then swapBuf is negated.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+// High word of the trackB advance whose low word (.CC) was issued before the buffer flip.
+--:-:-:-:0      IADD.X trackB1,    trackB1, RZ;
+// Prefetch the next tile. A loads are gated by P5 and the B load by P6, both set earlier to (k >= 32 and column in range).
+<CODE>
+    our $vec;  # true selects the 64-bit vector loads, false the per-element 16-bit loads
+    return $vec ? q{
+--:-:3:-:1  @P5 LDG.E.CI.64 load0A, [track0A];
+--:-:4:-:1  @P5 LDG.E.CI.64 load1A, [track1A];
+--:-:5:-:1  @P5 LDG.E.CI.64 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.CI.64 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.CI.64 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P5 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:3:-:1  @P5 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:3:-:1  @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:3:-:1  @P5 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+
+--:-:4:-:1  @P5 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:4:-:1  @P5 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:4:-:1  @P5 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:4:-:1  @P5 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];
+    };
+</CODE>
+// Build the insert table: SASS text keyed by jNcM names. This section emits nothing itself (it returns an empty string). The table is presumably consumed by the included hgemm_common_128x32.sass main loop -- TODO confirm.
+<CODE>
+    our $vec;
+    our $shiftAX = 0;
+    our $shiftBX = 0;
+    our %insert =
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+        # Shared-memory stores of the prefetched tile, predicated on P0 (k >= 16 after the decrement above).
+        j3c6   => "04:3:-:-:1  \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n",
+        j5c6   => "08:4:-:-:1  \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n",
+        j7c6   => "10:-:-:-:1  \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n",
+        j9c6   => "10:5:-:-:1  \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n",
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",
+        # Advance the global track pointers (low word with .CC, high word with .X), gated by the per-track load predicates.
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, lda16;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, lda16;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P5 IADD   track2A0.CC, track2A0, lda16;\n",
+        j7c13  => "--:-:-:-:1  \@P5 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, lda16;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+        # Refresh the load predicates for the following iteration (k >= 32). P5 covers both track2A and track3A.
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+        # Barrier, then flip the shared-memory double buffer (all predicated on P0).
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Next-tile global loads and fp16 to fp32 conversions: vector flavour first, scalar flavour second.
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.64 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.64 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.64 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.64 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.64 loadB,  [trackB];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1A3, load1A1.H1;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A2, load1A1.H0;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A1, load1A0.H1;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1A0, load1A0.H0;\n",
+
+                j6c13  => "10:-:-:-:1  \@P5 F2F.F32.F16 load2A3, load2A1.H1;\n",
+                j6c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2A2, load2A1.H0;\n",
+                j6c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2A1, load2A0.H1;\n",
+                j6c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load2A0, load2A0.H0;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A3, load3A1.H1;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A2, load3A1.H0;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A1, load3A0.H1;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3A0, load3A0.H0;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n",
+            ) :
+            (
+
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0A0, load0A0;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A1, load0A1;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A2, load0A2;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0A3, load0A3;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1A0, load1A0;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A1, load1A1;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A2, load1A2;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1A3, load1A3;\n",
+
+                j6c13  => "10:-:-:-:1  \@P5 F2F.F32.F16 load2A0, load2A0;\n",
+                j6c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2A1, load2A1;\n",
+                j6c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2A2, load2A2;\n",
+                j6c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load2A3, load2A3;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A0, load3A0;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A1, load3A1;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A2, load3A2;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3A3, load3A3;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadB3, loadB3;\n",
+                j10c17 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadB2, loadB2;\n",  # NOTE(review): barrier 6 is set here and again at j10c25, while every sibling group sets its barrier only on the last conversion -- confirm this is intended
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadB0, loadB0;\n",
+            )
+        ),
+        # Loop back while P0 holds, otherwise take the remainder pass when P1 is set.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="hgemm_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/hgemm_tn_128x64.sass b/Kernel/SGEMM/Maxwell/hgemm_tn_128x64.sass
new file mode 100644
index 0000000..0404ab5
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/hgemm_tn_128x64.sass
@@ -0,0 +1,389 @@
+# Kernel: hgemm_tn_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+// Shared-memory zero location, then the kernel parameters as constant bank 0 slots. The addr_zero location is cleared with RZ during setup and read back to zero the czero registers.
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*2 + 64*8*2 + 0>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+// Register allocation. Registers 64-95 are listed for setup temporaries, for the j0/j1 operands and again for the C-output names, and 96-107 for both the load registers and output-stage names, so those names share physical registers. Presumably their live ranges never overlap -- TODO confirm.
+<REGISTER_MAPPING>
+
+    64-95   ~ tid, blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, ta, tb, tid1, tid15, tidX, x<1-3|65-67>, y<1-3>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+     96-107 : loadA<0-7>,  loadB<0-3>
+    108-111 : trackA<0-1>, trackB<0-1>
+
+    112-122 ~ writeAs, writeBs, k, txa00, txa64, txb, tidY, swapBuf
+    123-127 : readAs, readBs
+
+    64-83   ~ ldc, ldcz, ci, xmad_c, threadId, tid31, tid96, blockA, blockB, blockZ
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C04y<0-1>, C08y<0-1>, C12y<0-1>, C00y<0-1>
+    86-107  ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+// Fetch thread and block ids. Each S2R sets its own barrier (1-4) and the first consumer of each value below carries the matching wait mask (01, 02, 04, 08).
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;  # sixteen 128-bit loads of the zeroed location: czero00, czero04, ... czero60
+</CODE>
+
+// tidX = (tid & 15) << 2
+// tidY = (tid >> 4) & 7
+01:-:-:-:1      LOP.AND tid15, tid,  15;
+--:-:-:-:1      SHL     tidX, tid15, 2;
+--:-:-:-:1      BFE.U32 tidY,  tid,  0x304; // 3 bits at position 4
+
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 4; // lda = param_lda8 / 16
+--:-:-:-:1      SHR.U32 ldb, ldb, 4; // ldb = param_ldb8 / 16
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// trackA = param_A + (blkA*128 + lda*tidY + tidX + ldaz*blkZ) * 2
+02:-:-:-:1      ISCADD   txa00, blkA, tidX, 7;
+--:-:-:-:1      XMAD.LO2 ta,  lda,  tidY, txa00;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x1;
+--:-:-:-:1      IADD txa64, txa00, 64;
+
+// trackB = param_B + (blkB*64 + ldb*tidY + tidX + ldbz*blkZ) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 6;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x1;
+
+// Start the write buffers high
+// writeAs = (128*tidY + tidX) * 4
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2;
+// writeBs = (64*tidY + tidX) * 4
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2;
+
+// Start the read buffers low
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<64*8 + 128*8>;
+</SCHEDULE_BLOCK>
+// Bounds-checked tile load. P4, P5 and P6 hold the column-in-range tests for the two A halves (txa00, txa64) and for B (txb).
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+
+<CODE>
+    our $vec;  # true selects the 64-bit vector loads, false the per-element 16-bit loads
+    return $vec ? q{
+// doLoad = tidY < k && txa00|txb < n|m
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidY, k, P6;
+
+<ORDERED>
+--:-:2:-:1  @P4 LDG.E.CI.64 loadA0, [trackA + 2x<00>];
+--:-:3:-:1  @P5 LDG.E.CI.64 loadA4, [trackA + 2x<64>];
+--:-:4:-:1  @P6 LDG.E.CI.64 loadB0, [trackB];
+
+--:-:5:-:1 @!P4 LDS.U.64 loadA0, [addr_zero];
+--:-:5:-:1 @!P5 LDS.U.64 loadA4, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.64 loadB0, [addr_zero];
+</ORDERED>
+
+    } : q{
+// doLoadA = tidY < k && txa00 < m
+// doLoadB = tidY < k && txb < n
+--:-:-:-:1      IADD x1,  txa00, 1;
+--:-:-:-:1      IADD x2,  txa00, 2;
+--:-:-:-:1      IADD x3,  txa00, 3;
+--:-:-:-:1      IADD x65, txa64, 1;
+--:-:-:-:1      IADD x66, txa64, 2;
+--:-:-:-:1      IADD x67, txa64, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_m, P0;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<00 + 0>];
+--:-:2:-:1  @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<00 + 1>];
+--:-:2:-:1  @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<00 + 2>];
+--:-:2:-:1  @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<00 + 3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x65, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x66, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x67, param_m, P0;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI.S16 loadA4, [trackA + 2x<00 + 64>];
+--:-:3:-:1  @P1 LDG.E.CI.S16 loadA5, [trackA + 2x<00 + 65>];
+--:-:3:-:1  @P2 LDG.E.CI.S16 loadA6, [trackA + 2x<00 + 66>];
+--:-:3:-:1  @P3 LDG.E.CI.S16 loadA7, [trackA + 2x<00 + 67>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA4, RZ;
+--:-:-:-:1 @!P1 MOV loadA5, RZ;
+--:-:-:-:1 @!P2 MOV loadA6, RZ;
+--:-:-:-:1 @!P3 MOV loadA7, RZ;
+
+--:-:-:-:1      IADD y1, txb, 1;
+--:-:-:-:1      IADD y2, txb, 2;
+--:-:-:-:1      IADD y3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, y1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y3, param_n, P0;
+
+--:-:4:-:1  @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>];
+--:-:4:-:1  @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>];
+--:-:4:-:1  @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>];
+--:-:4:-:1  @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+    };
+</CODE>
+
+</SCHEDULE_BLOCK>
+// Convert the loaded fp16 values to fp32, store them to shared memory, and advance trackA / trackB by param_lda8 / param_ldb8. P1 (do-remainder) is also computed here: k & 7 && k > 8 in the vector variant, k > 8 in the scalar one.
+<CODE>
+    our $vec;  # true selects the packed-half (.H0/.H1) conversions, false the one-value-per-register form
+    return $vec ? q{
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 7;
+
+12:-:-:-:4      F2F.F32.F16 loadA3, loadA1.H1;
+--:-:-:-:4      F2F.F32.F16 loadA2, loadA1.H0;
+--:-:-:-:4      F2F.F32.F16 loadA1, loadA0.H1;
+--:-:2:-:4      F2F.F32.F16 loadA0, loadA0.H0;
+
+04:-:-:-:4      F2F.F32.F16 loadA7, loadA5.H1;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, param_lda8;
+--:-:-:-:4      F2F.F32.F16 loadA6, loadA5.H0;
+--:-:-:-:4      F2F.F32.F16 loadA5, loadA4.H1;
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+--:-:3:-:1      F2F.F32.F16 loadA4, loadA4.H0;
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, P1;
+
+02:-:-:-:1      STS.128 [writeAs + 4x<00>], loadA0;
+04:-:-:-:1      STS.128 [writeAs + 4x<64>], loadA4;
+
+28:-:-:-:4      F2F.F32.F16 loadB3, loadB1.H1;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      F2F.F32.F16 loadB2, loadB1.H0;
+--:-:-:-:4      F2F.F32.F16 loadB1, loadB0.H1;
+--:-:2:-:2      F2F.F32.F16 loadB0, loadB0.H0;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+02:-:-:-:1      STS.128 [writeBs], loadB0;
+
+    // scalar loads
+    } : q{
+// bDoRemainder = k > 8
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+02:-:-:-:4      F2F.F32.F16 loadA0, loadA0;
+--:-:-:-:4      F2F.F32.F16 loadA1, loadA1;
+--:-:-:-:4      F2F.F32.F16 loadA2, loadA2;
+--:-:2:-:4      F2F.F32.F16 loadA3, loadA3;
+
+04:-:-:-:4      F2F.F32.F16 loadA4, loadA4;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, param_lda8;
+--:-:-:-:4      F2F.F32.F16 loadA5, loadA5;
+--:-:-:-:4      F2F.F32.F16 loadA6, loadA6;
+--:-:3:-:1      F2F.F32.F16 loadA7, loadA7;
+
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+02:-:-:-:1      STS.128 [writeAs + 4x<00>], loadA0;
+04:-:-:-:1      STS.128 [writeAs + 4x<64>], loadA4;
+
+08:-:-:-:4      F2F.F32.F16 loadB0, loadB0;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      F2F.F32.F16 loadB1, loadB1;
+--:-:-:-:4      F2F.F32.F16 loadB2, loadB2;
+--:-:2:-:2      F2F.F32.F16 loadB3, loadB3;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+02:-:-:-:1      STS.128 [writeBs], loadB0;
+
+    };
+</CODE>
+
+// Swap the read/write offsets: the read pointers step by -swapBuf and the
+// write pointers by +swapBuf, then swapBuf itself is negated so the next
+// swap goes the other way. BAR.SYNC 0 sits between the two pointer updates.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+
+<CODE>
+    # Fills the package variables @top (a list of instruction lines) and
+    # %insert (instruction text keyed by "j<J>c<C>") and returns nothing.
+    # NOTE(review): the consumer is the included hgemm_common_128x64.sass,
+    # which is not visible here; presumably, as in the sgemm_common_* loops,
+    # @top is emitted ahead of the loop body and each %insert entry right
+    # after FFMA number C of unroll step J -- confirm against that file.
+    our $vec;
+    # Threshold used when re-evaluating P4/P5/P6/P0 as "k >= $k_end";
+    # k itself is decremented by 8 at j0c7, after those tests.
+    my $k_end = $vec ? 16 : 24;
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P4, PT, k, $k_end, P4;\n");
+
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, $k_end, P5;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, $k_end, P6;\n",
+        j0c5  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        j0c7  => "--:-:-:-:1      IADD32I k, k, -8;\n",
+
+        ($vec ?
+            (
+        # $vec set: one 64-bit load per operand group (predicated on P4/P5/P6),
+        # later unpacked through .H0/.H1 into four fp32 registers each.
+        j0c8  => "--:-:2:-:1  \@P4 LDG.E.CI.64 loadA0, [trackA + 2x<00>];\n",
+        j0c11 => "--:-:3:-:1  \@P5 LDG.E.CI.64 loadA4, [trackA + 2x<64>];\n",
+        j0c14 => "--:-:4:-:1  \@P6 LDG.E.CI.64 loadB0, [trackB];\n",
+
+        j4c3  => "02:-:-:-:1  \@P4 F2F.F32.F16 loadA3, loadA1.H1;\n",
+        j4c7  => "--:-:-:-:1  \@P4 F2F.F32.F16 loadA2, loadA1.H0;\n",
+        j4c11 => "--:-:-:-:1  \@P4 F2F.F32.F16 loadA1, loadA0.H1;\n",
+        j4c15 => "--:-:2:-:1  \@P4 F2F.F32.F16 loadA0, loadA0.H0;\n",
+
+        j5c3  => "04:-:-:-:1  \@P5 F2F.F32.F16 loadA7, loadA5.H1;\n",
+        j5c7  => "--:-:-:-:1  \@P5 F2F.F32.F16 loadA6, loadA5.H0;\n",
+        j5c11 => "--:-:-:-:1  \@P5 F2F.F32.F16 loadA5, loadA4.H1;\n",
+        j5c15 => "--:-:3:-:1  \@P5 F2F.F32.F16 loadA4, loadA4.H0;\n",
+
+        j6c3  => "08:-:-:-:1  \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n",
+        j6c7  => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n",
+        j6c11 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n",
+        j6c15 => "--:-:4:-:1  \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n",
+            ) :
+            (
+        # $vec not set: four separate S16 loads per operand group, each
+        # register converted in place afterwards.
+        j0c10 => "--:-:2:-:1  \@P4 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P4 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P4 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P4 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n",
+
+        j0c33 => "--:-:3:-:1  \@P5 LDG.E.CI.S16 loadA4, [trackA + 2x<64>];\n",
+        j0c35 => "--:-:3:-:1  \@P5 LDG.E.CI.S16 loadA5, [trackA + 2x<65>];\n",
+        j0c37 => "--:-:3:-:1  \@P5 LDG.E.CI.S16 loadA6, [trackA + 2x<66>];\n",
+        j0c39 => "--:-:3:-:1  \@P5 LDG.E.CI.S16 loadA7, [trackA + 2x<67>];\n",
+
+        j1c10 => "--:-:4:-:1  \@P6 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n",
+        j1c12 => "--:-:4:-:1  \@P6 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n",
+        j1c14 => "--:-:4:-:1  \@P6 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n",
+        j1c16 => "--:-:4:-:1  \@P6 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n",
+
+        j4c3  => "02:-:-:-:1  \@P4 F2F.F32.F16 loadA0, loadA0;\n",
+        j4c7  => "--:-:-:-:1  \@P4 F2F.F32.F16 loadA1, loadA1;\n",
+        j4c11 => "--:-:-:-:1  \@P4 F2F.F32.F16 loadA2, loadA2;\n",
+        j4c15 => "--:-:2:-:1  \@P4 F2F.F32.F16 loadA3, loadA3;\n",
+
+        j5c3  => "04:-:-:-:1  \@P5 F2F.F32.F16 loadA4, loadA4;\n",
+        j5c7  => "--:-:-:-:1  \@P5 F2F.F32.F16 loadA5, loadA5;\n",
+        j5c11 => "--:-:-:-:1  \@P5 F2F.F32.F16 loadA6, loadA6;\n",
+        j5c15 => "--:-:3:-:1  \@P5 F2F.F32.F16 loadA7, loadA7;\n",
+
+        j6c3  => "08:-:-:-:1  \@P6 F2F.F32.F16 loadB0, loadB0;\n",
+        j6c7  => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB1;\n",
+        j6c11 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB2, loadB2;\n",
+        j6c15 => "--:-:4:-:1  \@P6 F2F.F32.F16 loadB3, loadB3;\n",
+            )
+        ),
+
+        # Shared by both variants and all predicated on P0: the STS.128 stores,
+        # the trackA/trackB advance, the barrier plus read/write offset swap,
+        # and finally the branch back to LOOP (P0) or on to REMAINDER (P1).
+        j4c31 => "02:-:-:-:1  \@P0 STS.128 [writeAs + 4x<00>], loadA0;\n",
+        j5c31 => "04:-:-:-:1  \@P0 STS.128 [writeAs + 4x<64>], loadA4;\n",
+
+        j5c46 => "--:-:-:-:1  \@P0 IADD   trackA0.CC, trackA0, param_lda8;\n",
+        j5c54 => "--:-:-:-:1  \@P0 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c31 => "08:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j6c46 => "--:-:-:-:1  \@P0 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j6c54 => "--:-:-:-:1  \@P0 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j6c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+    );
+    return;
+</CODE>
+
+<INCLUDE file="hgemm_common_128x64.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/sgemm_common_128x128.sass b/Kernel/SGEMM/Maxwell/sgemm_common_128x128.sass
new file mode 100644
index 0000000..703af8f
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_common_128x128.sass
@@ -0,0 +1,309 @@
+# sgemm_common_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+// Load the j0 operand registers (j0Ay0/j0Ay4 from readAs, j0Bx0/j0Bx4 from
+// readBs, row 0) before entering the loop; the loop body issues the matching
+// loads for every following step itself.
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*128 + 64>];
+
+LOOP:
+
+<CODE>
+
+    # Generates the unrolled loop body: 8 steps (j) of 64 FFMAs (c) each.
+    # @top is emitted once ahead of the first FFMA, and the text stored in
+    # $insert{"j<j>c<c>"} (if any) is emitted right after FFMA c of step j.
+    # Returns the generated instruction text.
+    our @top;
+    our %insert;
+
+    # @cOrder gives the (x,y) accumulator updated by each of the 64 FFMAs:
+    # x advances in pairs, y sweeps (0,1,4,5) forwards and then backwards,
+    # and @swirl supplies the four (dx,dy) offsets visited at each (x,y).
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 7)
+    {
+        # Step j computes from the j<odd> registers while the loads below
+        # refill the j<nOdd> registers from shared row (j+1) % 8. In the last
+        # step those loads are predicated on P0.
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            # The first FFMA of every step waits on mask 01.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            # Only the first line of a (possibly multi-line) insert decides the
+            # stall count. Most slots have no insert at all, in which case there
+            # is no first line: treat that explicitly as "no match" instead of
+            # running the pattern against undef.
+            my ($firstIns) = split "\n", $ins;
+            my $stall  = (defined $firstIns && $firstIns =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/) ? 0 : 1;
+
+            # The yield flag is set on FFMA 32 only, and only if it stalls.
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+// Reached when the loop above falls through. Count one more outer iteration,
+// step the ta/tb offsets by param_ldaz/param_ldbz, reload k from param_k and
+// rebuild the 64-bit trackA/trackB addresses as param_A/param_B + (ta/tb << 2).
+// P1 = loop < param_loops; while it holds, branch to REMAINDER (that label is
+// not defined in this file).
+--:-:-:-:1      IADD loop, loop, 1;
+--:-:-:-:1      IADD ta, ta, param_ldaz;
+--:-:-:-:1      IADD tb, tb, param_ldbz;
+--:-:-:-:3      MOV  k, param_k;
+--:-:-:-:1      ISETP.LT.AND P1, PT, loop, param_loops, PT;
+--:-:-:-:6      LEA      trackA0.CC, ta, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 2;
+--:-:-:-:6      LEA      trackB0.CC, tb, param_B[0],     2;
+--:-:-:-:0      LEA.HI.X trackB1,    tb, param_B[1], RZ, 2;
+--:-:-:Y:5  @P1 BRA.U REMAINDER;
+
+// Read the three CTA indices. The first instructions below that consume
+// blockA, blockB and blockZ carry wait masks 01, 02 and 04 respectively.
+--:-:1:-:1      S2R blockA, SR_CTAID.Y;
+--:-:2:-:1      S2R blockB, SR_CTAID.Z;
+--:-:3:-:1      S2R blockZ, SR_CTAID.X;
+<SCHEDULE_BLOCK>
+
+// Output-stage setup: derive the writeCs/readCs offsets, this thread's
+// output column (cx00, cx64) and first row (cy00), the address of C for that
+// position, the row strides, and the alpha/beta/flags values used by STORE_C.
+--:-:-:-:1      LOP.AND tid_31,  tid, 31;
+--:-:-:-:1      LOP.AND tid_96,  tid, 96;
+--:-:-:-:1      LOP.AND tid_128, tid, 128;
+
+// writeCs = (readAs / 4) * 128 + readBs;
+// (both read offsets are first masked to their low 12 bits)
+--:-:-:-:1      LOP.AND readAs, readAs, 0xfff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0xfff;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 5;
+
+// cx = tid_31 | (tid_128 >> 2);
+--:-:-:-:1      SHR.U32  cx00, tid_128, 2;
+--:-:-:-:1      LOP.OR   cx00, tid_31,  cx00;
+
+// readCs = ((tid_96 << 4) | cx) << 2;
+--:-:-:-:1      SHL      readCs, tid_96,  4;
+--:-:-:-:1      LOP.OR   readCs, readCs, cx00;
+--:-:-:-:1      SHL      readCs, readCs, 2;
+
+// cx += blockB*128;
+// cx64 is the second column handled by this thread, 64 to the right.
+02:-:-:-:1      ISCADD  cx00, blockB, cx00, 7;
+--:-:-:-:1      IADD    cx64, cx00, 64;
+
+// cy = blockA*128 + (tid_96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid_96, 1;
+01:-:-:-:1      ISCADD  cy00, blockA, cy00, 7;
+
+// C += (ldcz*blockZ + ldc*cy + cx00) * 4;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, ldc,  cy00,   cx00, xmad_c;
+04:-:-:-:1      XMAD.LO2 ci, ldcz, blockZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 2;
+
+// Strides: ldc1 = ldc*4, ldc4 = ldc*16, ldc60 = ldc*256 - ldc4 = ldc*240.
+// With the *4 element scaling used above these are 1, 4 and 60 rows of C.
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// P6 = (beta != 0); STORE_C only reads existing C values when this is set.
+--:-:-:-:1      ISETP.NE.AND P6, PT, beta, RZ, PT;
+
+</SCHEDULE_BLOCK>
+
+// Derive the other three row pointers from C00y, each ldc4 (four rows)
+// further on: C04y = C00y + ldc4, C08y = C04y + ldc4, C12y = C08y + ldc4.
+// Each add is 64-bit: low word with .CC, high word with IADD.X.
+// cy04/cy08/cy12 are the matching row indices cy00 + 4/8/12.
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+// NOTE(review): presumably this barrier makes sure every thread is done with
+// the A/B data in shared memory before STORE_C writes through writeCs -- confirm.
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Emits the store driver: for each of the eight y rows, scale the eight
+    # accumulators cx0y<y>..cx7y<y> by alpha into c0..c7 and call STORE_C.
+    # Ahead of row 4 the four C row pointers (C00y/C04y/C08y/C12y) and their
+    # row indices (cy00..cy12) are moved on by ldc60 / 60.
+    my $skipAhead = join '', map {
+        my $tag = $_;
+        "--:-:-:-:5      IADD   C${tag}y0.CC, C${tag}y0, ldc60;\n" .
+        "--:-:-:-:1      IADD   cy${tag},     cy${tag},  60;\n" .
+        "--:-:-:-:1      IADD.X C${tag}y1,    C${tag}y1, RZ;\n"
+    } qw(00 04 08 12);
+    $skipAhead .= "\n";
+
+    my @pieces;
+    for my $row (0 .. 7)
+    {
+        push @pieces, $skipAhead if $row == 4;
+
+        # Every FMUL has stall count 1 except the last one, which has 0.
+        for my $col (0 .. 7)
+        {
+            my $stall = $col == 7 ? 0 : 1;
+            push @pieces, sprintf "--:-:-:-:%d      FMUL c%d, cx%dy%d, alpha;\n", $stall, $col, $col, $row;
+        }
+
+        push @pieces, "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return join '', @pieces;
+
+</CODE>
+
+// All rows have been handed to STORE_C by the generated calls above.
+--:-:-:-:5      EXIT;
+
+// STORE_C: writes the eight values c0..c7 prepared by the caller to C.
+// They are staged through writeCs/readCs and come back as four rows
+// (cy00, cy04, cy08, cy12) of two columns each (cx00 and cx64). When
+// beta != 0 the existing C values are read and blended in as d*beta + c.
+// On return the four C row pointers and cy indices are one row further on.
+STORE_C:
+
+<SCHEDULE_BLOCK>
+// Load predicates: column in range AND beta != 0 (P6 holds beta != 0 on entry),
+// then row in range for cy00 / cy04.
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, P6;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+// Existing C values for rows cy00/cy04; d is zeroed where the load is skipped.
+// NOTE(review): these loads address C00y/C04y while every other access in this
+// routine uses C00y0/C04y0 -- presumably two names for the same register pair; confirm.
+--:-:1:-:1  @P0 LDG.E d0, [C00y + 4x<00>];
+--:-:2:-:1  @P1 LDG.E d1, [C00y + 4x<64>];
+--:-:3:-:1  @P2 LDG.E d2, [C04y + 4x<00>];
+--:-:4:-:1  @P3 LDG.E d3, [C04y + 4x<64>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+// Store predicates: the same bounds checks without the beta term. They guard
+// the first group of STG instructions further down.
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+// Row indices move on by one for the next call.
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+
+// Apply relu
+// (P6 is reused here for "flags & 2 is non-zero")
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c4, c4, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c5, c5, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c6, c6, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c7, c7, RZ, !PT;
+
+// beta != 0
+// (restores P6 to the meaning it had on entry)
+--:-:-:-:7      ISETP.NE.AND P6, PT, beta, RZ, PT;
+
+<ORDERED>
+// Stage c0..c7 at writeCs, then read back readCs rows 0 and 1 at columns 0 / 64.
+--:-:-:-:1      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], c4;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*128 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<0*128 + 64>];
+
+--:-:-:-:1      LDS c2, [readCs + 4x<1*128 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<1*128 + 64>];
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+// c = d*beta + c, only when beta != 0.
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:1  @P6 FFMA c3, d3, beta, c3;
+
+<SCHEDULE_BLOCK>
+// Column-in-range AND beta != 0 again, for the cy08/cy12 loads below.
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, P6;
+
+// Write rows cy00/cy04 using the store predicates computed earlier.
+--:-:-:-:1  @P0 STG.E.CG [C00y0 + 4x<00>], c0;
+--:5:-:-:1  @P1 STG.E.CG [C00y0 + 4x<64>], c1;
+--:-:-:-:1  @P2 STG.E.CG [C04y0 + 4x<00>], c2;
+--:6:-:-:1  @P3 STG.E.CG [C04y0 + 4x<64>], c3;
+
+// Same load / zero / store-predicate sequence as above, now for rows cy08/cy12.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E d0, [C08y0 + 4x<00>];
+--:-:2:-:1  @P1 LDG.E d1, [C08y0 + 4x<64>];
+--:-:3:-:1  @P2 LDG.E d2, [C12y0 + 4x<00>];
+--:-:4:-:1  @P3 LDG.E d3, [C12y0 + 4x<64>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, PT;
+
+--:-:-:-:2      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:2      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+</SCHEDULE_BLOCK>
+
+// Move C00y/C04y on by one row (ldc1) once the stores above have been waited
+// on (masks 10 / 20), and bump the cy08/cy12 row indices.
+10:-:-:-:4      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD   cy08, cy08, 1;
+--:-:-:-:1      IADD   cy12, cy12, 1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+20:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:0      IADD.X C04y1,    C04y1, RZ;
+
+// Read back readCs rows 2 and 3, which go to rows cy08/cy12.
+--:-:-:-:1      LDS c0, [readCs + 4x<2*128 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<2*128 + 64>];
+--:-:-:-:1      LDS c2, [readCs + 4x<3*128 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*128 + 64>];
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P6 FFMA c3, d3, beta, c3;
+
+01:-:-:-:1  @P0 STG.E.CG [C08y0 + 4x<00>], c0;
+02:5:-:-:1  @P1 STG.E.CG [C08y0 + 4x<64>], c1;
+04:-:-:-:1  @P2 STG.E.CG [C12y0 + 4x<00>], c2;
+08:6:-:-:1  @P3 STG.E.CG [C12y0 + 4x<64>], c3;
+
+// Move C08y/C12y on by one row as well, then return to the caller.
+10:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+20:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/sgemm_common_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_common_128x32.sass
new file mode 100644
index 0000000..928ad6b
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_common_128x32.sass
@@ -0,0 +1,240 @@
+# sgemm_common_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#    http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+// Load the j0 and j1 operand registers from shared rows 0 and 1 before the
+// loop. Row 0 uses control field 1 and row 1 field 2, matching the
+// alternating "01"/"02" wait generated for the first FFMA of each loop step.
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>];
+
+LOOP:
+
+<CODE>
+
+    # Generates the unrolled loop body: 16 steps (j) of 32 FFMAs (c) each.
+    # @top is emitted once ahead of the first FFMA, and the text stored in
+    # $insert{"j<j>c<c>"} (if any) is emitted right after FFMA c of step j.
+    # $shiftAX / $shiftBX switch on an extra "+ <n>*8" term in the A / B
+    # shared-memory offsets. Returns the generated instruction text.
+    our @top;
+    our %insert;
+    our $shiftAX;
+    our $shiftBX;
+
+    # @cOrder gives the (x,y) accumulator updated by each of the 32 FFMAs:
+    # x advances in pairs, y sweeps (0,1,4,5) forwards and then backwards,
+    # and @swirl supplies the four (dx,dy) offsets visited at each (x,y).
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 15)
+    {
+        # Step j computes from register set j & 3 while the loads below fill
+        # register set (j + 2) & 3 from shared row (j + 2) & 15, i.e. two steps
+        # ahead. The control value alternates 1/2 with the parity of j, and
+        # the loads of the last two steps are predicated on P0.
+        my $barrier   = $j & 1 ? 2 : 1;
+        my $rsPred    = $j >= 14 ? '@P0' : '   ';
+        my $loadReg   = ($j + 2) & 3;
+        my $shareLine = ($j + 2) & 15;
+        my $shiftA    = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB    = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute   = $j & 3;
+
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            # The first FFMA of every step waits on this step's control value.
+            my $wait   = $c == 0 ? "0$barrier" : '--';
+
+            # Only the first line of a (possibly multi-line) insert decides the
+            # stall count. Most slots have no insert at all, in which case there
+            # is no first line: treat that explicitly as "no match" instead of
+            # running the pattern against undef.
+            my ($firstIns) = split "\n", $ins;
+            my $stall  = (defined $firstIns && $firstIns =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/) ? 0 : 1;
+
+            # The yield flag is set on FFMA 16 only, and only if it stalls.
+            my $yield  = $c == 16 && $stall ? 'Y' : '-';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+
+// Output-stage setup: load alpha/beta/flags, derive the writeCs/readCs
+// offsets, this thread's output column (cx) and first row (cy00), the address
+// of C for that position, the STORE_C predicates P4/P5/P6 and the row strides.
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 32 + readBs;
+// (first readBs drops the 4x<szShareA> offset, and both read offsets drop
+// swapBuf when swapBuf > 0)
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 3;
+
+// readCs = (((tid & 96) << 2) + (tid & 31)) << 2;
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 2;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*32 + tid31;
+--:-:-:-:1      ISCADD cx, blkB, tid31, 5;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+--:-:-:-:1      ISCADD  cy00, blkA, cy00, 7;
+
+// C += (ldcz*blkZ + ldc*cy + cx) * 4;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, ldc,  cy00, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 2;
+
+// P4 = apply relu (flags & 2 is non-zero)
+--:-:-:-:0      LOP.AND.NZ   P4, RZ, flags, 2;
+// P6 = cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+// P5 = beta != 0 AND cx < n
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6; 
+
+
+// Strides: ldc1 = ldc*4, ldc4 = ldc*16, ldc60 = ldc*256 - ldc4 = ldc*240.
+// With the *4 element scaling used above these are 1, 4 and 60 rows of C.
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
+</SCHEDULE_BLOCK>
+
+// Derive the other three row pointers from C00y, each ldc4 (four rows)
+// further on, as 64-bit adds (low word with .CC, high word with IADD.X).
+// cy04/cy08/cy12 are the matching row indices cy00 + 4/8/12.
+// d0..d3 are cleared in the gaps between the dependent adds.
+--:-:-:-:4      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      MOV d0, RZ;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:4      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      MOV d1, RZ;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:3      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      MOV d2, RZ;
+--:-:-:-:1      MOV d3, RZ;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+// NOTE(review): presumably this barrier makes sure every thread is done with
+// the A/B data in shared memory before STORE_C writes through writeCs -- confirm.
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Emits the store driver: for each of the eight y rows, scale the four
+    # accumulators cx0y<y>..cx3y<y> by alpha into c0..c3 and call STORE_C.
+    # Ahead of row 4 the four C row pointers (C00y/C04y/C08y/C12y) and their
+    # row indices (cy00..cy12) are moved on by ldc60 / 60.
+    my $skipAhead = join '', map {
+        my $tag = $_;
+        "--:-:-:-:5      IADD   C${tag}y0.CC, C${tag}y0, ldc60;\n" .
+        "--:-:-:-:1      IADD   cy${tag},     cy${tag},  60;\n" .
+        "--:-:-:-:1      IADD.X C${tag}y1,    C${tag}y1, RZ;\n"
+    } qw(00 04 08 12);
+    $skipAhead .= "\n";
+
+    my @pieces;
+    for my $row (0 .. 7)
+    {
+        push @pieces, $skipAhead if $row == 4;
+
+        # Every FMUL has stall count 1 except the last one, which has 0.
+        for my $col (0 .. 3)
+        {
+            my $stall = $col == 3 ? 0 : 1;
+            push @pieces, sprintf "--:-:-:-:%d      FMUL c%d, cx%dy%d, alpha;\n", $stall, $col, $col, $row;
+        }
+
+        push @pieces, "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return join '', @pieces;
+
+</CODE>
+
+// All rows have been handed to STORE_C by the generated calls above.
+--:-:-:-:5      EXIT;
+
+// STORE_C: writes the four values c0..c3 prepared by the caller to C.
+// They are staged through writeCs/readCs and come back as one value for each
+// of the rows cy00, cy04, cy08 and cy12 in column cx. When beta != 0 the
+// existing C values are read and blended in as d*beta + c. On return the four
+// C row pointers and cy indices are one row further on.
+STORE_C:
+
+<SCHEDULE_BLOCK>
+// Load predicates: row in range AND P5 (beta != 0 AND cx < n).
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+// Existing C values; d is zeroed where the load is skipped.
+--:-:1:-:1  @P0 LDG.E d0, [C00y];
+--:-:2:-:1  @P1 LDG.E d1, [C04y];
+--:-:3:-:1  @P2 LDG.E d2, [C08y];
+--:-:4:-:1  @P3 LDG.E d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+// Store predicates: row in range AND P6 (cx < n), without the beta term.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+
+// Row indices move on by one for the next call.
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:3      IADD cy12, cy12, 1;
+
+// Relu, applied when P4 (flags & 2) is set.
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+
+// Stage c0..c3 at writeCs, then read back readCs rows 0..3.
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*32>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*32>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*32>];
+</SCHEDULE_BLOCK>
+
+// c = d*beta + c, only when P5 is set.
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P5 FFMA c3, d3, beta, c3;
+
+--:1:-:-:1  @P0 STG.E.CG [C00y], c0;
+--:2:-:-:1  @P1 STG.E.CG [C04y], c1;
+--:3:-:-:1  @P2 STG.E.CG [C08y], c2;
+--:4:-:-:1  @P3 STG.E.CG [C12y], c3;
+
+// Move each row pointer on by one row (ldc1); every add first waits
+// (masks 01/02/04/08) on the store that used that pointer.
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/sgemm_common_128x64.sass b/Kernel/SGEMM/Maxwell/sgemm_common_128x64.sass
new file mode 100644
index 0000000..ee1705e
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_common_128x64.sass
@@ -0,0 +1,290 @@
+# sgemm_common_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+// Load the j0 operand registers (j0Ay0/j0Ay4 from readAs, j0Bx0/j0Bx4 from
+// readBs, row 0) before entering the loop; the loop body issues the matching
+// loads for every following step itself.
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*64 +  00>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*64 +  32>];
+
+LOOP:
+
+<CODE>
+
+    # Generates the unrolled loop body: 8 steps (j) of 64 FFMAs (c) each.
+    # @top is emitted once ahead of the first FFMA, and the text stored in
+    # $insert{"j<j>c<c>"} (if any) is emitted right after FFMA c of step j.
+    # Returns the generated instruction text.
+    our @top;
+    our %insert;
+
+    # @cOrder gives the (x,y) accumulator updated by each of the 64 FFMAs:
+    # x advances in pairs, y sweeps (0,1,4,5) forwards and then backwards,
+    # and @swirl supplies the four (dx,dy) offsets visited at each (x,y).
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 7)
+    {
+        # Step j computes from the j<odd> registers while the loads below
+        # refill the j<nOdd> registers from shared row (j+1) % 8. In the last
+        # step those loads are predicated on P0.
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 +  00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 +  32>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            # The first FFMA of every step waits on mask 01.
+            my $wait   = $c == 0 ? '01' : '--';
+
+            # Only the first line of a (possibly multi-line) insert decides the
+            # stall count. Most slots have no insert at all, in which case there
+            # is no first line: treat that explicitly as "no match" instead of
+            # running the pattern against undef.
+            my ($firstIns) = split "\n", $ins;
+            my $stall  = (defined $firstIns && $firstIns =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/) ? 0 : 1;
+
+            # The yield flag is set on FFMA 32 only, and only if it stalls.
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+// Read the thread index and the three CTA indices. The first instructions
+// below that consume tid_2, blockA, blockB and blockZ carry wait masks
+// 01, 02, 04 and 08 respectively.
+--:-:1:-:1      S2R tid_2,  SR_TID.X;
+--:-:2:-:1      S2R blockA, SR_CTAID.Y;
+--:-:3:-:1      S2R blockB, SR_CTAID.Z;
+--:-:4:-:1      S2R blockZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+
+// Output-stage setup: load alpha/beta/flags, derive the writeCs/readCs
+// offsets, this thread's output columns (cx00, cx32) and first row (cy00),
+// the address of C for that position, and the row strides.
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 64 + readBs;
+// (both read offsets are first masked to their low 8 bits)
+--:-:-:-:1      LOP.AND readAs, readAs, 0xff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0xff;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 4;
+
+// readCs = (((tid_2 & 96) << 3) + (tid_2 & 31)) << 2;
+01:-:-:-:1      LOP.AND tid31, tid_2, 31;
+01:-:-:-:1      LOP.AND tid96, tid_2, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 3;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx00 = blockB*64 + tid31;
+// cx32 is the second column handled by this thread, 32 to the right.
+04:-:-:-:1      ISCADD cx00, blockB, tid31, 6;
+--:-:-:-:1      IADD   cx32, cx00, 32;
+
+// cy = blockA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+02:-:-:-:1      ISCADD  cy00, blockA, cy00, 7;
+
+// C += (ldcz*blockZ + ldc*cy + cx00) * 4;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, ldc,  cy00,   cx00, xmad_c;
+08:-:-:-:1      XMAD.LO2 ci, ldcz, blockZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 2;
+
+
+--:-:-:-:1      ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0
+
+// Strides: ldc1 = ldc*4, ldc4 = ldc*16, ldc60 = ldc*256 - ldc4 = ldc*240.
+// With the *4 element scaling used above these are 1, 4 and 60 rows of C.
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
+</SCHEDULE_BLOCK>
+
+// Derive the other three row pointers from C00y, each ldc4 (four rows)
+// further on: C04y = C00y + ldc4, C08y = C04y + ldc4, C12y = C08y + ldc4.
+// Each add is 64-bit: low word with .CC, high word with IADD.X.
+// cy04/cy08/cy12 are the matching row indices cy00 + 4/8/12.
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+// NOTE(review): presumably this barrier makes sure every thread is done with
+// the A/B data in shared memory before STORE_C writes through writeCs -- confirm.
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Emits the store driver: for each of the eight y rows, scale the eight
+    # accumulators cx0y<y>..cx7y<y> by alpha into c0..c7 and call STORE_C.
+    # Ahead of row 4 the four C row pointers (C00y/C04y/C08y/C12y) and their
+    # row indices (cy00..cy12) are moved on by ldc60 / 60.
+    my $skipAhead = join '', map {
+        my $tag = $_;
+        "--:-:-:-:5      IADD   C${tag}y0.CC, C${tag}y0, ldc60;\n" .
+        "--:-:-:-:1      IADD   cy${tag},     cy${tag},  60;\n" .
+        "--:-:-:-:1      IADD.X C${tag}y1,    C${tag}y1, RZ;\n"
+    } qw(00 04 08 12);
+    $skipAhead .= "\n";
+
+    my @pieces;
+    for my $row (0 .. 7)
+    {
+        push @pieces, $skipAhead if $row == 4;
+
+        # Every FMUL has stall count 1 except the last one, which has 0.
+        for my $col (0 .. 7)
+        {
+            my $stall = $col == 7 ? 0 : 1;
+            push @pieces, sprintf "--:-:-:-:%d      FMUL c%d, cx%dy%d, alpha;\n", $stall, $col, $col, $row;
+        }
+
+        push @pieces, "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return join '', @pieces;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+
+STORE_C:
+// Entered via CAL with c0-c7 holding one alpha-scaled accumulator row; P6 = (beta != 0).
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, P6;
+// Load predicates: row < param_m, column < param_n and beta != 0.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+// Fetch the existing C values to blend; lanes that do not load get zero.
+--:-:1:-:1  @P0 LDG.E d0, [C00y0 + 4x<00>];
+--:-:2:-:1  @P1 LDG.E d1, [C00y0 + 4x<32>];
+--:-:3:-:1  @P2 LDG.E d2, [C04y0 + 4x<00>];
+--:-:4:-:1  @P3 LDG.E d3, [C04y0 + 4x<32>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Rebuild P0-P3 without the beta term: these gate the stores for rows cy00 / cy04.
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+// Apply relu to c0-c7 when (flags & 2) is non-zero; P6 is borrowed for the test
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c4, c4, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c5, c5, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c6, c6, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c7, c7, RZ, !PT;
+// Restore P6 = (beta != 0) for the blend and for the second set of load predicates.
+--:-:-:-:7      ISETP.NE.AND P6, PT, beta, RZ, PT;
+</SCHEDULE_BLOCK>
+// Pass the row through shared memory: two 128-bit writes, then four scalar reads.
+--:-:-:-:1      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], c4;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<0*64 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<1*64 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<1*64 + 32>];
+// c += beta * d (the loaded C value), skipped when beta == 0.
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:1  @P6 FFMA c3, d3, beta, c3;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1  @P0 STG.E.CS [C00y0 + 4x<00>], c0;
+--:5:-:-:1  @P1 STG.E.CS [C00y0 + 4x<32>], c1;
+--:-:-:-:1  @P2 STG.E.CS [C04y0 + 4x<00>], c2;
+--:6:-:-:1  @P3 STG.E.CS [C04y0 + 4x<32>], c3;
+// Same load predicates and loads as above, now for rows cy08 / cy12.
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, P6;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E d0, [C08y0 + 4x<00>];
+--:-:2:-:1  @P1 LDG.E d1, [C08y0 + 4x<32>];
+--:-:3:-:1  @P2 LDG.E d2, [C12y0 + 4x<00>];
+--:-:4:-:1  @P3 LDG.E d3, [C12y0 + 4x<32>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Store predicates for rows cy08 / cy12 (no beta term).
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+</SCHEDULE_BLOCK>
+// Step all four row counters by one and the C00y / C04y pointers by ldc1 (carry into the high word).
+10:-:-:-:2      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD   cy00, cy00, 1;
+--:-:-:-:1      IADD   cy04, cy04, 1;
+--:-:-:-:1      IADD   cy08, cy08, 1;
+--:-:-:-:1      IADD   cy12, cy12, 1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+20:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:0      IADD.X C04y1,    C04y1, RZ;
+// Second half: read back the remaining four values written to shared memory above.
+--:-:-:-:1      LDS c0, [readCs + 4x<2*64 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<2*64 + 32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<3*64 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*64 + 32>];
+// c += beta * d for rows cy08 / cy12.
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:1  @P6 FFMA c3, d3, beta, c3;
+
+--:-:-:-:1  @P0 STG.E.CS [C08y0 + 4x<00>], c0;
+--:5:-:-:1  @P1 STG.E.CS [C08y0 + 4x<32>], c1;
+--:-:-:-:1  @P2 STG.E.CS [C12y0 + 4x<00>], c2;
+--:6:-:-:1  @P3 STG.E.CS [C12y0 + 4x<32>], c3;
+// Advance the C08y / C12y pointers by ldc1 for the next call.
+10:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+20:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/sgemm_common_32x128.sass b/Kernel/SGEMM/Maxwell/sgemm_common_32x128.sass
new file mode 100644
index 0000000..da4d83d
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_common_32x128.sass
@@ -0,0 +1,234 @@
+# Kernel: sgemm_common_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#    http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*32  + 16 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*32  + 16 + 0*8>];
+
+LOOP:
+
+<CODE>
+    # Emit the unrolled main loop: 16 steps (j) of 32 FFMAs (c). @top goes first;
+    # $insert{"j<j>c<c>"}, when set, is spliced in right after FFMA c of step j.
+    our @top;
+    our %insert;
+    our $shiftAX;
+    our $shiftBX;
+
+    # Accumulator visiting order: the four swirl offsets applied at each
+    # (x, y) base, with the y bases walked in reverse for the second x base.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 15)
+    {
+        my $barrier   = $j & 1 ? 2 : 1;             # alternates 1/2 with the parity of j
+        my $rsPred    = $j >= 14 ? '@P0' : '   ';   # last two steps load only under P0
+        my $loadReg   = ($j + 2) & 3;               # j<n> register set the loads fill
+        my $shareLine = ($j + 2) & 15;              # line index used in the LDS address
+        my $shiftA    = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB    = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute   = $j & 3;                     # j<n> register set the FFMAs consume
+
+        # Loads for step j+2; these keys overwrite any existing entry of the same name.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32  + 16 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            my $ins    = $insert{"j${j}c$c"} || '';
+            my $wait   = $c == 0 ? "0$barrier" : '--';
+
+            # Stall 0 when the first inserted line is one of these ops, else 1.
+            # An empty insert has no first line, so never match against undef.
+            my ($first) = split "\n", $ins;
+            my $stall  = defined($first) && $first =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+            my $yield  = $c == 16 && $stall ? 'Y' : '-';
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+// readBs -= 4x<szShareA>; when swapBuf > 0 (P0) also subtract swapBuf from both read offsets.
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+// Copy the scalar kernel parameters into registers.
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 128 + readBs;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 5;
+
+// readCs = tid * 4;
+--:-:-:-:1      SHL readCs, tid, 2;
+
+// cx = blkB*128 + tid;
+--:-:-:-:1      ISCADD cx, blkB, tid, 7;
+
+// cy = blkA*32
+--:-:-:-:1      SHL cy00, blkA, 5;
+
+// C += (cy*ldc + cx) * 4;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+// ci = cy00*ldc + cx + ldcz*blkZ; C00y = param_C + (ci << 2) as a 64-bit low/high pair.
+--:-:-:-:1      XMAD.LO  ci, cy00, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 2;
+
+// P6 = cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+
+// P5 = (beta != 0) && P6
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+// P4 = apply relu, i.e. (flags & 2) != 0
+--:-:-:-:1      LOP.AND.NZ P4, RZ, flags, 2;
+// Strides: ldc1 = ldc*4, ldc4 = ldc*16, ldc12 = ldc*64 - ldc4 = ldc*48.
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc12, ldc, -ldc4, 6;
+
+</SCHEDULE_BLOCK>
+// Each further row pointer is the previous one plus ldc4; cy04/cy08/cy12 = cy00 + 4/8/12.
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Emit the alpha-scaling epilogue: one STORE_C call per accumulator row.
+    # Text that adds ldc12 to each C row pointer (low word with .CC, carry
+    # into the high word with IADD.X) and 12 to the matching cy counter.
+    my $advance = join '', map {
+        sprintf(
+            "--:-:-:-:5      IADD   C%sy0.CC, C%sy0, ldc12;\n" .
+            "--:-:-:-:1      IADD   cy%s,     cy%s,  12;\n" .
+            "--:-:-:-:1      IADD.X C%sy1,    C%sy1, RZ;\n",
+            ($_) x 6)
+    } qw(00 04 08 12);
+    $advance .= "\n";
+
+    my @text;
+    for my $row (0 .. 7)
+    {
+        # The pointers are bumped once, just before row 4 is emitted.
+        push @text, $advance if $row == 4;
+
+        # c<x> = cx<x>y<row> * alpha; only the last FMUL carries stall 0.
+        push @text, map {
+            sprintf "--:-:-:-:%d      FMUL c%d, cx%dy%d, alpha;\n",
+                ($_ == 3 ? 0 : 1), $_, $_, $row
+        } 0 .. 3;
+
+        push @text, "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return join '', @text;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Entered via CAL with c0-c3 = one alpha-scaled row. P4 = relu flag, P5 = (beta != 0 && cx < n), P6 = (cx < n).
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+// Fetch the existing C values (only when P5 holds and the row is < param_m); other lanes get zero.
+--:-:1:-:1  @P0 LDG.E d0, [C00y];
+--:-:2:-:1  @P1 LDG.E d1, [C04y];
+--:-:3:-:1  @P2 LDG.E d2, [C08y];
+--:-:4:-:1  @P3 LDG.E d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Rebuild P0-P3 from P6 alone (no beta term): these gate the stores.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+// Step the four row counters for the next call.
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:3      IADD cy12, cy12, 1;
+// relu, applied only when P4 is set.
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+// Pass the row through shared memory: one 128-bit write, then four scalar reads at 4x<n*128> offsets.
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*128>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*128>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*128>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*128>];
+</SCHEDULE_BLOCK>
+// c += beta * d (the loaded C value), skipped when P5 is false.
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P5 FFMA c3, d3, beta, c3;
+// Store the four results, one per row pointer.
+--:1:-:-:1  @P0 STG.E.CG [C00y], c0;
+--:2:-:-:1  @P1 STG.E.CG [C04y], c1;
+--:3:-:-:1  @P2 STG.E.CG [C08y], c2;
+--:4:-:-:1  @P3 STG.E.CG [C12y], c3;
+// Advance each row pointer by ldc1, carrying into the high word.
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/sgemm_nn_128x128.sass b/Kernel/SGEMM/Maxwell/sgemm_nn_128x128.sass
new file mode 100644
index 0000000..22b8782
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_nn_128x128.sass
@@ -0,0 +1,327 @@
+# Kernel: sgemm_nn_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-95   ~ blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, txa, xmad_ta, xmad_tb, tid31, tid128
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   ~ k<1-3>, x<1-3>
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-107  : loadA<0-7>, loadB<0-3>
+
+    108-111 : trackA<0-1>, trackB<0-1>
+
+    112-121 ~ writeAs, writeBs, k, txb, tidAY, tidBY, ta, tb, loop
+    122-127 ~ readAs, readBs, tid
+
+    64-75   ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    86-121  ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,   param_k;
+--:-:-:-:1      MOV lda, param_lda;
+--:-:-:-:1      MOV ldb, param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      MOV loop, RZ;
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+
+// tidAY  = (tid & 1) << 2
+01:-:-:-:1      LOP.AND tid1,  tid,  1;
+--:-:-:-:1      SHL     tidAY, tid1, 2;
+
+// tidAX = tid >> 1
+--:-:-:-:1      SHR.U32 tidAX, tid, 1;
+
+// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY)
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,   tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ,  ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x2;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// tidBX = (tid & 31) << 2
+// tidBY = (tid >> 5) & 7
+--:-:-:-:1      LOP.AND tid31, tid,  31;
+--:-:-:-:1      SHL     tidBX, tid31, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid,  0x305; // 3 bits at position 5
+
+// trackB += (blkB*128 + ldb*tidBY + tidBX) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 7;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x2;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeAs = 4 * (128 * tidAY + tidAX)
+--:-:-:-:1      ISCADD  writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD  writeAs, writeAs, 4x<128*8*2>, 2;
+
+// writeBs = (128*tidBY + tidBX) * 4
+--:-:-:-:1      ISCADD  writeBs, tidBY, tidBX, 7;
+--:-:-:-:1      ISCADD  writeBs, writeBs, 4x<128*8*3>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096;
+--:-:-:-:1      LOP.AND tid128, tid,    128;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+
+// k must be multiple of 8
+--:-:1:-:1  @P6 LDG.E.CI.128 loadB0, [trackB];
+
+--:-:2:-:1  @P5 LDG.E.CI.128 loadA0, [trackA + 4x<0>];
+--:5:6:-:1  @P5 LDG.E.CI.128 loadA4, [trackA + 4x<8>];
+
+--:-:3:-:1 @!P6 LDS.U.128 loadB0, [addr_zero];
+--:-:4:-:1 @!P5 LDS.U.128 loadA0, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.128 loadA4, [addr_zero];
+
+--:-:-:-:0      PSETP.AND.AND P1, PT, PT, PT, PT;
+
+05:-:-:-:1      STS.128 [writeBs], loadB0;
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+0a:-:-:-:1      STS [writeAs + 4x<0*128>], loadA0;
+--:-:-:-:1      STS [writeAs + 4x<1*128>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*128>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*128>], loadA3;
+
+10:-:-:-:6      IADD   trackA0.CC, trackA0, 4x<16>;
+--:-:-:-:1      IADD.X trackA1, trackA1, RZ;
+
+    } : q{
+
+<SCHEDULE_BLOCK>
+
+// doLoad0 = tidBY < k
+--:-:-:-:1      IADD x1, txb, 1;
+--:-:-:-:1      IADD x2, txb, 2;
+--:-:-:-:1      IADD x3, txb, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_n, P0;
+
+--:-:6:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:6:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:6:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:6:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+
+--:-:-:-:1      IADD k1, tidAY, 1;
+--:-:-:-:1      IADD k2, tidAY, 2;
+--:-:-:-:1      IADD k3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:2:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:5:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+</SCHEDULE_BLOCK>
+
+// bDoRemainder = k > 8
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+20:-:-:-:1      STS.128 [writeBs], loadB0;
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<0*128>], loadA0;
+04:-:-:-:1      STS [writeAs + 4x<1*128>], loadA1;
+08:-:-:-:1      STS [writeAs + 4x<2*128>], loadA2;
+10:-:-:-:1      STS [writeAs + 4x<3*128>], loadA3;
+
+--:-:-:-:6      IADD   trackA0.CC, trackA0, 4x<8>;
+--:-:-:-:1      IADD.X trackA1, trackA1, RZ;
+    };
+</CODE>
+
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      LOP.XOR writeAs, writeAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR writeBs, writeBs, 4x<128*8*2>;
+
+<CODE>
+    our $vec;                   # defined outside this block; true selects the 128-bit load variant
+    my $k_end = $vec ? 16 : 24; # k threshold used by the P0 / P2 / P3 tests below
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P6;\n");
+    our %insert =               # keyed "j<step>c<position>"; NOTE(review): presumably consumed by the included common file - confirm
+    (
+        ($vec ?
+            (
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P1, PT, !P1, PT, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND  P0, PT, k, $k_end, PT;\n",
+        j0c15 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P5;\n",
+        # 128-bit global loads: loadB under P3, loadA0 / loadA4 under P2.
+        j0c10 => "--:-:2:-:1  \@P3 LDG.E.CI.128 loadB0, [trackB];\n",
+
+        j0c28 => "--:-:3:-:1  \@P2 LDG.E.CI.128 loadA0, [trackA + 4x<0>];\n",
+        j0c30 => "20:5:6:-:1  \@P2 LDG.E.CI.128 loadA4, [trackA + 4x<8>];\n",
+        # Shared-memory stores of loadA4-7, issued only when P1 is false.
+        j4c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128>], loadA4;\n",
+        j4c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128>], loadA5;\n",
+        j4c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128>], loadA6;\n",
+        j4c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128>], loadA7;\n",
+
+        j5c35 => "02:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+        # Shared-memory stores of loadA0-3, issued only when P1 is true.
+        j6c29 => "04:-:-:-:1  \@P1 STS [writeAs + 4x<0*128>], loadA0;\n",
+        j6c31 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<1*128>], loadA1;\n",
+        j6c33 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<2*128>], loadA2;\n",
+        j6c35 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<3*128>], loadA3;\n",
+        # Advance trackA by 4x<16> under P2: low word first, then the carry.
+        j6c46 => "10:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 4x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+            ) :
+            (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P5;\n",
+        j0c8  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        # Scalar global loads: four for B under P3, four for A under P2.
+        j0c10 => "--:-:6:-:1  \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+        j0c29 => "--:-:6:-:1  \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+        j0c31 => "--:-:6:-:1  \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+        j0c33 => "--:-:6:-:1  \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+
+        j0c35 => "--:-:2:-:1  \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+        j1c29 => "--:-:3:-:1  \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+        j1c31 => "--:-:4:-:1  \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+        j1c33 => "--:-:5:-:1  \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+        # Shared-memory stores, all gated by P0 (k >= $k_end).
+        j5c39 => "20:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS [writeAs + 4x<0*128>], loadA0;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS [writeAs + 4x<1*128>], loadA1;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS [writeAs + 4x<2*128>], loadA2;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS [writeAs + 4x<3*128>], loadA3;\n",
+
+        j6c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 4x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+        # Branch back to LOOP under P0; otherwise to REMAINDER under P1.
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+            )
+        ),
+        # Entries shared by both variants: advance trackB by param_ldb8 under P3.
+        j5c46 => "--:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j5c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+        # Subtract 8 from k, synchronize, then XOR-toggle the four shared-memory offsets under P0.
+        j6c63 => "--:-:-:-:0      IADD32I k, k, -8;\n" .
+                 "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n",
+    );
+    return;                     # returns nothing: this block only populates @top and %insert
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/sgemm_nn_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_nn_128x32.sass
new file mode 100644
index 0000000..8194777
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_nn_128x32.sass
@@ -0,0 +1,485 @@
+# Kernel: sgemm_nn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*2 + 32*16*2>
+    szShareA  : (128*16 + 32)
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, ldb, ldaz, lda32, ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, shiftAX
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb16, ldb, 6;
+--:-:-:-:1      SHL lda32, lda, 5;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 2
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,  2;
+01:-:-:-:1      LOP.AND tid3,    tid,  3;
+--:-:-:-:1      SHL     tidAY,   tid3, 2;
+--:-:-:-:1      SHL     shiftAX, tid3, 3;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY)
+02:-:-:-:1      ISCADD txa00, blkA, tidAX, 7;
+--:-:-:-:1      IADD   txa32, txa00, 32;
+--:-:-:-:1      IADD   txa64, txa00, 64;
+--:-:-:-:1      IADD   txa96, txa00, 96;
+
+--:-:-:-:1      XMAD.LO  ta00, lda,  txa00,   tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta00, ldaz, blkZ,  ta00;
+--:-:-:-:1      IADD     ta32, ta00, lda32;
+--:-:-:-:1      IADD     ta64, ta32, lda32;
+--:-:-:-:1      IADD     ta96, ta64, lda32;
+
+--:-:-:-:1      LEA      track0A0.CC, ta00, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track0A1,    ta00, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track1A0.CC, ta32, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track1A1,    ta32, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track2A0.CC, ta64, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track2A1,    ta64, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track3A0.CC, ta96, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track3A1,    ta96, param_A[1], RZ, 2;
+
+// trackB += (blkB*32 + ldb*tidBY + tidBX) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 5;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 2;
+
+// writeAs = (tidAY*128 + tidAX + shiftAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY, k, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY, k, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:2:-:1  @P3 LDG.E.CI.128 load1A, [track1A];
+--:-:3:-:1  @P4 LDG.E.CI.128 load2A, [track2A];
+--:-:4:-:1  @P5 LDG.E.CI.128 load3A, [track3A];
+--:-:5:-:1  @P6 LDG.E.CI.128 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load1A, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128 load2A, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.128 load3A, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.128 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI load0A0, [track0A + 4x<0>];
+--:-:1:-:1  @P1 LDG.E.CI load0A1, [track0A + 4x<1>];
+--:-:1:-:1  @P2 LDG.E.CI load0A2, [track0A + 4x<2>];
+--:-:1:-:1  @P3 LDG.E.CI load0A3, [track0A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI load1A0, [track1A + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI load1A1, [track1A + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI load1A2, [track1A + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI load1A3, [track1A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI load2A0, [track2A + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI load2A1, [track2A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load2A2, [track2A + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI load2A3, [track2A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI load3A0, [track3A + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI load3A1, [track3A + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI load3A2, [track3A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load3A3, [track3A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:5:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:5:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:5:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P1, RZ, k, 15;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P1;
+
+</SCHEDULE_BLOCK>
+
+21:-:-:-:1      STS [writeAs + 4x<0*128 + 0*32>], load0A0;
+--:-:-:-:0      IADD   track0A0.CC, track0A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 0*32>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 0*32>], load0A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 0*32>], load0A3;
+
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<0*128 + 1*32>], load1A0;
+--:-:-:-:0      IADD   track1A0.CC, track1A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 1*32>], load1A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 1*32>], load1A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 1*32>], load1A3;
+
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS [writeAs + 4x<0*128 + 2*32>], load2A0;
+--:-:-:-:0      IADD   track2A0.CC, track2A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 2*32>], load2A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 2*32>], load2A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 2*32>], load2A3;
+
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS [writeAs + 4x<0*128 + 3*32>], load3A0;
+--:-:-:-:0      IADD   track3A0.CC, track3A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 3*32>], load3A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 3*32>], load3A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 3*32>], load3A3;
+
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+10:-:-:-:1      STS.128 [writeBs], loadB;
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+<CODE>
+    our $vec; # picks the .128 load variant below when true; not assigned in this block (presumably set by the build driver - confirm)
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:4:-:1  @P3 LDG.E.CI.128 load1A, [track1A];
+--:-:5:-:1  @P4 LDG.E.CI.128 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.CI.128 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.CI.128 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P2 LDG.E.CI load0A0, [track0A + 4x<0>];
+--:-:3:-:1  @P2 LDG.E.CI load0A1, [track0A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load0A2, [track0A + 4x<2>];
+--:-:3:-:1  @P2 LDG.E.CI load0A3, [track0A + 4x<3>];
+// load1A0..3 from track1A at offsets 4x<0>..4x<3>, guarded by P3
+--:-:4:-:1  @P3 LDG.E.CI load1A0, [track1A + 4x<0>];
+--:-:4:-:1  @P3 LDG.E.CI load1A1, [track1A + 4x<1>];
+--:-:4:-:1  @P3 LDG.E.CI load1A2, [track1A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load1A3, [track1A + 4x<3>];
+// load2A0..3 from track2A at offsets 4x<0>..4x<3>, guarded by P4
+--:-:5:-:1  @P4 LDG.E.CI load2A0, [track2A + 4x<0>];
+--:-:5:-:1  @P4 LDG.E.CI load2A1, [track2A + 4x<1>];
+--:-:5:-:1  @P4 LDG.E.CI load2A2, [track2A + 4x<2>];
+--:-:5:-:1  @P4 LDG.E.CI load2A3, [track2A + 4x<3>];
+// load3A0..3 from track3A at offsets 4x<0>..4x<3>, guarded by P5
+--:-:5:-:1  @P5 LDG.E.CI load3A0, [track3A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load3A1, [track3A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load3A2, [track3A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load3A3, [track3A + 4x<3>];
+// loadB0..3 from trackB at offsets 4x<0>..4x<3>, guarded by P6
+--:-:6:-:1  @P6 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:6:-:1  @P6 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:6:-:1  @P6 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:6:-:1  @P6 LDG.E.CI loadB3, [trackB + 4x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec;
+    our $shiftAX = 1;
+    our $shiftBX = 0;
+    our %insert = # keyed by jNcM slot names; presumably spliced into the main loop by the included common file (confirm)
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+        # A tile: store load0A*..load3A* to shared memory at writeAs offsets, all guarded by P0.
+        j3c6   => "04:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n",
+        j3c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n",
+        j3c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 0*32>], load0A2;\n",
+        j3c12  => "--:3:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n",
+
+        j5c6   => "08:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n",
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n",
+        j5c12  => "--:4:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n",
+
+        j7c6   => "10:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n",
+        j7c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n",
+        j7c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n",
+        j7c12  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n",
+
+        j9c6   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n",
+        j9c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n",
+        j9c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n",
+        j9c12  => "--:5:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n",
+        # B tile: one 128-bit store of loadB at writeBs, guarded by P0.
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",
+        # Advance each track pointer pair: low word with .CC, then high word with .X (A by 4x<16>, B by ldb16).
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 4x<16>;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, 4x<16>;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P4 IADD   track2A0.CC, track2A0, 4x<16>;\n",
+        j7c13  => "--:-:-:-:1  \@P4 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, 4x<16>;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+        # P2..P6 &= (k >= 32): each load predicate stays set only while k is at least 32.
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j7c14  => "--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+        # Under P0: barrier, then read pointers -= swapBuf, write pointers += swapBuf, and swapBuf is negated.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Global loads guarded by P2..P6: one .128 load per register group when $vec, else four scalar loads each.
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.128 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.CI.128 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.128 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.128 loadB,  [trackB];\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI load0A0, [track0A + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI load0A1, [track0A + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI load0A2, [track0A + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI load0A3, [track0A + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI load1A0, [track1A + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI load1A1, [track1A + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI load1A2, [track1A + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI load1A3, [track1A + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.CI load2A0, [track2A + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P4 LDG.E.CI load2A1, [track2A + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P4 LDG.E.CI load2A2, [track2A + 4x<2>];\n",
+                j10c3  => "--:-:5:-:1  \@P4 LDG.E.CI load2A3, [track2A + 4x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI load3A0, [track3A + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI load3A1, [track3A + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI load3A2, [track3A + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI load3A3, [track3A + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+        # Branch back to LOOP while P0 holds; otherwise branch to REMAINDER when P1 is set.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return ''; # this block only fills the package variables above; it returns an empty string
+</CODE>
+
+<INCLUDE file="sgemm_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/sgemm_nn_128x64.sass b/Kernel/SGEMM/Maxwell/sgemm_nn_128x64.sass
new file mode 100644
index 0000000..2fca939
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_nn_128x64.sass
@@ -0,0 +1,414 @@
+# Kernel: sgemm_nn_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*2 + 64*8*2 + 0>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-95   ~ tid, blkA, blkB, blkZ, txb, tidAY, tidBY, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, txa, ta, xmad_ta, tb, tid15, xmad_tb, k<1-3>, x<1-3>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-115  : loadAA<0-7>, loadA<0-7>, loadB<0-3>
+
+    116-121 : track0A<0-1>, track1A<0-1>, trackB<0-1>
+
+    122-125 ~ writeAs, writeBs, k, swapBuf
+    126-127 ~ readAs, readBs
+
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    86-125  ~ tid_2, blockA, blockB, blockZ, ldc, ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, ci, xmad_c, alpha, beta, flags, tid31, tid96
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,   param_k;
+--:-:-:-:1      MOV lda, param_lda;
+--:-:-:-:1      MOV ldb, param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+
+<CODE>
+    return join '', map { sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", 4 * $_) } 0 .. 15; # czero00, czero04, ..., czero60
+</CODE>
+
+// tidAY = (tid & 1) << 2
+// tidAX = tid >> 1
+01:-:-:-:1      LOP.AND tid1,  tid,  1;
+--:-:-:-:1      SHL     tidAY, tid1, 2;
+01:-:-:-:1      SHR.U32 tidAX, tid,  1;
+
+// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY)
+02:-:-:-:1      ISCADD  txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO  ta, lda,  txa,   tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ,  ta;
+--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track1A0.CC, lda, track0A0,      8;
+--:-:-:-:1      LEA.HI.X track1A1,    lda, track0A1, RZ,  8;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa, param_m, PT;
+--:-:-:-:1      IADD txa, txa, 64;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// tidBX = (tid & 15) << 2
+// tidBY = (tid >> 4) & 7
+--:-:-:-:1      LOP.AND tid15, tid,   15;
+--:-:-:-:1      SHL     tidBX, tid15, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid,   0x304; // 3 bits at position 4
+
+// trackB += (blkB*64 + tidBX + ldb*tidBY + ldbz*blkZ) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 6;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x2;
+--:-:-:-:2      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x2;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// Start the write buffers high
+// writeAs = (128*tidAY + tidAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2;
+
+// writeBs = (64*tidBY + tidBX) * 4
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2;
+
+// Start the read buffers low
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<128*8>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<64*8 + 128*8>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+
+// k must be multiple of 8
+--:-:2:-:1  @P6 LDG.E.CI.128 loadB0, [trackB];
+
+--:-:3:-:1  @P4 LDG.E.CI.128 loadA0,  [track0A + 4x<0>];
+--:-:3:-:1  @P4 LDG.E.CI.128 loadAA0, [track0A + 4x<8>];
+
+--:-:4:-:1  @P5 LDG.E.CI.128 loadA4,  [track1A + 4x<0>];
+--:-:4:-:1  @P5 LDG.E.CI.128 loadAA4, [track1A + 4x<8>];
+
+--:-:-:-:1 @!P6 LDS.U.128 loadB0, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128 loadA0, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.128 loadA4, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128 loadAA0, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.128 loadAA4, [addr_zero];
+
+--:-:-:-:0      PSETP.AND.AND P1, PT, PT, PT, PT;
+
+22:-:-:-:1      STS.128 [writeBs], loadB0;
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+04:-:-:-:1      STS [writeAs + 4x<0*128 + 00>], loadA0;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 00>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 00>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*128 + 00>], loadA3;
+
+--:-:-:-:6      IADD   track0A0.CC, track0A0, 4x<16>;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+08:-:-:-:1      STS [writeAs + 4x<0*128 + 64>], loadA4;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 64>], loadA5;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 64>], loadA6;
+--:-:-:-:1      STS [writeAs + 4x<3*128 + 64>], loadA7;
+
+--:-:-:-:6      IADD   track1A0.CC, track1A0, 4x<16>;
+--:-:-:-:1      IADD.X track1A1,    track1A1, RZ;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+
+    } : q{
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkB, SR_CTAID.Z;
+<SCHEDULE_BLOCK>
+01:-:-:-:1      LOP.AND tid1,  tid,   1;
+--:-:-:-:1      SHL     tidAY, tid1,  2;
+--:-:-:-:1      LOP.AND tid15, tid,   15;
+--:-:-:-:1      SHL     tidBX, tid15, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid,   0x304; // 3 bits at position 4
+02:-:-:-:1      ISCADD  txb, blkB, tidBX, 6;
+
+// P0 = (tidBY < k) && P6;  P1..P3 = (txb+1..3 < param_n) && P0
+--:-:-:-:1      IADD x1, txb, 1;
+--:-:-:-:1      IADD x2, txb, 2;
+--:-:-:-:1      IADD x3, txb, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_n, P0;
+
+--:-:2:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      IADD k1, tidAY, 1;
+--:-:-:-:1      IADD k2, tidAY, 2;
+--:-:-:-:1      IADD k3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P4;
+
+--:-:3:-:1  @P0 LDG.E.CI loadA0, [track0A + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI loadA1, [track0A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI loadA2, [track0A + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI loadA3, [track0A + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:4:-:1  @P0 LDG.E.CI loadA4, [track1A + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI loadA5, [track1A + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI loadA6, [track1A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI loadA7, [track1A + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA4, RZ;
+--:-:-:-:1 @!P1 MOV loadA5, RZ;
+--:-:-:-:1 @!P2 MOV loadA6, RZ;
+--:-:-:-:1 @!P3 MOV loadA7, RZ;
+</SCHEDULE_BLOCK>
+
+02:-:-:-:1      STS.128 [writeBs], loadB0;
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+04:-:-:-:1      STS [writeAs + 4x<0*128 + 00>], loadA0;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 00>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 00>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*128 + 00>], loadA3;
+
+--:-:-:-:6      IADD   track0A0.CC, track0A0, 4x<8>;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+08:-:-:-:1      STS [writeAs + 4x<0*128 + 64>], loadA4;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 64>], loadA5;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 64>], loadA6;
+--:-:-:-:1      STS [writeAs + 4x<3*128 + 64>], loadA7;
+
+--:-:-:-:6      IADD   track1A0.CC, track1A0, 4x<8>;
+--:-:-:-:1      IADD.X track1A1,    track1A1, RZ;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+    };
+</CODE>
+
+<CODE>
+    our $vec;
+    my $k_end = $vec ? 16 : 24; # threshold k is compared against (ISETP.GE) to set P0, which guards the loop branch
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, P6;\n");
+    our %insert =
+    (
+        ($vec ?
+            (
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P1, PT, !P1, PT, PT;\n",
+        # P1 was just inverted (P1 = !P1); it selects which register set the STS groups below write out.
+        j0c11 => "--:-:2:-:1  \@P0 LDG.E.CI.128 loadB0, [trackB];\n",
+
+        j0c12 => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        j0c13 => "--:-:-:-:1      IADD32I k, k, -8;\n",
+        # P2/P3 = P0 && P1 && P4/P5: the A loads below are issued only when all three hold.
+        j0c23 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P4;\n",
+        j0c24 => "--:-:-:-:1      PSETP.AND.AND P3, PT, P0, P1, P5;\n",
+
+        j0c35 => "--:-:3:-:1  \@P2 LDG.E.CI.128 loadA0,  [track0A + 4x<0>];\n",
+        j0c37 => "--:-:3:-:1  \@P2 LDG.E.CI.128 loadAA0, [track0A + 4x<8>];\n",
+
+        j0c39 => "--:-:4:-:1  \@P3 LDG.E.CI.128 loadA4,  [track1A + 4x<0>];\n",
+        j0c41 => "10:6:5:-:1  \@P3 LDG.E.CI.128 loadAA4, [track1A + 4x<8>];\n",
+        # P1 clear: store the values fetched from offset 4x<8> (loadAA0..7).
+        j2c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128 + 00>], loadAA0;\n",
+        j2c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128 + 00>], loadAA1;\n",
+        j2c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128 + 00>], loadAA2;\n",
+        j2c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128 + 00>], loadAA3;\n",
+
+        j3c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128 + 64>], loadAA4;\n",
+        j3c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128 + 64>], loadAA5;\n",
+        j3c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128 + 64>], loadAA6;\n",
+        j3c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128 + 64>], loadAA7;\n",
+        # P1 set: store the values fetched from offset 4x<0> (loadA0..7).
+        j5c29 => "04:-:-:-:1  \@P1 STS [writeAs + 4x<0*128 + 00>], loadA0;\n",
+        j5c31 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<1*128 + 00>], loadA1;\n",
+        j5c33 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<2*128 + 00>], loadA2;\n",
+        j5c35 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<3*128 + 00>], loadA3;\n",
+
+        j5c46 => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 4x<16>;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+
+        j6c29 => "08:-:-:-:1  \@P1 STS [writeAs + 4x<0*128 + 64>], loadA4;\n",
+        j6c31 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<1*128 + 64>], loadA5;\n",
+        j6c33 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<2*128 + 64>], loadA6;\n",
+        j6c35 => "--:2:-:-:1  \@P1 STS [writeAs + 4x<3*128 + 64>], loadA7;\n",
+
+        j6c46 => "20:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, 4x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+            ) :
+            (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P4;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P5;\n",
+        # B: four scalar loads at offsets 4x<0>..4x<3>, guarded by P0.
+        j0c10 => "--:-:2:-:1  \@P0 LDG.E.CS loadB0, [trackB + 4x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P0 LDG.E.CS loadB1, [trackB + 4x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P0 LDG.E.CS loadB2, [trackB + 4x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P0 LDG.E.CS loadB3, [trackB + 4x<3>];\n",
+
+        j0c18 => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        j0c20 => "--:-:-:-:1      IADD32I k, k, -8;\n",
+        # A: four scalar loads per track pointer, guarded by P2 (track0A) and P3 (track1A).
+        j0c33 => "--:-:3:-:1  \@P2 LDG.E.CI loadA0, [track0A + 4x<0>];\n",
+        j0c35 => "--:-:3:-:1  \@P2 LDG.E.CI loadA1, [track0A + 4x<1>];\n",
+        j0c37 => "--:-:3:-:1  \@P2 LDG.E.CI loadA2, [track0A + 4x<2>];\n",
+        j0c39 => "--:-:3:-:1  \@P2 LDG.E.CI loadA3, [track0A + 4x<3>];\n",
+
+        j1c29 => "--:-:4:-:1  \@P3 LDG.E.CI loadA4, [track1A + 4x<0>];\n",
+        j1c31 => "--:-:4:-:1  \@P3 LDG.E.CI loadA5, [track1A + 4x<1>];\n",
+        j1c33 => "--:-:4:-:1  \@P3 LDG.E.CI loadA6, [track1A + 4x<2>];\n",
+        j1c35 => "--:-:4:-:1  \@P3 LDG.E.CI loadA7, [track1A + 4x<3>];\n",
+
+        j5c29 => "04:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 00>], loadA0;\n",
+        j5c31 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 00>], loadA1;\n",
+        j5c33 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 00>], loadA2;\n",
+        j5c35 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 00>], loadA3;\n",
+
+        j5c46 => "--:-:-:-:1  \@P0 IADD   track0A0.CC, track0A0, 4x<8>;\n",
+        j5c54 => "--:-:-:-:1  \@P0 IADD.X track0A1,    track0A1, RZ;\n",
+
+        j6c29 => "08:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 64>], loadA4;\n",
+        j6c31 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 64>], loadA5;\n",
+        j6c33 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 64>], loadA6;\n",
+        j6c35 => "--:2:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 64>], loadA7;\n",
+
+        j6c46 => "--:-:-:-:1  \@P0 IADD   track1A0.CC, track1A0, 4x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P0 IADD.X track1A1,    track1A1, RZ;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+            )
+        ),
+        # Entries common to both variants: B store, trackB advance, then barrier and buffer swap (all under P0).
+        j4c21 => "02:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j4c22 => "--:-:-:-:1  \@P0 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j4c27 => "--:-:-:-:1  \@P0 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j6c63 => "02:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+
+    );
+    return ''; # return a defined empty string rather than undef from a bare return
+</CODE>
+
+<INCLUDE file="sgemm_common_128x64.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/sgemm_nn_32x128.sass b/Kernel/SGEMM/Maxwell/sgemm_nn_32x128.sass
new file mode 100644
index 0000000..e25c3a9
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_nn_32x128.sass
@@ -0,0 +1,458 @@
+# Kernel: sgemm_nn_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*16*2 + (32*16 + 32)*2>
+    szShareA : (32*16 + 32)
+    szShareB : (128*16)
+    // addr_zero lies past two szShareB-sized plus two szShareA-sized regions; the code stores RZ there and reads it back to zero registers.
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+    // Kernel parameters, all read from constant bank 0 (layout presumably mirrors the host-side argument order - confirm).
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // Setup temporaries ('~' presumably lets the assembler choose registers from the range); 32-79 is named again as j0-j3 and C00y/c/d below.
+    32-79 ~ tidAX, tidBX, lda, ldb, ldb4, ldaz, ldbz, tid1, tid3, tid96, ta, tb0, tb1, tb2, tb3, xmad_ta, xmad_tb, shiftAX, tidAY<1-3>, tidBY<1-3>, txb<1-3>
+    // Registers 0-31 carry two sets of names: czero<00-31> (cleared during setup) and the cx<0-3>y<0-7> grid.
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+    // Four sets (j0-j3) of eight A values and four B values; they are not referenced in this file's visible code (presumably used by the included common file).
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+    // Destinations of the global loads; their contents are then stored to shared memory.
+      80-83 : loadA<0-3>
+      84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3>
+    // Global addresses held as low/high register pairs and advanced with IADD.CC / IADD.X.
+    100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+
+    110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txa, txb
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+    // The names below reuse 32-120 and are not referenced in the visible part of this file (presumably the output stage in the included common file).
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkB, SR_CTAID.Z;
+--:-:3:-:1      S2R blkA, SR_CTAID.Y;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+// Each S2R above sets its own write barrier (1-4); the 01/02/04/08 wait masks below mark instructions that consume tid, blkB, blkA and blkZ.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb4,  ldb, 2;
+--:-:-:-:1      SHL ldb16, ldb, 6;
+// Write zeros to shared memory, then read them back with eight 128-bit loads to clear czero00-czero31.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 2
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,  2;
+01:-:-:-:1      LOP.AND tid3,    tid,  3;
+--:-:-:-:1      SHL     tidAY,   tid3, 2;
+--:-:-:-:1      SHL     shiftAX, tid3, 3;
+
+// tidBX = (tid & 31) << 2
+// tidBY = (tid >> 5)
+01:-:-:-:1      LOP.AND tidBX, tid,   31;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   5;
+
+// trackA += ((blkA*32 + tidAX) * lda + tidAY) * 4
+04:-:-:-:1      ISCADD   txa, blkA, tidAX, 5;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,  tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 2;
+
+// trackB += (blkB*128 + tidBX + ldb*tidBY) * 4
+02:-:-:-:1      ISCADD   txb, blkB, tidBX, 7;
+--:-:-:-:1      XMAD.LO2 tb0, ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb0, ldbz, blkZ,  tb0;
+--:-:-:-:1      IADD     tb1, tb0, ldb4;
+--:-:-:-:1      IADD     tb2, tb1, ldb4;
+--:-:-:-:1      IADD     tb3, tb2, ldb4;
+
+--:-:-:-:1      LEA      track0B0.CC, tb0, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track0B1,    tb0, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track1B0.CC, tb1, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track1B1,    tb1, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track2B0.CC, tb2, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track2B1,    tb2, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track3B0.CC, tb3, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track3B1,    tb3, param_B[1], RZ, 2;
+
+// writeAs = (tidAY*32 + tidAX + shiftAX) * 4 + (szShareA + szShareB) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*128 + tidBX) * 4 + (szShareA*2 + szShareB) * 4
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 7;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    16;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4) + szShareA * 4
+01:-:-:-:1      LOP.AND tid96,  tid,    96;
+--:-:-:-:1      SHR.U32 tid96,  tid96,  2;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readBs, readBs, tid96;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+// swapBuf starts negative; each buffer swap adds it to the write offsets, subtracts it from the read offsets, then negates it.
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+// Reached by falling through from setup, and again through the "BRA.U REMAINDER" emitted by the loop insert table when P1 is set.
+<SCHEDULE_BLOCK>
+// tidBY1-3 are the k indices 4, 8 and 12 past tidBY; each is bounds-checked against k for load1B-load3B.
+--:-:-:-:1      IADD tidBY1, tidBY, 4;
+--:-:-:-:1      IADD tidBY2, tidBY, 8;
+--:-:-:-:1      IADD tidBY3, tidBY, 12;
+
+<CODE>
+    our $vec;  # true selects the 128-bit load path (zero fill via shared memory); false the per-element path with individual bounds checks
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidBY3, k, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY,  k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.128 load0B, [track0B];
+--:-:2:-:1  @P1 LDG.E.CI.128 load1B, [track1B];
+--:-:3:-:1  @P2 LDG.E.CI.128 load2B, [track2B];
+--:-:4:-:1  @P3 LDG.E.CI.128 load3B, [track3B];
+--:-:5:-:1  @P4 LDG.E.CI.128 loadA,  [trackA];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P0 LDS.U.128 load0B, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 load1B, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.128 load2B, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load3B, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128 loadA,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI load0B0, [track0B + 4x<0>];
+--:-:1:-:1  @P1 LDG.E.CI load0B1, [track0B + 4x<1>];
+--:-:1:-:1  @P2 LDG.E.CI load0B2, [track0B + 4x<2>];
+--:-:1:-:1  @P3 LDG.E.CI load0B3, [track0B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY1, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI load1B0, [track1B + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI load1B1, [track1B + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI load1B2, [track1B + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI load1B3, [track1B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY2, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P6;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI load2B0, [track2B + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI load2B1, [track2B + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load2B2, [track2B + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI load2B3, [track2B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B0, RZ;
+--:-:-:-:1 @!P1 MOV load2B1, RZ;
+--:-:-:-:1 @!P2 MOV load2B2, RZ;
+--:-:-:-:1 @!P3 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY3, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI load3B0, [track3B + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI load3B1, [track3B + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI load3B2, [track3B + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load3B3, [track3B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B0, RZ;
+--:-:-:-:1 @!P1 MOV load3B1, RZ;
+--:-:-:-:1 @!P2 MOV load3B2, RZ;
+--:-:-:-:1 @!P3 MOV load3B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:5:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:5:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:5:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+    };
+</CODE>
+// P2/P3/P5 (from the B column check) and P6 (from the A row check) become "in range and k >= 32"; they guard the loads issued after the first shared-memory stores.
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P0, RZ, k, 15;
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 16, P0;
+
+</SCHEDULE_BLOCK>
+
+21:-:-:-:1      STS.128 [writeBs + 4x<0*128>], load0B; // wait mask 21 covers barriers 1 and 6 (6 is set only by the vector-path zero fill)
+--:-:-:-:6      IADD   track0B0.CC, track0B0, ldb16;
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ;
+
+02:-:-:-:1      STS.128 [writeBs + 4x<4*128>], load1B;
+--:-:-:-:6      IADD   track1B0.CC, track1B0, ldb16;
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;
+
+04:-:-:-:1      STS.128 [writeBs + 4x<8*128>], load2B;
+--:-:-:-:6      IADD   track2B0.CC, track2B0, ldb16;
+--:-:-:-:0      IADD.X track2B1,    track2B1, RZ;
+
+08:-:-:-:1      STS.128 [writeBs + 4x<12*128>], load3B;
+--:-:-:-:6      IADD   track3B0.CC, track3B0, ldb16;
+--:-:-:-:0      IADD.X track3B1,    track3B1, RZ;
+
+10:-:-:-:1      STS [writeAs + 4x<0*32>], loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*32>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*32>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*32>], loadA3;
+// Swap buffers around the barrier: read offsets move by -swapBuf, write offsets by +swapBuf, then swapBuf changes sign.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+// High-word add that completes the "IADD trackA0.CC" issued among the A stores above.
+--:-:-:-:0      IADD.X trackA1,    trackA1, RZ;
+
+<CODE>
+    our $vec;  # Returns the loads issued right after the first buffer swap: one 128-bit load per vector when $vec is true, otherwise four scalar loads per vector. Guards are P2/P3/P5/P6.
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.CI.128 load0B, [track0B];
+--:-:4:-:1  @P3 LDG.E.CI.128 load1B, [track1B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load2B, [track2B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load3B, [track3B];
+--:-:6:-:1  @P6 LDG.E.CI.128 loadA,  [trackA];
+    } : q{
+--:-:3:-:1  @P2 LDG.E.CI load0B0, [track0B + 4x<0>];
+--:-:3:-:1  @P2 LDG.E.CI load0B1, [track0B + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load0B2, [track0B + 4x<2>];
+--:-:3:-:1  @P2 LDG.E.CI load0B3, [track0B + 4x<3>];
+
+--:-:4:-:1  @P3 LDG.E.CI load1B0, [track1B + 4x<0>];
+--:-:4:-:1  @P3 LDG.E.CI load1B1, [track1B + 4x<1>];
+--:-:4:-:1  @P3 LDG.E.CI load1B2, [track1B + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load1B3, [track1B + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI load2B0, [track2B + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load2B1, [track2B + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load2B2, [track2B + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load2B3, [track2B + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI load3B0, [track3B + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load3B1, [track3B + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load3B2, [track3B + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load3B3, [track3B + 4x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI loadA0,  [trackA + 4x<0>];
+--:-:6:-:1  @P6 LDG.E.CI loadA1,  [trackA + 4x<1>];
+--:-:6:-:1  @P6 LDG.E.CI loadA2,  [trackA + 4x<2>];
+--:-:6:-:1  @P6 LDG.E.CI loadA3,  [trackA + 4x<3>];
+    };  # both branches use write barriers 3, 4, 5, 5 and 6 for load0B, load1B, load2B, load3B and loadA
+</CODE>
+
+<CODE>
+    our $vec;  # chooses between the 128-bit and the scalar load entries below
+    our $shiftAX = 1;  # not read in this block (presumably consumed by the included common file - confirm)
+    our $shiftBX = 0;  # not read in this block (presumably consumed by the included common file - confirm)
+    our %insert =  # keys have the form j<N>c<M>; presumably the included common file splices each string at that slot of its main loop - confirm
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+        # Shared-memory stores of the loaded values, all predicated on P0 (k >= 16 after the decrement above).
+        j3c6   => "04:3:-:-:1  \@P0 STS.128 [writeBs + 4x< 0*128>], load0B;\n",
+        j5c6   => "08:4:-:-:1  \@P0 STS.128 [writeBs + 4x< 4*128>], load1B;\n",
+        j7c6   => "10:-:-:-:1  \@P0 STS.128 [writeBs + 4x< 8*128>], load2B;\n",
+        j9c6   => "--:5:-:-:1  \@P0 STS.128 [writeBs + 4x<12*128>], load3B;\n",
+        j11c6  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<0*32>], loadA0;\n",
+        j11c8  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], loadA1;\n",
+        j11c10 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], loadA2;\n",
+        j11c12 => "--:6:-:-:1  \@P0 STS [writeAs + 4x<3*32>], loadA3;\n",
+        # Advance each global address (IADD.CC low word, IADD.X high word) under the predicate of its own load.
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0B0.CC, track0B0, ldb16;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0B1,    track0B1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, ldb16;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P5 IADD   track2B0.CC, track2B0, ldb16;\n",
+        j7c13  => "--:-:-:-:1  \@P5 IADD.X track2B1,    track2B1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3B0.CC, track3B0, ldb16;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3B1,    track3B1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackA0.CC,  trackA0, 4x<16>;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackA1,     trackA1, RZ;\n",
+        # Refresh the load predicates: each stays set only while it was already set and k >= 32. P5 serves both load2B and load3B.
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+        # Barrier followed by the buffer swap: read offsets -swapBuf, write offsets +swapBuf, then swapBuf is negated.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Global loads for the following iteration; each group waits on the barrier set by the store that read the same registers.
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.128 load0B, [track0B];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.128 load1B, [track1B];\n",
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.128 load2B, [track2B];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.128 load3B, [track3B];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.128 loadA,  [trackA];\n",
+            ) :
+            (
+
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI load0B0, [track0B + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI load0B1, [track0B + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI load0B2, [track0B + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI load0B3, [track0B + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI load1B0, [track1B + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI load1B1, [track1B + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI load1B2, [track1B + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI load1B3, [track1B + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI load2B0, [track2B + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P5 LDG.E.CI load2B1, [track2B + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P5 LDG.E.CI load2B2, [track2B + 4x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P5 LDG.E.CI load2B3, [track2B + 4x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI load3B0, [track3B + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI load3B1, [track3B + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI load3B2, [track3B + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI load3B3, [track3B + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+            )
+        ),
+        # Branch to LOOP while P0 holds; otherwise P1 (set in the REMAINDER block) branches back to REMAINDER.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="sgemm_common_32x128.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/sgemm_nn_rnn_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_nn_rnn_128x32.sass
new file mode 100644
index 0000000..21b493d
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_nn_rnn_128x32.sass
@@ -0,0 +1,512 @@
+# Kernel: sgemm_nn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*2 + 32*16*2>
+    szShareA  : (128*16 + 32)
+    szShareB  : 32*16
+    // addr_zero lies past two szShareA-sized plus two szShareB-sized regions; the code stores RZ there and reads it back to zero registers.
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+    // Kernel parameters, all read from constant bank 0 (layout presumably mirrors the host-side argument order - confirm).
+    param_C[0]        : c[0x0][0x140]
+    param_C[1]        : c[0x0][0x144]
+    param_A[0]        : c[0x0][0x148]
+    param_A[1]        : c[0x0][0x14c]
+    param_B[0]        : c[0x0][0x150]
+    param_B[1]        : c[0x0][0x154]
+    param_bias[0]     : c[0x0][0x158]
+    param_bias[1]     : c[0x0][0x15c]
+    param_lockAddr[0] : c[0x0][0x160]
+    param_lockAddr[1] : c[0x0][0x164]
+    param_alpha       : c[0x0][0x168]
+    param_beta        : c[0x0][0x16c]
+    param_xcutoff     : c[0x0][0x170]
+    param_flags       : c[0x0][0x174]
+    param_lda         : c[0x0][0x178]
+    param_ldb8        : c[0x0][0x17c]
+    param_ldc         : c[0x0][0x180]
+    param_m           : c[0x0][0x184]
+    param_n           : c[0x0][0x188]
+    param_k           : c[0x0][0x18c]
+    param_ldaz        : c[0x0][0x190]
+    param_ldbz        : c[0x0][0x194]
+    param_ldcz        : c[0x0][0x198]
+    param_loops       : c[0x0][0x19c]
+    param_dimB        : c[0x0][0x1a0]
+    param_dimC        : c[0x0][0x1a4]
+    param_unrolling   : c[0x0][0x1a8]
+    param_numBlks     : c[0x0][0x1ac]
+    param_numAblks    : c[0x0][0x1b0]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // Setup temporaries ('~' presumably lets the assembler choose registers from the range); 32-79 is named again as j0-j3 and C00y/c/d/bias below.
+    32-79 ~ lda, ldb, ldaz, lda32, ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, offsetB, shiftAX
+    80-81 : baseB<0-1>
+    // Registers 0-31 carry two sets of names: czero<00-31> (cleared during setup) and the cx<0-3>y<0-7> grid.
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+    // Four sets (j0-j3) of eight A values and four B values; they are not referenced in the visible part of this file (presumably used by an included common file).
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+    // Global-load destinations. loadB<0-3> shares registers 80-81 with baseB<0-1>, which setup reads when forming trackB before any load into loadB.
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+    // Global addresses held as low/high register pairs and advanced with IADD.CC / IADD.X.
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, time_step
+    // Apart from flags (set before RNN_LOOP and tested in setup), the names below are not referenced in the visible part of this file.
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+    48-61 : bias00y<0-1>, bias04y<0-1>, bias08y<0-1>, bias12y<0-1>, b0, b1, b2, b3, baseC<0-1>
+    62-66 : blkId, nextBlk, lockAddr<0-1>, lockVal
+   67-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags, xcutoff, offsetC, numBlk
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+// The S2R results are consumed below under wait masks 01 (tid), 02 (blkA) and 04 (blkB).
+--:-:-:-:1      MOV time_step, RZ;
+--:-:-:-:1      MOV flags, param_flags;
+// time_step and flags are set once above; the code from RNN_LOOP down is presumably re-run per time step (the branch back is outside this chunk).
+RNN_LOOP:
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb16, ldb, 6;
+--:-:-:-:1      SHL lda32, lda, 5;
+// Write zeros to shared memory, then read them back with eight 128-bit loads to clear czero00-czero31.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+--:-:-:-:6      LOP.AND.NZ   P0, RZ, flags, 4;
+--:-:-:-:6  @P0 IADD offsetB, -time_step, param_unrolling;
+--:-:-:-:6  @P0 IADD offsetB, offsetB,    -1;
+--:-:-:-:6 @!P0 MOV  offsetB, time_step;
+// NOTE(review): plain XMAD presumably multiplies 16-bit operand halves - confirm offsetB and param_dimB stay below 65536.
+// baseB = param_B + 4 * dimB * offsetB, where offsetB = time_step, or (unrolling - 1 - time_step) when (flags & 4) is nonzero
+--:-:-:-:1      XMAD     offsetB,   offsetB,   param_dimB, RZ;
+--:-:-:-:1      LEA      baseB0.CC, offsetB,   param_B[0],     2;
+--:-:-:-:1      LEA.HI.X baseB1,    offsetB,   param_B[1], RZ, 2;
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 2
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,  2;
+01:-:-:-:1      LOP.AND tid3,    tid,  3;
+--:-:-:-:1      SHL     tidAY,   tid3, 2;
+--:-:-:-:1      SHL     shiftAX, tid3, 3;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// track0A = param_A + 4 * ((blkA*128 + tidAX) * lda + tidAY); track1A-track3A add 32, 64 and 96 rows of lda
+02:-:-:-:1      ISCADD txa00, blkA, tidAX, 7;
+--:-:-:-:1      IADD   txa32, txa00, 32;
+--:-:-:-:1      IADD   txa64, txa00, 64;
+--:-:-:-:1      IADD   txa96, txa00, 96;
+
+--:-:-:-:1      XMAD.LO  ta00, lda,  txa00,   tidAY, xmad_ta;
+--:-:-:-:1      XMAD.LO2 ta00, ldaz, RZ,      ta00;
+--:-:-:-:1      IADD     ta32, ta00, lda32;
+--:-:-:-:1      IADD     ta64, ta32, lda32;
+--:-:-:-:1      IADD     ta96, ta64, lda32;
+
+--:-:-:-:1      LEA      track0A0.CC, ta00, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track0A1,    ta00, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track1A0.CC, ta32, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track1A1,    ta32, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track2A0.CC, ta64, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track2A1,    ta64, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track3A0.CC, ta96, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track3A1,    ta96, param_A[1], RZ, 2;
+
+// trackB = baseB + (blkB*32 + ldb*tidBY + tidBX) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 5;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+--:-:-:-:1      XMAD.LO2 tb,  ldbz, RZ,    tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, baseB0,     2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, baseB1, RZ, 2;
+
+// writeAs = (tidAY*128 + tidAX + shiftAX) * 4 + (szShareA + szShareB) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4 + (szShareA*2 + szShareB) * 4
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + szShareA * 4
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+// swapBuf starts negative; each buffer swap adds it to the write offsets, subtracts it from the read offsets, then negates it.
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+// Bounds-checked global loads of four A vectors and one B vector; elements that fail the m/n/k checks are replaced with zeros.
+<SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;  # true selects the 128-bit load path (zero fill via shared memory); false the per-element path with individual bounds checks
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY, k, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY, k, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P2 LDG.E.128 load0A, [track0A];
+--:-:2:-:1  @P3 LDG.E.128 load1A, [track1A];
+--:-:3:-:1  @P4 LDG.E.128 load2A, [track2A];
+--:-:4:-:1  @P5 LDG.E.128 load3A, [track3A];
+--:-:5:-:1  @P6 LDG.E.128 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load1A, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128 load2A, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.128 load3A, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.128 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E load0A0, [track0A + 4x<0>];
+--:-:1:-:1  @P1 LDG.E load0A1, [track0A + 4x<1>];
+--:-:1:-:1  @P2 LDG.E load0A2, [track0A + 4x<2>];
+--:-:1:-:1  @P3 LDG.E load0A3, [track0A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E load1A0, [track1A + 4x<0>];
+--:-:2:-:1  @P1 LDG.E load1A1, [track1A + 4x<1>];
+--:-:2:-:1  @P2 LDG.E load1A2, [track1A + 4x<2>];
+--:-:2:-:1  @P3 LDG.E load1A3, [track1A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E load2A0, [track2A + 4x<0>];
+--:-:3:-:1  @P1 LDG.E load2A1, [track2A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E load2A2, [track2A + 4x<2>];
+--:-:3:-:1  @P3 LDG.E load2A3, [track2A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E load3A0, [track3A + 4x<0>];
+--:-:4:-:1  @P1 LDG.E load3A1, [track3A + 4x<1>];
+--:-:4:-:1  @P2 LDG.E load3A2, [track3A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E load3A3, [track3A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E loadB0, [trackB + 4x<0>];
+--:-:5:-:1  @P1 LDG.E loadB1, [trackB + 4x<1>];
+--:-:5:-:1  @P2 LDG.E loadB2, [trackB + 4x<2>];
+--:-:5:-:1  @P3 LDG.E loadB3, [trackB + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+    };
+</CODE>
+// P2-P5 (A rows) and P6 (B) become "in range and k >= 32" and guard the next round of loads; on the scalar path P4/P5 still hold the txa64/txa96 checks.
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P1, RZ, k, 15;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P1;
+
+</SCHEDULE_BLOCK>
+
+21:-:-:-:1      STS [writeAs + 4x<0*128 + 0*32>], load0A0; // wait mask 21 covers barriers 1 and 6 (6 is set only by the vector-path zero fill)
+--:-:-:-:0      IADD   track0A0.CC, track0A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 0*32>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 0*32>], load0A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 0*32>], load0A3;
+
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<0*128 + 1*32>], load1A0;
+--:-:-:-:0      IADD   track1A0.CC, track1A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 1*32>], load1A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 1*32>], load1A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 1*32>], load1A3;
+
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS [writeAs + 4x<0*128 + 2*32>], load2A0;
+--:-:-:-:0      IADD   track2A0.CC, track2A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 2*32>], load2A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 2*32>], load2A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 2*32>], load2A3;
+
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS [writeAs + 4x<0*128 + 3*32>], load3A0;
+--:-:-:-:0      IADD   track3A0.CC, track3A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 3*32>], load3A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 3*32>], load3A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 3*32>], load3A3;
+
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+10:-:-:-:1      STS.128 [writeBs], loadB;
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16;
+// Swap buffers around the barrier: read offsets move by -swapBuf, write offsets by +swapBuf, then swapBuf changes sign.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+// High-word add that completes the "IADD trackB0.CC" issued before the buffer swap.
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+<CODE>
+    # Returns the global-load instructions for the next tile.
+    # When $vec is true each operand is fetched with a single LDG.E.128;
+    # otherwise each operand is fetched with four LDG.E at offsets
+    # 4x<0>..4x<3>. Every load is guarded by its own predicate:
+    # P2..P5 for load0A..load3A and P6 for loadB.
+    our $vec;
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.128 load0A, [track0A];
+--:-:4:-:1  @P3 LDG.E.128 load1A, [track1A];
+--:-:5:-:1  @P4 LDG.E.128 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.128 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.128 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P2 LDG.E load0A0, [track0A + 4x<0>];
+--:-:3:-:1  @P2 LDG.E load0A1, [track0A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E load0A2, [track0A + 4x<2>];
+--:-:3:-:1  @P2 LDG.E load0A3, [track0A + 4x<3>];
+
+--:-:4:-:1  @P3 LDG.E load1A0, [track1A + 4x<0>];
+--:-:4:-:1  @P3 LDG.E load1A1, [track1A + 4x<1>];
+--:-:4:-:1  @P3 LDG.E load1A2, [track1A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E load1A3, [track1A + 4x<3>];
+
+--:-:5:-:1  @P4 LDG.E load2A0, [track2A + 4x<0>];
+--:-:5:-:1  @P4 LDG.E load2A1, [track2A + 4x<1>];
+--:-:5:-:1  @P4 LDG.E load2A2, [track2A + 4x<2>];
+--:-:5:-:1  @P4 LDG.E load2A3, [track2A + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E load3A0, [track3A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E load3A1, [track3A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E load3A2, [track3A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E load3A3, [track3A + 4x<3>];
+
+--:-:6:-:1  @P6 LDG.E loadB0, [trackB + 4x<0>];
+--:-:6:-:1  @P6 LDG.E loadB1, [trackB + 4x<1>];
+--:-:6:-:1  @P6 LDG.E loadB2, [trackB + 4x<2>];
+--:-:6:-:1  @P6 LDG.E loadB3, [trackB + 4x<3>];
+    };
+</CODE>
+
+<CODE>
+    # Declares $shiftAX, $shiftBX and the %insert table, then returns ''
+    # so this section contributes no text of its own.
+    # %insert maps position keys of the form j<N>c<M> to instruction text.
+    # NOTE(review): presumably the included common file splices each string
+    # into the unrolled main loop at that slot -- confirm in
+    # sgemm_rnn_common_128x32.sass.
+    our $vec;
+    our $shiftAX = 1;
+    our $shiftBX = 0;
+    our %insert =
+    (
+        # k -= 16, then P0 = (k >= 16): P0 guards the stores, the barrier
+        # and the branch back to LOOP below.
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        # Store the previously loaded A groups and B vector to shared memory.
+        j3c6   => "04:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n",
+        j3c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n",
+        j3c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 0*32>], load0A2;\n",
+        j3c12  => "--:3:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n",
+
+        j5c6   => "08:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n",
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n",
+        j5c12  => "--:4:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n",
+
+        j7c6   => "10:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n",
+        j7c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n",
+        j7c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n",
+        j7c12  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n",
+
+        j9c6   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n",
+        j9c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n",
+        j9c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n",
+        j9c12  => "--:5:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n",
+
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",
+
+        # Advance each track pointer (low word with .CC, high word with .X),
+        # guarded by the same predicate as the matching load (P2..P6).
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 4x<16>;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, 4x<16>;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P4 IADD   track2A0.CC, track2A0, 4x<16>;\n",
+        j7c13  => "--:-:-:-:1  \@P4 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, 4x<16>;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+
+        # Each load predicate stays set only while it was set and k >= 32.
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j7c14  => "--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        # Barrier followed by the buffer swap: read offsets -= swapBuf,
+        # write offsets += swapBuf, swapBuf negated.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        # Global loads for the next tile: one 128-bit load per operand when
+        # $vec is true, otherwise four 32-bit loads per operand.
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.128 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.128 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.128 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.128 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.128 loadB,  [trackB];\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E load0A0, [track0A + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E load0A1, [track0A + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E load0A2, [track0A + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E load0A3, [track0A + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E load1A0, [track1A + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E load1A1, [track1A + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E load1A2, [track1A + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E load1A3, [track1A + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E load2A0, [track2A + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P4 LDG.E load2A1, [track2A + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P4 LDG.E load2A2, [track2A + 4x<2>];\n",
+                j10c3  => "--:-:5:-:1  \@P4 LDG.E load2A3, [track2A + 4x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E load3A0, [track3A + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E load3A1, [track3A + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E load3A2, [track3A + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E load3A3, [track3A + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E loadB0, [trackB + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E loadB1, [trackB + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E loadB2, [trackB + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+
+        # Branch back to LOOP while P0 holds; otherwise go to REMAINDER
+        # when P1 holds.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="sgemm_rnn_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/sgemm_nt_128x128.sass b/Kernel/SGEMM/Maxwell/sgemm_nt_128x128.sass
new file mode 100644
index 0000000..e01b4b5
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_nt_128x128.sass
@@ -0,0 +1,339 @@
+# Kernel: sgemm_nt_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+// Symbolic names for constants used by sgemm_nt_128x128.
+// addr_zero is the shared-memory address that the prologue fills with RZ
+// and reads back to clear registers. gridDimA/gridDimB and the param_*
+// entries name constant-bank slots c[0x0][...]; the param_* slots are
+// consecutive 4-byte words from 0x140 to 0x188.
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+// Register names for sgemm_nt_128x128.
+// Registers 0-63 are listed twice: linearly as czero<00-63> and permuted as
+// the cx<0-7>y<0-7> grid. Several other ranges are also listed more than
+// once (64-95 for prologue temporaries and again for j0/j1 operands; 64-75
+// for both the ldc.. group and c<0-7>/d3..d0).
+// NOTE(review): presumably the overlapping names are live in different
+// phases of the kernel -- confirm before reusing any of them.
+<REGISTER_MAPPING>
+
+    64-95   ~ blkA, blkB, blkZ, tidX, blk, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, tid127, txa, txb, xmad_ta, xmad_tb, tid128
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-67   ~ k1, k2, k3
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-111  : loadA<0-7>,  loadB<0-7>
+    112-115 : trackA<0-1>, trackB<0-1>
+
+    116-121 ~ writeS, k, tidY, ta, tb, loop
+    122-127 ~ readAs, readBs, tid
+
+    64-75   ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    86-121  ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+// Prologue: read the thread and block ids, clear czero00..czero63 through
+// shared memory, then derive the global track pointers for A and B, the
+// bounds predicates P5/P6 and the shared-memory write/read offsets.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,  param_k;
+--:-:-:-:1      LOP.AND tid1, tid,  1;
+
+// Write RZ to addr_zero, then load it back into every czero register.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+        # Generates 16 LDS.U.128 lines targeting czero00, czero04, ... czero60.
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+--:-:-:-:1      MOV loop, RZ;
+--:-:-:-:1      MOV lda, param_lda;
+--:-:-:-:1      MOV ldb, param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// tidY  = tid1 << 2
+--:-:-:-:1      SHL tidY, tid1, 2;
+
+// tidX = tid >> 1
+01:-:-:-:1      SHR.U32 tidX, tid, 1;
+
+// trackA += 4 * ((blkA*128 + tidX) * lda + tidY)
+// (the XMAD.LO2 additionally adds ldaz * blkZ before the scaling by 4)
+02:-:-:-:1      ISCADD   txa, blkA, tidX, 7;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,  tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0], 0x2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x2;
+
+// trackB += 4 * ((blkB*128 + tidX) * ldb + tidY)
+// (the XMAD.LO2 additionally adds ldbz * blkZ before the scaling by 4)
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 7;
+--:-:-:-:1      XMAD.LO  tb,  ldb,  txb,  tidY, xmad_tb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0], 0x2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x2;
+
+// P5 = txa < m, P6 = txb < n: bounds predicates for the A and B loads
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeS = 4 * (128 * tidY + tidX)
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+
+// Flip writeS by 4x<128*8*2>; the same constant is XORed into readAs,
+// readBs and writeS at every buffer swap later in the kernel.
+--:-:-:-:1      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096;
+--:-:-:-:1      LOP.AND tid128, tid,  128;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+</SCHEDULE_BLOCK>
+
+// REMAINDER: load one tile of A and B from global memory, store it to
+// shared memory, advance the track pointers and swap buffers. It runs once
+// on entry and is branched to again from the loop when P1 is set.
+REMAINDER:
+
+<CODE>
+    # $vec selects the 128-bit path: two LDG.E.CI.128 per operand guarded by
+    # P5 (A) / P6 (B), with LDS.U.128 from addr_zero filling the registers
+    # when the predicate is clear. Only loadA0-3 / loadB0-3 are stored here;
+    # the track pointers advance by 4x<16>.
+    # NOTE(review): PSETP.AND.AND P1, PT, PT, PT, PT presumably sets P1 true.
+    # Otherwise four scalar loads per operand are issued, each additionally
+    # bounded by tidY+i < k, the track pointers advance by 4x<8>, and P1 ends
+    # up as (k & 7) != 0 && k > 8.
+    our $vec;
+    return $vec ? q{
+
+// k must be multiple of 8
+--:-:2:-:1  @P5 LDG.E.CI.128 loadA0, [trackA + 4x<0>];
+--:-:2:-:1  @P5 LDG.E.CI.128 loadA4, [trackA + 4x<8>];
+
+--:-:3:-:1  @P6 LDG.E.CI.128 loadB0, [trackB + 4x<0>];
+--:5:4:-:1  @P6 LDG.E.CI.128 loadB4, [trackB + 4x<8>];
+
+--:-:-:-:1 @!P5 LDS.U.128 loadA0, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.128 loadB0, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.128 loadA4, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.128 loadB4, [addr_zero];
+
+--:-:-:-:0      PSETP.AND.AND P1, PT, PT, PT, PT;
+
+22:-:-:-:1      STS [writeS + 4x<0*128>], loadA0;
+--:-:-:-:1      STS [writeS + 4x<1*128>], loadA1;
+--:-:-:-:1      STS [writeS + 4x<2*128>], loadA2;
+--:-:-:-:1      STS [writeS + 4x<3*128>], loadA3;
+
+--:-:-:-:6      IADD   trackA0.CC, trackA0, 4x<16>;
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+24:-:-:-:1      STS [writeS + 4x< 8*128>], loadB0;
+--:-:-:-:1      STS [writeS + 4x< 9*128>], loadB1;
+--:-:-:-:1      STS [writeS + 4x<10*128>], loadB2;
+--:-:-:-:1      STS [writeS + 4x<11*128>], loadB3;
+
+10:-:-:-:6      IADD   trackB0.CC, trackB0, 4x<16>;
+--:-:-:-:1      IADD.X trackB1, trackB1, RZ;
+
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+    } : q{
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD k1, tidY, 1;
+--:-:-:-:1      IADD k2, tidY, 2;
+--:-:-:-:1      IADD k3, tidY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:2:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P6;
+
+--:-:3:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+</SCHEDULE_BLOCK>
+
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 7;
+
+02:-:-:-:1      STS [writeS + 4x<0*128>], loadA0;
+--:-:-:-:1      STS [writeS + 4x<1*128>], loadA1;
+--:-:-:-:1      STS [writeS + 4x<2*128>], loadA2;
+--:-:-:-:1      STS [writeS + 4x<3*128>], loadA3;
+
+--:-:-:-:6      IADD   trackA0.CC, trackA0, 4x<8>;
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+04:-:-:-:1      STS [writeS + 4x< 8*128>], loadB0;
+--:-:-:-:1      STS [writeS + 4x< 9*128>], loadB1;
+--:-:-:-:1      STS [writeS + 4x<10*128>], loadB2;
+--:-:-:-:1      STS [writeS + 4x<11*128>], loadB3;
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, 4x<8>;
+--:-:-:-:1      IADD.X trackB1, trackB1, RZ;
+
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, P1;
+    };
+</CODE>
+
+<CODE>
+    # Main-loop configuration for sgemm_nt_128x128.
+    #
+    # @top   : one instruction placed ahead of the loop body. It sets
+    #          P0 = (k >= 16) when $vec is true, else P2 = (k >= 16) && P5.
+    # %insert: maps position keys of the form j<N>c<M> to instruction text.
+    #          NOTE(review): presumably the included common file splices each
+    #          string into the unrolled loop at that slot -- confirm in
+    #          sgemm_common_128x128.sass.
+    #
+    # The section returns nothing, so it contributes no text of its own.
+    our $vec;
+    our @top = $vec ?
+        ("--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n") :
+        ("--:-:-:-:1      ISETP.GE.AND P2, PT, k, 16, P5;\n");
+    our %insert =
+    (
+        ($vec ?
+            (
+        # Vector variant. P1 is inverted on every pass (j0c1). The 128-bit
+        # loads and the pointer bumps are guarded by P2 = P0 && P1 && P5 and
+        # P3 = P0 && P1 && P6. The upper register halves (loadA4-7/loadB4-7)
+        # are stored when P1 is clear, the lower halves when P1 is set.
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P1, PT, !P1, PT, PT;\n",
+        j0c13 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P5;\n",
+        j0c14 => "--:-:-:-:1      PSETP.AND.AND P3, PT, P0, P1, P6;\n",
+
+        j0c27 => "--:-:2:-:1  \@P2 LDG.E.CI.128 loadA0, [trackA + 4x<0>];\n",
+        j0c29 => "--:-:2:-:1  \@P2 LDG.E.CI.128 loadA4, [trackA + 4x<8>];\n",
+
+        j0c31 => "--:-:3:-:1  \@P3 LDG.E.CI.128 loadB0, [trackB + 4x<0>];\n",
+        j0c33 => "08:5:4:-:1  \@P3 LDG.E.CI.128 loadB4, [trackB + 4x<8>];\n",
+
+        j3c29 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<0*128>], loadA4;\n",
+        j3c31 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<1*128>], loadA5;\n",
+        j3c33 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<2*128>], loadA6;\n",
+        j3c35 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<3*128>], loadA7;\n",
+
+        j4c29 => "--:-:-:-:1 \@!P1 STS [writeS + 4x< 8*128>], loadB4;\n",
+        j4c31 => "--:-:-:-:1 \@!P1 STS [writeS + 4x< 9*128>], loadB5;\n",
+        j4c33 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<10*128>], loadB6;\n",
+        j4c35 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<11*128>], loadB7;\n",
+
+        j5c29 => "02:-:-:-:1  \@P1 STS [writeS + 4x<0*128>], loadA0;\n",
+        j5c31 => "--:-:-:-:1  \@P1 STS [writeS + 4x<1*128>], loadA1;\n",
+        j5c33 => "--:-:-:-:1  \@P1 STS [writeS + 4x<2*128>], loadA2;\n",
+        j5c35 => "--:-:-:-:1  \@P1 STS [writeS + 4x<3*128>], loadA3;\n",
+
+        j6c29 => "04:-:-:-:1  \@P1 STS [writeS + 4x< 8*128>], loadB0;\n",
+        j6c31 => "--:-:-:-:1  \@P1 STS [writeS + 4x< 9*128>], loadB1;\n",
+        j6c33 => "--:-:-:-:1  \@P1 STS [writeS + 4x<10*128>], loadB2;\n",
+        j6c35 => "--:2:-:-:1  \@P1 STS [writeS + 4x<11*128>], loadB3;\n",
+
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 4x<16>;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c46 => "10:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, 4x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+            ) :
+            (
+        # Scalar variant. P3 = (k >= 16) && P6 and P0 = (k >= 16). Loads and
+        # pointer bumps are guarded by P2 (A) / P3 (B), stores by P0. When
+        # the loop is not taken, control goes to REMAINDER if P1 is set.
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 16, P6;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+        j0c29 => "--:-:2:-:1  \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+        j0c31 => "--:-:2:-:1  \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+        j0c33 => "--:-:2:-:1  \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+
+        j0c35 => "--:-:3:-:1  \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+        j1c27 => "--:-:3:-:1  \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+        j1c29 => "--:-:3:-:1  \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+        j1c31 => "--:-:3:-:1  \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+
+        j5c29 => "02:-:-:-:1  \@P0 STS [writeS + 4x<0*128>], loadA0;\n",
+        j5c31 => "--:-:-:-:1  \@P0 STS [writeS + 4x<1*128>], loadA1;\n",
+        j5c33 => "--:-:-:-:1  \@P0 STS [writeS + 4x<2*128>], loadA2;\n",
+        j5c35 => "--:-:-:-:1  \@P0 STS [writeS + 4x<3*128>], loadA3;\n",
+
+        j6c29 => "04:-:-:-:1  \@P0 STS [writeS + 4x< 8*128>], loadB0;\n",
+        j6c31 => "--:-:-:-:1  \@P0 STS [writeS + 4x< 9*128>], loadB1;\n",
+        j6c33 => "--:-:-:-:1  \@P0 STS [writeS + 4x<10*128>], loadB2;\n",
+        j6c35 => "--:2:-:-:1  \@P0 STS [writeS + 4x<11*128>], loadB3;\n",
+
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 4x<8>;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c46 => "--:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, 4x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+            ),
+        ),
+
+        # Shared by both variants: barrier, XOR-swap of the read/write
+        # offsets with 4x<128*8*2>, then k -= 8 (the decrement is unguarded).
+        j6c63 => "02:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1      IADD32I k, k, -8;\n",
+    );
+    return;
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/sgemm_nt_32x128.sass b/Kernel/SGEMM/Maxwell/sgemm_nt_32x128.sass
new file mode 100644
index 0000000..339c825
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_nt_32x128.sass
@@ -0,0 +1,483 @@
+# Kernel: sgemm_nt_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+// Symbolic names for constants used by sgemm_nt_32x128.
+// szShareA and szShareB are the two sizes combined in the shared-memory
+// offset expressions below; addr_zero equals 4x<2*szShareB + 2*szShareA> and
+// is the address the prologue fills with RZ to clear registers.
+// gridDimA/gridDimB and the param_* entries name constant-bank slots
+// c[0x0][...]; the param_* slots are consecutive 4-byte words from 0x140
+// to 0x188.
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<(128*16 + 32)*2 + (32*16 + 32)*2>
+    szShareA : (32*16 + 32)
+    szShareB : (128*16 + 32)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+// Register names for sgemm_nt_32x128.
+// Registers 0-31 are listed twice: linearly as czero<00-31> and permuted as
+// the cx<0-3>y<0-7> grid. Range 32-79 is listed for the prologue
+// temporaries and again for the j0..j3 operands, and 32-47 / 48-120 are
+// listed once more for the C00y../c<0-3>/ldc.. names.
+// NOTE(review): presumably the overlapping names are live in different
+// phases of the kernel -- confirm before reusing any of them.
+<REGISTER_MAPPING>
+
+    32-79 ~ tidX, lda, ldb, ldaz, ldbz, ldb32, tid1, tid3, tid96, ta, tb00, tb32, tb64, tb96, xmad_ta, xmad_tb, shiftX, tidY<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadA<0-3>
+      84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3>
+
+    100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+
+    110-120 ~ writeAs, writeBs, k, tidY, txa, txb00, txb32, txb64, txb96
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+// Prologue: read the thread and block ids, clear czero00..czero31 through
+// shared memory, then derive one global track pointer for A and four for B
+// (rows txb00, +32, +64, +96) plus the shared-memory write/read offsets.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkB, SR_CTAID.Z;
+--:-:3:-:1      S2R blkA, SR_CTAID.Y;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb32, ldb, 5;
+
+// Write RZ to addr_zero, then load it back into every czero register.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    # Generates 8 LDS.U.128 lines targeting czero00, czero04, ... czero28.
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidX   = tid >> 2
+// tidY   = (tid & 3) << 2
+// shiftX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidX, tid,  2;
+01:-:-:-:1      LOP.AND tid3, tid,  3;
+--:-:-:-:1      SHL     tidY, tid3, 2;
+--:-:-:-:1      SHL     shiftX, tid3, 3;
+
+// trackA += ((blkA*32 + tidX) * lda + tidY) * 4
+// (the XMAD.LO2 additionally adds ldaz * blkZ before the scaling by 4)
+04:-:-:-:1      ISCADD   txa, blkA, tidX, 5;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,  tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 2;
+
+// trackB += ((blkB*128 + tidX) * ldb + tidY) * 4
+// (repeated for rows +32, +64 and +96 by adding ldb32 = ldb << 5 each time)
+02:-:-:-:1      ISCADD txb00, blkB, tidX, 7;
+--:-:-:-:1      IADD   txb32, txb00, 32;
+--:-:-:-:1      IADD   txb64, txb00, 64;
+--:-:-:-:1      IADD   txb96, txb00, 96;
+
+--:-:-:-:1      XMAD.LO  tb00, ldb,  txb00, tidY, xmad_tb;
+08:-:-:-:1      XMAD.LO2 tb00, ldbz, blkZ, tb00;
+--:-:-:-:1      IADD     tb32, tb00, ldb32;
+--:-:-:-:1      IADD     tb64, tb32, ldb32;
+--:-:-:-:1      IADD     tb96, tb64, ldb32;
+
+--:-:-:-:1      LEA      track0B0.CC, tb00, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track0B1,    tb00, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track1B0.CC, tb32, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track1B1,    tb32, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track2B0.CC, tb64, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track2B1,    tb64, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track3B0.CC, tb96, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track3B1,    tb96, param_B[1], RZ, 2;
+
+// writeAs = (tidY*32 + tidX + shiftX) * 4
+// (plus the 4x<szShareA + szShareB> base added by the final ISCADD)
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidY*128 + tidX + shiftX) * 4
+// (plus the 4x<szShareA*2 + szShareB> base added by the final ISCADD)
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeBs, writeBs, shiftX;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    16;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4
+// (plus the 4x<szShareA> base added by the final ISCADD)
+01:-:-:-:1      LOP.AND tid96,  tid,    96;
+--:-:-:-:1      SHR.U32 tid96,  tid96,  2;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readBs, readBs, tid96;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+// swapBuf starts at -4x<szShareA + szShareB>; it is added to the write
+// offsets, subtracted from the read offsets and negated at each swap.
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+// REMAINDER: load one tile of B (four row groups) and A with bounds checks
+// against param_n / param_m and the remaining k, zero-filling every
+// register whose predicate is clear, then derive the loop predicates.
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+<CODE>
+    # $vec selects the 128-bit path: P2..P5 = (txbNN < n) && (tidY < k) and
+    # P6 = (txa < m) && (tidY < k) guard one LDG.E.CI.128 each, with
+    # LDS.U.128 from addr_zero as the fallback.
+    # Otherwise P0..P3 are reused as per-element predicates (tidY+i < k) for
+    # each operand in turn, with MOV ..., RZ as the fallback.
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb32, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb64, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb96, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa,   param_m, PT;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY, k, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY, k, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidY, k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P2 LDG.E.CI.128 load0B, [track0B];
+--:-:2:-:1  @P3 LDG.E.CI.128 load1B, [track1B];
+--:-:3:-:1  @P4 LDG.E.CI.128 load2B, [track2B];
+--:-:4:-:1  @P5 LDG.E.CI.128 load3B, [track3B];
+--:-:5:-:1  @P6 LDG.E.CI.128 loadA,  [trackA];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P2 LDS.U.128 load0B, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load1B, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128 load2B, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.128 load3B, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.128 loadA,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidY1, tidY, 1;
+--:-:-:-:1      IADD tidY2, tidY, 2;
+--:-:-:-:1      IADD tidY3, tidY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI load0B0, [track0B + 4x<0>];
+--:-:1:-:1  @P1 LDG.E.CI load0B1, [track0B + 4x<1>];
+--:-:1:-:1  @P2 LDG.E.CI load0B2, [track0B + 4x<2>];
+--:-:1:-:1  @P3 LDG.E.CI load0B3, [track0B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb32, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI load1B0, [track1B + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI load1B1, [track1B + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI load1B2, [track1B + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI load1B3, [track1B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb64, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P4;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI load2B0, [track2B + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI load2B1, [track2B + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load2B2, [track2B + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI load2B3, [track2B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B0, RZ;
+--:-:-:-:1 @!P1 MOV load2B1, RZ;
+--:-:-:-:1 @!P2 MOV load2B2, RZ;
+--:-:-:-:1 @!P3 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb96, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI load3B0, [track3B + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI load3B1, [track3B + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI load3B2, [track3B + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load3B3, [track3B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B0, RZ;
+--:-:-:-:1 @!P1 MOV load3B1, RZ;
+--:-:-:-:1 @!P2 MOV load3B2, RZ;
+--:-:-:-:1 @!P3 MOV load3B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:5:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:5:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:5:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+// P2/P3 were reused as per-element predicates above, so recompute them as
+// row bounds; P4, P5 and P6 still hold txb64 < n, txb96 < n and txa < m.
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb32, param_n, PT;
+    };
+</CODE>
+
+// Each load predicate survives only if it was set and k >= 32.
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P0, RZ, k, 15;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P0;
+
+</SCHEDULE_BLOCK>
+
+// Store the first loaded tile to shared memory and advance the global
+// track pointers. The four B groups (load0B..load3B) are written relative
+// to writeBs at offsets 4x<row*128 + group*32>; A is written relative to
+// writeAs at offsets 4x<row*32>. Each track pointer is a low/high register
+// pair advanced by 4x<16> with an IADD ... .CC followed by an IADD.X.
+21:-:-:-:1      STS [writeBs + 4x<0*128 + 0*32>], load0B0;
+--:-:-:-:0      IADD   track0B0.CC, track0B0, 4x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 0*32>], load0B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 0*32>], load0B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 0*32>], load0B3;
+
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ;
+
+02:-:-:-:1      STS [writeBs + 4x<0*128 + 1*32>], load1B0;
+--:-:-:-:0      IADD   track1B0.CC, track1B0, 4x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 1*32>], load1B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 1*32>], load1B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 1*32>], load1B3;
+
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;
+
+04:-:-:-:1      STS [writeBs + 4x<0*128 + 2*32>], load2B0;
+--:-:-:-:0      IADD   track2B0.CC, track2B0, 4x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 2*32>], load2B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 2*32>], load2B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 2*32>], load2B3;
+
+--:-:-:-:0      IADD.X track2B1,    track2B1, RZ;
+
+08:-:-:-:1      STS [writeBs + 4x<0*128 + 3*32>], load3B0;
+--:-:-:-:0      IADD   track3B0.CC, track3B0, 4x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 3*32>], load3B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 3*32>], load3B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 3*32>], load3B3;
+
+--:-:-:-:0      IADD.X track3B1,    track3B1, RZ;
+
+10:-:-:-:1      STS [writeAs + 4x<0*32>], loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*32>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*32>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*32>], loadA3;
+
+// Swap buffers around the barrier: the read offsets move by -swapBuf, the
+// write offsets by +swapBuf, and swapBuf is negated for the next swap.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+// High-word half of the trackA add started above (no instruction in
+// between writes .CC).
+--:-:-:-:0      IADD.X trackA1,    trackA1, RZ;
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.CI.128 load0B, [track0B];
+--:-:4:-:1  @P3 LDG.E.CI.128 load1B, [track1B];
+--:-:5:-:1  @P4 LDG.E.CI.128 load2B, [track2B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load3B, [track3B];
+--:-:6:-:1  @P6 LDG.E.CI.128 loadA,  [trackA];
+    } : q{
+--:-:3:-:1  @P2 LDG.E.CI load0B0, [track0B + 4x<0>];
+--:-:3:-:1  @P2 LDG.E.CI load0B1, [track0B + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load0B2, [track0B + 4x<2>];
+--:-:3:-:1  @P2 LDG.E.CI load0B3, [track0B + 4x<3>];
+
+--:-:4:-:1  @P3 LDG.E.CI load1B0, [track1B + 4x<0>];
+--:-:4:-:1  @P3 LDG.E.CI load1B1, [track1B + 4x<1>];
+--:-:4:-:1  @P3 LDG.E.CI load1B2, [track1B + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load1B3, [track1B + 4x<3>];
+
+--:-:5:-:1  @P4 LDG.E.CI load2B0, [track2B + 4x<0>];
+--:-:5:-:1  @P4 LDG.E.CI load2B1, [track2B + 4x<1>];
+--:-:5:-:1  @P4 LDG.E.CI load2B2, [track2B + 4x<2>];
+--:-:5:-:1  @P4 LDG.E.CI load2B3, [track2B + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI load3B0, [track3B + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load3B1, [track3B + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load3B2, [track3B + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load3B3, [track3B + 4x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:6:-:1  @P6 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:6:-:1  @P6 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:6:-:1  @P6 LDG.E.CI loadA3, [trackA + 4x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec;          # true selects the 128-bit LDG variants of the load entries below
+    our $shiftAX = 1;  # shiftAX/shiftBX are presumably read by the included common file (not visible here) - confirm
+    our $shiftBX = 1;
+    our %insert =      # "j<step>c<slot>" => instruction text; presumably spliced into the main loop by the included file
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+        # Steps 3/5/7/9/11: when P0 (k >= 16 after the decrement) is set, store the loaded B0-B3 and A values to shared memory.
+        j3c6   => "04:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 0*32>], load0B0;\n",
+        j3c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 0*32>], load0B1;\n",
+        j3c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 0*32>], load0B2;\n",
+        j3c12  => "--:3:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 0*32>], load0B3;\n",
+
+        j5c6   => "08:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 1*32>], load1B0;\n",
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 1*32>], load1B1;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 1*32>], load1B2;\n",
+        j5c12  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 1*32>], load1B3;\n",
+
+        j7c6   => "10:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 2*32>], load2B0;\n",
+        j7c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 2*32>], load2B1;\n",
+        j7c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 2*32>], load2B2;\n",
+        j7c12  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 2*32>], load2B3;\n",
+
+        j9c6   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 3*32>], load3B0;\n",
+        j9c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 3*32>], load3B1;\n",
+        j9c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 3*32>], load3B2;\n",
+        j9c12  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 3*32>], load3B3;\n",
+
+        j11c6  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<0*32>], loadA0;\n",
+        j11c8  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], loadA1;\n",
+        j11c10 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], loadA2;\n",
+        j11c12 => "--:6:-:-:1  \@P0 STS [writeAs + 4x<3*32>], loadA3;\n",
+        # Advance each track pointer pair by 4x<16>: the low word sets carry (.CC), the high word adds it (.X).
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0B0.CC, track0B0, 4x<16>;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0B1,    track0B1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, 4x<16>;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P4 IADD   track2B0.CC, track2B0, 4x<16>;\n",
+        j7c13  => "--:-:-:-:1  \@P4 IADD.X track2B1,    track2B1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3B0.CC, track3B0, 4x<16>;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3B1,    track3B1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackA0.CC, trackA0, 4x<16>;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackA1,    trackA1, RZ;\n",
+        # Refresh each load predicate: it stays set only while k >= 32.
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j7c14  => "--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+        # End of step 13 (under P0): barrier, then move the shared read/write pointers by swapBuf and negate swapBuf.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Next-tile global loads; wait masks 04/08/10/20 pair with read barriers 3/4/5/6 set by the STS entries (assuming one bit per barrier).
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.128 load0B, [track0B];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.128 load1B, [track1B];\n",
+                j9c29  => "10:-:5:-:1  \@P4 LDG.E.CI.128 load2B, [track2B];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.128 load3B, [track3B];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.128 loadA,  [trackA];\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI load0B0, [track0B + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI load0B1, [track0B + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI load0B2, [track0B + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI load0B3, [track0B + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI load1B0, [track1B + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI load1B1, [track1B + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI load1B2, [track1B + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI load1B3, [track1B + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.CI load2B0, [track2B + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P4 LDG.E.CI load2B1, [track2B + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P4 LDG.E.CI load2B2, [track2B + 4x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P4 LDG.E.CI load2B3, [track2B + 4x<3>];\n",
+                # NOTE(review): the load2B group above sets no write barrier; j7c6 waits on mask 10, which only the load3B group sets here - confirm.
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI load3B0, [track3B + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI load3B1, [track3B + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI load3B2, [track3B + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI load3B3, [track3B + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+            )
+        ),
+        # End of step 15: branch back to LOOP while P0 is set, otherwise to REMAINDER when P1 is set.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="sgemm_common_32x128.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/sgemm_rnn_bprop_common_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_rnn_bprop_common_128x32.sass
new file mode 100644
index 0000000..9f5919a
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_rnn_bprop_common_128x32.sass
@@ -0,0 +1,362 @@
+# sgemm_rnn_bprop_common_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>];
+
+LOOP:
+
+<CODE>
+    # Generate the 16-step x 32-FFMA loop body; each %insert snippet is emitted right after the FFMA slot it is keyed to.
+    our @top;
+    our %insert;
+    our $shiftAX;
+    our $shiftBX;
+    # cOrder: the 32 (x,y) accumulator coordinates (x 0-3, y 0-7); the y list is reversed for the second x pass.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    # Anything already collected in @top is emitted first.
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 15)
+    {
+        my $barrier   = $j & 1 ? 2 : 1;
+        my $rsPred    = $j >= 14 ? '@P0' : '   ';
+        my $loadReg   = ($j + 2) & 3;
+        my $shareLine = ($j + 2) & 15;
+        my $shiftA    = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB    = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute   = $j & 3;
+        # Shared-memory loads of line (j+2) & 15 into register set (j+2) & 3, placed after FFMAs 0, 2 and 4;
+        # they carry $barrier in the third control field and are predicated on P0 once j >= 14.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            # Snippet keyed to this (step, FFMA) slot, if any.
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Only FFMA 0 of a step carries a wait mask: "01" on even steps, "02" on odd steps.
+            my $wait   = $c == 0 ? "0$barrier" : '--';
+            # Stall is 0 when the snippet's first line contains one of these opcodes, otherwise 1.
+            my $stall  = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+            # The yield flag is set only on FFMA 16, and only when its stall is 1.
+            my $yield  = $c == 16 && $stall ? 'Y' : '-';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+// Output-stage setup: load scalar params, choose the time-step offset, build the C/H base addresses and strides.
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+--:-:-:-:5      MOV xcutoff, param_xcutoff;
+// offsetC = (flags & 4) ? param_unrolling - time_step - 1 : time_step
+--:-:-:-:6      LOP.AND.NZ   P0, RZ, flags, 4;
+--:-:-:-:6  @P0 IADD offsetC, -time_step, param_unrolling;
+--:-:-:-:6  @P0 IADD offsetC, offsetC,    -1;
+--:-:-:-:6 @!P0 MOV  offsetC, time_step;
+
+// baseH = param_H + (dimH * offsetC) * 4, with offsetC the time-step index chosen above
+--:-:-:-:1      XMAD     offsetH,   offsetC,   param_dimH, RZ;
+--:-:-:-:1      LEA      baseH0.CC, offsetH,   param_H[0],     2;
+--:-:-:-:1      LEA.HI.X baseH1,    offsetH,   param_H[1], RZ, 2;
+
+// baseC = param_C + (dimC * offsetC) * 4 (offsetC is overwritten with the product)
+--:-:-:-:1      XMAD     offsetC,   offsetC,   param_dimC, RZ;
+--:-:-:-:1      LEA      baseC0.CC, offsetC,   param_C[0],     2;
+--:-:-:-:1      LEA.HI.X baseC1,    offsetC,   param_C[1], RZ, 2;
+
+// writeCs = (readAs / 4) * 32 + readBs;
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 3;
+
+// readCs = (((tid & 96) << 2) + (tid & 31)) << 2;
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 2;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*32 + tid31;
+--:-:-:-:1      ISCADD cx, blkB, tid31, 5;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+--:-:-:-:1      ISCADD  cy00, blkA, cy00, 7;
+
+// C += (cy*ldc + cx) * 4;
+// NOTE(review): the ldcz term is multiplied by RZ below, so it looks like it adds nothing here - confirm.
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, ldc,  cy00, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, RZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, baseC0,     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, baseC1, RZ, 2;
+
+// P4 = (flags & 2) != 0: apply relu in STORE_C
+--:-:-:-:0      LOP.AND.NZ   P4, RZ, flags, 2;
+// P6 = cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+// P5 = (beta != 0) && P6
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+// Scaled row strides: ldc1 = 4*ldc, ldc4 = 16*ldc, ldc60 = 256*ldc - ldc4 = 240*ldc
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
+
+--:-:-:-:1      MOV  ldh1, param_ldh;
+
+// H += (ldh*cy + cx) * 4
+--:-:-:-:1      XMAD.LO  ci, ldh1,  cy00, cx, xmad_c;
+--:-:-:-:1      LEA      H00y0.CC, ci, baseH0,     2;
+--:-:-:-:1      LEA.HI.X H00y1,    ci, baseH1, RZ, 2;
+// Scaled row strides: ldh1 = 4*ldh, ldh4 = 16*ldh, ldh60 = 256*ldh - ldh4 = 240*ldh
+--:-:-:-:1      SHL  ldh1, ldh1, 2;
+--:-:-:-:1      SHL  ldh4, ldh1, 2;
+--:-:-:-:1      SHL  ldh60, ldh1, 6;
+--:-:-:-:1      IADD ldh60, ldh60, -ldh4;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:4      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      MOV d0, RZ;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:4      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      MOV d1, RZ;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:3      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      MOV d2, RZ;
+--:-:-:-:1      MOV d3, RZ;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:1      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:6      IADD   H04y0.CC, H00y0, ldh4;
+--:-:-:-:1      IADD.X H04y1,    H00y1, RZ;
+--:-:-:-:6      IADD   H08y0.CC, H04y0, ldh4;
+--:-:-:-:1      IADD.X H08y1,    H04y1, RZ;
+--:-:-:-:6      IADD   H12y0.CC, H08y0, ldh4;
+--:-:-:-:0      IADD.X H12y1,    H08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+    # Emit eight groups, one per accumulator row y = 0..7: scale cx0-cx3 of that row by alpha, then CAL STORE_C.
+    my $out;
+    foreach my $y (0..7)
+    {
+        $out .=   # emitted before row 4 only: skip ahead 60 rows (cy += 60, C += ldc60, H += ldh60)
+            "--:-:-:-:5      IADD   C00y0.CC, C00y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy00,     cy00,  60;\n" .
+            "--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C04y0.CC, C04y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy04,     cy04,  60;\n" .
+            "--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C08y0.CC, C08y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy08,     cy08,  60;\n" .
+            "--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C12y0.CC, C12y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy12,     cy12,  60;\n" .
+            "--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;\n\n" .
+            "--:-:-:-:6      IADD   H00y0.CC, H00y0, ldh60;\n" .
+            "--:-:-:-:1      IADD.X H00y1,    H00y1, RZ;\n" .
+            "--:-:-:-:6      IADD   H04y0.CC, H04y0, ldh60;\n" .
+            "--:-:-:-:1      IADD.X H04y1,    H04y1, RZ;\n" .
+            "--:-:-:-:6      IADD   H08y0.CC, H08y0, ldh60;\n" .
+            "--:-:-:-:1      IADD.X H08y1,    H08y1, RZ;\n" .
+            "--:-:-:-:6      IADD   H12y0.CC, H12y0, ldh60;\n" .
+            "--:-:-:-:1      IADD.X H12y1,    H12y1, RZ;\n" if $y == 4;
+
+        $out .= sprintf(
+            "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL c3, cx3y%d, alpha;\n",
+            ($y) x 4);
+        # STORE_C consumes c0-c3.
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:1      MOV lockAddr0, param_lockAddr[0];
+--:-:-:-:1      MOV lockAddr1, param_lockAddr[1];
+
+// time_step = time_step + 1; P0 = (time_step < param_unrolling), tested by the loop-back branch at the end
+--:-:-:-:6      IADD time_step, time_step, 1;
+--:-:-:-:1      ISETP.LT.AND P0, PT, time_step, param_unrolling, PT;
+// Synchronize all blocks through the lock word at param_lockAddr.
+// P1 = (tid != 0); blkId = blkB * numAblks + blkA; nextBlk = blkId + 1; P2 = (nextBlk == numBlks) || P1
+--:-:-:-:1      ISETP.NE.AND P1, PT, tid, RZ, PT;
+--:-:-:-:6      XMAD blkId, blkB, param_numAblks, blkA;
+--:-:-:-:6      IADD nextBlk, blkId, 1;
+--:-:-:-:8      ISETP.EQ.OR P2, PT, nextBlk, param_numBlks, P1;
+
+--:-:-:-:5      BAR.SYNC 0;
+// Phase 1: threads with P1 set (tid != 0) take the SYNC; thread 0 of the last block (P2) uses nextBlk = 0.
+--:-:-:-:1      SSY SSY_TARGET1;
+--:-:-:-:d  @P1 SYNC;
+--:-:-:-:6  @P2 MOV nextBlk, RZ;
+// Repeat the CAS (operands blkId, nextBlk) until the value read back equals blkId.
+SPINLOCK1:
+--:-:1:Y:2      ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk;
+01:-:-:Y:d      ISETP.NE.AND P1, PT, lockVal, blkId, PT;
+--:-:-:-:d  @P1 BRA.U SPINLOCK1;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET1:
+--:-:-:-:1      SSY SSY_TARGET2;
+--:-:-:-:d  @P2 SYNC;
+--:-:-:-:6      MOV nextBlk, RZ;
+// Phase 2: threads with P2 set take the SYNC; the rest repeat the CAS (operands blkId, 0) until the value read back is 0.
+SPINLOCK2:
+--:-:1:Y:2      ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk;
+01:-:-:Y:d      ISETP.NE.AND P1, PT, lockVal, RZ, PT;
+--:-:-:-:5  @P1 BRA.U SPINLOCK2;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET2:
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:f      MEMBAR.GL;
+
+//Loop back to beginning of GEMM loop
+--:-:-:Y:5  @P0 BRA.U RNN_LOOP;
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Subroutine (reached via CAL STORE_C): gate c0-c3 by the activation derivative and store four rows; returns via RET.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LDG.E h0, [H00y];
+--:-:-:-:1      LDG.E h1, [H04y];
+--:-:-:-:1      LDG.E h2, [H08y];
+--:-:-:-:1      LDG.E h3, [H12y];
+</SCHEDULE_BLOCK>
+// NOTE(review): the h0-h3 loads above are unpredicated and set no write barrier, yet FSETP reads h0-h3 below - confirm.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+// Read the existing C values under P0-P3 = (cy < m) && P5; rows that fail get d = 0.
+--:-:1:-:1  @P0 LDG.E d0, [C00y];
+--:-:2:-:1  @P1 LDG.E d1, [C04y];
+--:-:3:-:1  @P2 LDG.E d2, [C08y];
+--:-:4:-:1  @P3 LDG.E d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Re-derive P0-P3 = (cy < m) && P6 for the final stores, then step each row counter by one.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:3      IADD cy12, cy12, 1;
+// Optional relu (P4): c = max(c, 0).
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+// Exchange c0-c3 through shared memory: one 128-bit store at writeCs, four scalar loads from readCs.
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*32>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*32>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*32>];
+</SCHEDULE_BLOCK>
+// Save P0-P3 (mask 0x0f); the activation test below reuses them.
+--:-:-:-:1  P2R predSave, PR, RZ, 0x0f;
+// c += d * beta when P5 is set.
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, d3, beta, c3;
+
+// Bprop for activation Rectlinclip: keep c only where 0 < h < xcutoff, otherwise write 0.
+<SCHEDULE_BLOCK>
+--:-:-:-:1  FSETP.LT.AND P0, PT, RZ, h0, PT;
+--:-:-:-:1  FSETP.LT.AND P1, PT, RZ, h1, PT;
+--:-:-:-:1  FSETP.LT.AND P2, PT, RZ, h2, PT;
+--:-:-:-:1  FSETP.LT.AND P3, PT, RZ, h3, PT;
+--:-:-:-:1  FSETP.LT.AND P0, PT, h0, xcutoff, P0;
+--:-:-:-:1  FSETP.LT.AND P1, PT, h1, xcutoff, P1;
+--:-:-:-:1  FSETP.LT.AND P2, PT, h2, xcutoff, P2;
+--:-:-:-:1  FSETP.LT.AND P3, PT, h3, xcutoff, P3;
+--:-:-:-:1  SEL c0, c0, RZ, P0;
+--:-:-:-:1  SEL c1, c1, RZ, P1;
+--:-:-:-:1  SEL c2, c2, RZ, P2;
+--:-:-:-:1  SEL c3, c3, RZ, P3;
+</SCHEDULE_BLOCK>
+// Restore the store predicates P0-P3 saved in predSave.
+--:-:-:Y:d  R2P PR, predSave, 0x0f;
+
+--:1:-:-:1  @P0 STG.E [C00y], c0;
+--:2:-:-:1  @P1 STG.E [C04y], c1;
+--:3:-:-:1  @P2 STG.E [C08y], c2;
+--:4:-:-:1  @P3 STG.E [C12y], c3;
+// Advance the C pointers by ldc1 (one row); wait masks 01/02/04/08 pair with the read barriers 1-4 set by the STGs.
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;
+// Advance the H pointers by ldh1 (one row).
+--:-:-:-:6      IADD   H00y0.CC, H00y0, ldh1;
+--:-:-:-:1      IADD.X H00y1,    H00y1, RZ;
+--:-:-:-:6      IADD   H04y0.CC, H04y0, ldh1;
+--:-:-:-:1      IADD.X H04y1,    H04y1, RZ;
+--:-:-:-:6      IADD   H08y0.CC, H08y0, ldh1;
+--:-:-:-:1      IADD.X H08y1,    H08y1, RZ;
+--:-:-:-:6      IADD   H12y0.CC, H12y0, ldh1;
+--:-:-:-:0      IADD.X H12y1,    H12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/sgemm_rnn_common_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_rnn_common_128x32.sass
new file mode 100644
index 0000000..67bda6f
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_rnn_common_128x32.sass
@@ -0,0 +1,348 @@
+# sgemm_rnn_common_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>];
+
+LOOP:
+
+<CODE>
+    # Generate the 16-step x 32-FFMA loop body; each %insert snippet is emitted right after the FFMA slot it is keyed to.
+    our @top;
+    our %insert;
+    our $shiftAX;
+    our $shiftBX;
+    # cOrder: the 32 (x,y) accumulator coordinates (x 0-3, y 0-7); the y list is reversed for the second x pass.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    # Anything already collected in @top is emitted first.
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 15)
+    {
+        my $barrier   = $j & 1 ? 2 : 1;
+        my $rsPred    = $j >= 14 ? '@P0' : '   ';
+        my $loadReg   = ($j + 2) & 3;
+        my $shareLine = ($j + 2) & 15;
+        my $shiftA    = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB    = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute   = $j & 3;
+        # Shared-memory loads of line (j+2) & 15 into register set (j+2) & 3, placed after FFMAs 0, 2 and 4;
+        # they carry $barrier in the third control field and are predicated on P0 once j >= 14.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            # Snippet keyed to this (step, FFMA) slot, if any.
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Only FFMA 0 of a step carries a wait mask: "01" on even steps, "02" on odd steps.
+            my $wait   = $c == 0 ? "0$barrier" : '--';
+            # Stall is 0 when the snippet's first line contains one of these opcodes, otherwise 1.
+            my $stall  = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+            # The yield flag is set only on FFMA 16, and only when its stall is 1.
+            my $yield  = $c == 16 && $stall ? 'Y' : '-';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+// Output-stage setup: load scalar params, choose the time-step offset, build the C base address and strides.
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+--:-:-:-:5      MOV xcutoff, param_xcutoff;
+// offsetC = (flags & 4) ? param_unrolling - time_step - 1 : time_step
+--:-:-:-:6      LOP.AND.NZ   P0, RZ, flags, 4;
+--:-:-:-:6  @P0 IADD offsetC, -time_step, param_unrolling;
+--:-:-:-:6  @P0 IADD offsetC, offsetC,    -1;
+--:-:-:-:6 @!P0 MOV  offsetC, time_step;
+
+// baseC = param_C + (dimC * offsetC) * 4, with offsetC the time-step index chosen above
+--:-:-:-:1      XMAD     offsetC,   offsetC,   param_dimC, RZ;
+--:-:-:-:1      LEA      baseC0.CC, offsetC,   param_C[0],     2;
+--:-:-:-:1      LEA.HI.X baseC1,    offsetC,   param_C[1], RZ, 2;
+
+// writeCs = (readAs / 4) * 32 + readBs;
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 3;
+
+// readCs = (((tid & 96) << 2) + (tid & 31)) << 2;
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 2;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*32 + tid31;
+--:-:-:-:1      ISCADD cx, blkB, tid31, 5;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+--:-:-:-:1      ISCADD  cy00, blkA, cy00, 7;
+
+// C += (cy*ldc + cx) * 4;
+// NOTE(review): the ldcz term is multiplied by RZ below, so it looks like it adds nothing here - confirm.
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, ldc,  cy00, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, RZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, baseC0,     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, baseC1, RZ, 2;
+
+// P4 = (flags & 2) != 0: apply relu in STORE_C
+--:-:-:-:0      LOP.AND.NZ   P4, RZ, flags, 2;
+// P6 = cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+// P5 = (beta != 0) && P6
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+// Scaled row strides: ldc1 = 4*ldc, ldc4 = 16*ldc, ldc60 = 256*ldc - ldc4 = 240*ldc
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:4      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      MOV d0, RZ;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:4      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      MOV d1, RZ;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:3      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      MOV d2, RZ;
+--:-:-:-:1      MOV d3, RZ;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:1      IADD.X C12y1,    C08y1, RZ;
+
+<SCHEDULE_BLOCK>
+// bias_track = bias + cy
+--:-:-:-:1      LEA      bias00y0.CC, cy00, param_bias[0],     2;
+--:-:-:-:1      LEA.HI.X bias00y1,    cy00, param_bias[1], RZ, 2;
+--:-:-:-:1      LEA      bias04y0.CC, cy04, param_bias[0],     2;
+--:-:-:-:1      LEA.HI.X bias04y1,    cy04, param_bias[1], RZ, 2;
+--:-:-:-:1      LEA      bias08y0.CC, cy08, param_bias[0],     2;
+--:-:-:-:1      LEA.HI.X bias08y1,    cy08, param_bias[1], RZ, 2;
+--:-:-:-:1      LEA      bias12y0.CC, cy12, param_bias[0],     2;
+--:-:-:-:1      LEA.HI.X bias12y1,    cy12, param_bias[1], RZ, 2;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+    # Emit eight groups, one per accumulator row y = 0..7: scale cx0-cx3 of that row by alpha, then CAL STORE_C.
+    my $out;
+    foreach my $y (0..7)
+    {
+        $out .=   # emitted before row 4 only: skip ahead 60 rows (cy += 60, C += ldc60, bias pointers += 240)
+            "--:-:-:-:5      IADD   C00y0.CC, C00y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy00,     cy00,  60;\n" .
+            "--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C04y0.CC, C04y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy04,     cy04,  60;\n" .
+            "--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C08y0.CC, C08y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy08,     cy08,  60;\n" .
+            "--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;\n" .
+            "--:-:-:-:5      IADD   C12y0.CC, C12y0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy12,     cy12,  60;\n" .
+            "--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;\n\n" .
+            "--:-:-:-:6      IADD   bias00y0.CC, bias00y0, 240;\n" .
+            "--:-:-:-:1      IADD.X bias00y1, bias00y1, RZ;\n" .
+            "--:-:-:-:6      IADD   bias04y0.CC, bias04y0, 240;\n" .
+            "--:-:-:-:1      IADD.X bias04y1, bias04y1, RZ;\n" .
+            "--:-:-:-:6      IADD   bias08y0.CC, bias08y0, 240;\n" .
+            "--:-:-:-:1      IADD.X bias08y1, bias08y1, RZ;\n" .
+            "--:-:-:-:6      IADD   bias12y0.CC, bias12y0, 240;\n" .
+            "--:-:-:-:1      IADD.X bias12y1, bias12y1, RZ;\n" if $y == 4;
+
+        $out .= sprintf(
+            "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL c3, cx3y%d, alpha;\n",
+            ($y) x 4);
+        # STORE_C consumes c0-c3.
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:1      MOV lockAddr0, param_lockAddr[0];
+--:-:-:-:1      MOV lockAddr1, param_lockAddr[1];
+
+// time_step = time_step + 1; P0 = (time_step < param_unrolling), tested by the loop-back branch at the end
+--:-:-:-:6      IADD time_step, time_step, 1;
+--:-:-:-:1      ISETP.LT.AND P0, PT, time_step, param_unrolling, PT;
+// Synchronize all blocks through the lock word at param_lockAddr.
+// P1 = (tid != 0); blkId = blkB * numAblks + blkA; nextBlk = blkId + 1; P2 = (nextBlk == numBlks) || P1
+--:-:-:-:1      ISETP.NE.AND P1, PT, tid, RZ, PT;
+--:-:-:-:6      XMAD blkId, blkB, param_numAblks, blkA;
+--:-:-:-:6      IADD nextBlk, blkId, 1;
+--:-:-:-:8      ISETP.EQ.OR P2, PT, nextBlk, param_numBlks, P1;
+
+--:-:-:-:5      BAR.SYNC 0;
+// Phase 1: threads with P1 set (tid != 0) take the SYNC; thread 0 of the last block (P2) uses nextBlk = 0.
+--:-:-:-:1      SSY SSY_TARGET1;
+--:-:-:-:d  @P1 SYNC;
+--:-:-:-:6  @P2 MOV nextBlk, RZ;
+// Repeat the CAS (operands blkId, nextBlk) until the value read back equals blkId.
+SPINLOCK1:
+--:-:1:Y:2      ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk;
+01:-:-:Y:d      ISETP.NE.AND P1, PT, lockVal, blkId, PT;
+--:-:-:-:d  @P1 BRA.U SPINLOCK1;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET1:
+--:-:-:-:1      SSY SSY_TARGET2;
+--:-:-:-:d  @P2 SYNC;
+--:-:-:-:6      MOV nextBlk, RZ;
+// Phase 2: threads with P2 set take the SYNC; the rest repeat the CAS (operands blkId, 0) until the value read back is 0.
+SPINLOCK2:
+--:-:1:Y:2      ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk;
+01:-:-:Y:d      ISETP.NE.AND P1, PT, lockVal, RZ, PT;
+--:-:-:-:5  @P1 BRA.U SPINLOCK2;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET2:
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:f      MEMBAR.GL;
+
+//Loop back to beginning of GEMM loop
+--:-:-:Y:5  @P0 BRA.U RNN_LOOP;
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Subroutine (reached via CAL STORE_C): add bias, apply the clipped activation and store four rows; returns via RET.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LDG.E.CI b0, [bias00y];
+--:-:-:-:1      LDG.E.CI b1, [bias04y];
+--:-:-:-:1      LDG.E.CI b2, [bias08y];
+--:-:-:-:1      LDG.E.CI b3, [bias12y];
+</SCHEDULE_BLOCK>
+// NOTE(review): the b0-b3 loads above are unpredicated and set no write barrier, yet the FADDs read b0-b3 below - confirm.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+// Read the existing C values under P0-P3 = (cy < m) && P5; rows that fail get d = 0.
+--:-:1:-:1  @P0 LDG.E d0, [C00y];
+--:-:2:-:1  @P1 LDG.E d1, [C04y];
+--:-:3:-:1  @P2 LDG.E d2, [C08y];
+--:-:4:-:1  @P3 LDG.E d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Re-derive P0-P3 = (cy < m) && P6 for the final stores, then step each row counter by one.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:3      IADD cy12, cy12, 1;
+// Optional relu (P4): c = max(c, 0).
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+// Exchange c0-c3 through shared memory: one 128-bit store at writeCs, four scalar loads from readCs.
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*32>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*32>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*32>];
+</SCHEDULE_BLOCK>
+// c += d * beta when P5 is set.
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, d3, beta, c3;
+// c += the bias value loaded for each row.
+--:-:-:-:1  FADD c0, c0, b0;
+--:-:-:-:1  FADD c1, c1, b1;
+--:-:-:-:1  FADD c2, c2, b2;
+--:-:-:-:3  FADD c3, c3, b3;
+
+// Activation function Rectlinclip: c = min(max(c, 0), xcutoff)
+<SCHEDULE_BLOCK>
+--:-:-:-:1  FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  FMNMX c2, c2, RZ, !PT;
+--:-:-:-:3  FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:1  FMNMX c0, c0, xcutoff, PT;
+--:-:-:-:1  FMNMX c1, c1, xcutoff, PT;
+--:-:-:-:1  FMNMX c2, c2, xcutoff, PT;
+--:-:-:-:3  FMNMX c3, c3, xcutoff, PT;
+</SCHEDULE_BLOCK>
+
+--:1:-:-:1  @P0 STG.E [C00y], c0;
+--:2:-:-:1  @P1 STG.E [C04y], c1;
+--:3:-:-:1  @P2 STG.E [C08y], c2;
+--:4:-:-:1  @P3 STG.E [C12y], c3;
+// Advance the C pointers by ldc1 (one row); wait masks 01/02/04/08 pair with the read barriers 1-4 set by the STGs.
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;
+// Advance the bias pointers by 4 (one 32-bit element, matching the one-row step).
+--:-:-:-:6      IADD   bias00y0.CC, bias00y0, 4;
+--:-:-:-:1      IADD.X bias00y1,    bias00y1, RZ;
+--:-:-:-:6      IADD   bias04y0.CC, bias04y0, 4;
+--:-:-:-:1      IADD.X bias04y1,    bias04y1, RZ;
+--:-:-:-:6      IADD   bias08y0.CC, bias08y0, 4;
+--:-:-:-:1      IADD.X bias08y1,    bias08y1, RZ;
+--:-:-:-:6      IADD   bias12y0.CC, bias12y0, 4;
+--:-:-:-:0      IADD.X bias12y1,    bias12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Maxwell/sgemm_tn_128x128.sass b/Kernel/SGEMM/Maxwell/sgemm_tn_128x128.sass
new file mode 100644
index 0000000..5099001
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_tn_128x128.sass
@@ -0,0 +1,279 @@
+# Kernel: sgemm_tn_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-95   ~ blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, tid1, tid7, tidX, blk, tid31, tid128
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-95   ~ x<1-3>, y<1-3>
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-103  : loadA<0-3>, loadB<0-3>
+
+    104-107 : trackA<0-1>, trackB<0-1>
+
+    108-121 ~ writeS, lda8, k, tidY, txa, txb, ta, tb, loop
+    122-127 ~ readAs, readBs, tid
+
+    64-75   ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    86-121  ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k, param_k;
+--:-:-:-:1      MOV loop, RZ;
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+
+// tidX = (tid & 31) << 2
+// tidY = (tid >> 5) & 7
+01:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      SHL     tidX,   tid31, 2;
+--:-:-:-:1      BFE.U32 tidY,   tid,  0x305; // 3 bits at position 5
+
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 5;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// trackA = param_A + (blkA*128 + lda*tidY + tidX + ldaz*blkZ) * 4   (LEA shift 0x2 scales by 4)
+02:-:-:-:1      ISCADD   txa, blkA, tidX, 7;
+--:-:-:-:1      XMAD.LO2 ta,  lda,  tidY, txa;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x2;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// trackB = param_B + (blkB*128 + ldb*tidY + tidX + ldbz*blkZ) * 4   (LEA shift 0x2 scales by 4)
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 7;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x2;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeS = ((128*tidY + tidX) * 4) ^ 4x<128*8*2>
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+--:-:-:-:1      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+
+// readBs = ((((tid & 128) >> 4) | ((tid >> 1) & 7)) << 4) + 4x<128*8>   (offset = 4096)
+--:-:-:-:1      LOP.AND tid128, tid,  128;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:1      LOP.AND.NZ P4, RZ, k, 7;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 8, P4;
+
+// doLoad = tidY < k && txa|txb < n|m
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY, k, P6;
+
+--:-:2:-:1  @P2 LDG.E.CI.128 loadA, [trackA];
+--:-:3:-:1  @P3 LDG.E.CI.128 loadB, [trackB];
+
+--:-:5:-:1 @!P2 LDS.U.128 loadA, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 loadB, [addr_zero];
+
+    // Vec 4 and scalar loads
+    } : q{
+
+// doLoadA = tidY < k && txa < m
+// doLoadB = tidY < k && txb < n
+--:-:-:-:1      IADD x1, txa, 1;
+--:-:-:-:1      IADD x2, txa, 2;
+--:-:-:-:1      IADD x3, txa, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_m, P0;
+
+--:-:2:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      IADD y1, txb, 1;
+--:-:-:-:1      IADD y2, txb, 2;
+--:-:-:-:1      IADD y3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, y1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y3, param_n, P0;
+
+--:-:3:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 8, PT;
+    };
+</CODE>
+
+</SCHEDULE_BLOCK>
+
+12:-:-:-:1      STS.128 [writeS + 4x<0*128>], loadA0; // A tile; wait field 12 = barriers 2 and 5 (signalled by the loadA loads in the REMAINDER block)
+
+--:-:-:-:6      IADD   trackA0.CC, trackA0, param_lda8; // low word of trackA += param_lda8, records the carry
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ; // high word absorbs the carry
+
+24:-:-:-:1      STS.128 [writeS + 4x<8*128>], loadB0; // B tile; wait field 24 = barriers 3 and 6 (signalled by the loadB loads in the REMAINDER block)
+
+--:-:-:-:1      IADD   trackB0.CC, trackB0, param_ldb8; // low word of trackB += param_ldb8; the matching IADD.X is the last line of this group
+
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>; // toggle the 4x<128*8*2> bit (presumably switches shared-memory buffer - confirm)
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ; // high word of trackB; none of the instructions since the IADD ... .CC writes CC
+
+
+<CODE>
+    our $vec;   # package flag: true selects the LDG.E.CI.128 entries, false the four scalar LDG.E.CI entries
+    my $k_end = $vec ? 16 : 24;   # k threshold compared against in the P0/P2/P3 tests below
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P5;\n");   # package variable; presumably emitted by the included common file - confirm
+
+    our %insert =   # keyed by jNcM slot names; presumably spliced into the loop of sgemm_common_128x128.sass - confirm
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P6;\n",
+        j0c8  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+
+        ($vec ?
+            (
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI.128 loadA, [trackA];\n",
+        j0c13 => "--:-:3:-:1  \@P3 LDG.E.CI.128 loadB, [trackB];\n",
+            ) :
+            (
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+        j0c29 => "--:-:2:-:1  \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+        j0c31 => "--:-:2:-:1  \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+        j0c33 => "--:-:2:-:1  \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+
+        j0c35 => "--:-:3:-:1  \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+        j1c29 => "--:-:3:-:1  \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+        j1c31 => "--:-:3:-:1  \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+        j1c33 => "--:-:3:-:1  \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+
+        j5c33 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*128>], loadA0;\n",
+
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, param_lda8;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c33 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<8*128>], loadB0;\n",
+
+        j6c46 => "--:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j6c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1      IADD32I k, k, -8;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+    );
+    return;   # this block only defines @top and %insert; it returns no assembly text
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/sgemm_tn_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_tn_128x32.sass
new file mode 100644
index 0000000..0b9ffc1
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_tn_128x32.sass
@@ -0,0 +1,447 @@
+# Kernel: sgemm_tn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<128*16*2 + 32*16*2>
+    szShareA  : 128*16
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 5;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL lda16, lda, 6;
+--:-:-:-:1      SHL ldb16, ldb, 6;
+--:-:-:-:1      SHL lda4,  lda, 2;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidAX = (tid & 31) << 2
+// tidAY = (tid >> 5)
+01:-:-:-:1      LOP.AND tidAX, tid,   31;
+--:-:-:-:1      SHL     tidAX, tidAX, 2;
+--:-:-:-:1      SHR.U32 tidAY, tid,   5;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// track<i>A = param_A + (blkA*128 + tidAX + lda*tidAY + ldaz*blkZ + i*lda4) * 4,  i = 0..3
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO2 ta0, lda,  tidAY, txa;
+08:-:-:-:1      XMAD.LO2 ta0, ldaz, blkZ,  ta0;
+--:-:-:-:1      IADD     ta1, ta0, lda4;
+--:-:-:-:1      IADD     ta2, ta1, lda4;
+--:-:-:-:1      IADD     ta3, ta2, lda4;
+
+--:-:-:-:1      LEA      track0A0.CC, ta0, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track0A1,    ta0, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track1A0.CC, ta1, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track1A1,    ta1, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track2A0.CC, ta2, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track2A1,    ta2, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track3A0.CC, ta3, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track3A1,    ta3, param_A[1], RZ, 2;
+
+// trackB = param_B + (blkB*32 + ldb*tidBY + tidBX + ldbz*blkZ) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidBX,  5;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 2;
+
+// writeAs = (tidAY*128 + tidAX) * 4 + 4x<szShareA + szShareB>
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      IADD tidAY1, tidAY, 4;
+--:-:-:-:1      IADD tidAY2, tidAY, 8;
+--:-:-:-:1      IADD tidAY3, tidAY, 12;
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY,  k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.128 load0A, [track0A];
+--:-:2:-:1  @P1 LDG.E.CI.128 load1A, [track1A];
+--:-:3:-:1  @P2 LDG.E.CI.128 load2A, [track2A];
+--:-:4:-:1  @P3 LDG.E.CI.128 load3A, [track3A];
+--:-:5:-:1  @P4 LDG.E.CI.128 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P0 LDS.U.128 load0A, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 load1A, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.128 load2A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load3A, [addr_zero];
+--:-:6:-:2 @!P4 LDS.U.128 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD txa1,  txa,  1;
+--:-:-:-:1      IADD txa2,  txa,  2;
+--:-:-:-:1      IADD txa3,  txa,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI load0A0, [track0A + 4x<0>];
+--:-:1:-:1  @P1 LDG.E.CI load0A1, [track0A + 4x<1>];
+--:-:1:-:1  @P2 LDG.E.CI load0A2, [track0A + 4x<2>];
+--:-:1:-:1  @P3 LDG.E.CI load0A3, [track0A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY1, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI load1A0, [track1A + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI load1A1, [track1A + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI load1A2, [track1A + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI load1A3, [track1A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY2, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P6;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI load2A0, [track2A + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI load2A1, [track2A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load2A2, [track2A + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI load2A3, [track2A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY3, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI load3A0, [track3A + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI load3A1, [track3A + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI load3A2, [track3A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load3A3, [track3A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:5:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:5:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:5:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 15;
+
+</SCHEDULE_BLOCK>
+
+21:-:-:-:1      STS.128 [writeAs + 4x<0*128>], load0A; // wait field 21 = barriers 1 and 6 (load0A LDG and the zero-fill LDS group)
+--:-:-:-:6      IADD   track0A0.CC, track0A0, lda16;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+02:-:-:-:1      STS.128 [writeAs + 4x<4*128>], load1A; // waits on barrier 2 (load1A)
+--:-:-:-:6      IADD   track1A0.CC, track1A0, lda16;
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS.128 [writeAs + 4x<8*128>], load2A; // waits on barrier 3 (load2A)
+--:-:-:-:6      IADD   track2A0.CC, track2A0, lda16;
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS.128 [writeAs + 4x<12*128>], load3A; // waits on barrier 4 (load3A)
+--:-:-:-:6      IADD   track3A0.CC, track3A0, lda16;
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+10:-:-:-:1      STS.128 [writeBs], loadB; // waits on barrier 5 (loadB)
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16; // low word; the matching IADD.X is the last line of this group
+
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P1; // P1 = (k > 16) && (k & 15), completing bDoRemainder
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf; // flip the sign so the next swap moves the addresses back
+
+--:-:-:-:0      IADD.X trackB1,    trackB1, RZ; // high word of trackB; none of the instructions since the IADD ... .CC writes CC
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:3:-:1  @P5 LDG.E.CI.128 load0A, [track0A];
+--:-:4:-:1  @P5 LDG.E.CI.128 load1A, [track1A];
+--:-:5:-:1  @P5 LDG.E.CI.128 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.CI.128 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.CI.128 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P5 LDG.E.CI load0A0, [track0A + 4x<0>];
+--:-:3:-:1  @P5 LDG.E.CI load0A1, [track0A + 4x<1>];
+--:-:3:-:1  @P5 LDG.E.CI load0A2, [track0A + 4x<2>];
+--:-:3:-:1  @P5 LDG.E.CI load0A3, [track0A + 4x<3>];
+
+--:-:4:-:1  @P5 LDG.E.CI load1A0, [track1A + 4x<0>];
+--:-:4:-:1  @P5 LDG.E.CI load1A1, [track1A + 4x<1>];
+--:-:4:-:1  @P5 LDG.E.CI load1A2, [track1A + 4x<2>];
+--:-:4:-:1  @P5 LDG.E.CI load1A3, [track1A + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI load2A0, [track2A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load2A1, [track2A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load2A2, [track2A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load2A3, [track2A + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI load3A0, [track3A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load3A1, [track3A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load3A2, [track3A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load3A3, [track3A + 4x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:6:-:1  @P6 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:6:-:1  @P6 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:6:-:1  @P6 LDG.E.CI loadB3, [trackB + 4x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec;   # package flag: true selects the LDG.E.CI.128 entries, false the four scalar LDG.E.CI entries
+    our $shiftAX = 0;   # package variable, unused in this block; presumably read by the included common file - confirm
+    our $shiftBX = 0;   # package variable, unused in this block; presumably read by the included common file - confirm
+    our %insert =   # keyed by jNcM slot names; presumably spliced into the loop of sgemm_common_128x32.sass - confirm
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        j3c6   => "04:3:-:-:1  \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n",
+        j5c6   => "08:4:-:-:1  \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n",
+        j7c6   => "10:-:-:-:1  \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n",
+        j9c6   => "--:5:-:-:1  \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n",
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",
+
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, lda16;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, lda16;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P5 IADD   track2A0.CC, track2A0, lda16;\n",
+        j7c13  => "--:-:-:-:1  \@P5 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, lda16;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.128 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.128 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.128 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.128 loadB,  [trackB];\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI load0A0, [track0A + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI load0A1, [track0A + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI load0A2, [track0A + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI load0A3, [track0A + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI load1A0, [track1A + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI load1A1, [track1A + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI load1A2, [track1A + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI load1A3, [track1A + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI load2A0, [track2A + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P5 LDG.E.CI load2A1, [track2A + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P5 LDG.E.CI load2A2, [track2A + 4x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P5 LDG.E.CI load2A3, [track2A + 4x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI load3A0, [track3A + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI load3A1, [track3A + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI load3A2, [track3A + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI load3A3, [track3A + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';   # this block only defines package variables; it contributes no assembly text of its own
+</CODE>
+
+<INCLUDE file="sgemm_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/sgemm_tn_128x64.sass b/Kernel/SGEMM/Maxwell/sgemm_tn_128x64.sass
new file mode 100644
index 0000000..74f13cc
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_tn_128x64.sass
@@ -0,0 +1,326 @@
+# Kernel: sgemm_tn_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Names for the zero-fill scratch address (addr_zero) and for the constant-bank operands that the code below refers to.
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*2 + 64*8*2 + 0>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+# Register names. Ranges such as 64-95 are listed several times, so names on those lines overlap. NOTE(review): the ':' versus '~' distinction is defined by the assembler - confirm there.
+<REGISTER_MAPPING>
+
+    64-95   ~ lda, ldb, ldaz, ldbz, tid1, ta, tb, tid7, tid15, tidX, blk, txa64, xmad_tb, tid, blkA, blkB, blkZ
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-95   ~ x<1-3>, x<65-67>, y<1-3>
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+     96-107 : loadA<0-7>,  loadB<0-3>
+    108-111 : trackA<0-1>, trackB<0-1>
+
+    112-125 ~ writeAs, writeBs, k, tidY, txa, txb, swapBuf
+    126-127 ~ readAs, readBs
+
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    86-125  ~ tid_2, blockA, blockB, blockZ, ldc, ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, ci, xmad_c, alpha, beta, flags, tid31, tid96
+
+</REGISTER_MAPPING>
+# Setup: read thread and block ids, zero registers czero00..czero63, then derive the global track pointers, bounds predicates and shared-memory offsets.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k, param_k;
+--:-:-:-:1      STS.128 [addr_zero], RZ; // zeros that the generated LDS lines below read back into czero00..czero60
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;
+</CODE>
+
+--:-:-:-:1      LOP.AND tid1,  tid, 1;
+01:-:-:-:1      LOP.AND tid15, tid, 15;
+
+// tidX = (tid & 15) << 2
+// tidY = (tid >> 4) & 7
+--:-:-:-:1      SHL     tidX, tid15, 2;
+--:-:-:-:1      BFE.U32 tidY, tid,   0x304; // 3 bits at position 4
+
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 5; // lda = param_lda8 >> 5
+--:-:-:-:1      SHR.U32 ldb, ldb, 5; // ldb = param_ldb8 >> 5
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+
+// trackA = param_A + (blkA*128 + lda*tidY + tidX + ldaz*blkZ) * 4
+02:-:-:-:1      ISCADD   txa, blkA, tidX,  7;
+--:-:-:-:1      XMAD.LO2 ta,  lda,  tidY, txa;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x2;
+
+--:-:-:-:1      IADD txa64, txa, 64;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa,   param_m, PT; // P4 = txa < param_m
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa64, param_m, PT; // P5 = txa + 64 < param_m
+
+// trackB = param_B + (blkB*64 + tidX + ldb*tidY + ldbz*blkZ) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 6;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x2;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT; // P6 = txb < param_n
+
+// Start the write buffers high
+// writeAs = (128*tidY + tidX) * 4 + 4x<64*8 + 128*8>
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2;
+// writeBs = (64*tidY + tidX) * 4 + 4x<64*8 + 128*8*2>
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2;
+
+// Start the read buffers low
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<128*8>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<64*8 + 128*8>; // sign is flipped on every buffer swap
+</SCHEDULE_BLOCK>
+
+# REMAINDER: load one k-slice of A and B, zero-filling elements whose predicate is false. Also re-entered from the loop through the P1 branch.
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+// With $vec set each tile part is fetched by one 128-bit load - otherwise by four scalar loads with per-element bounds checks.
+<CODE>
+    our $vec; # not assigned anywhere in this file
+    return $vec ? q{
+
+// doLoad = tidY < k && txa|txb < n|m
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY, k, P6;
+
+<ORDERED>
+--:-:2:-:1  @P1 LDG.E.CI.128 loadA0, [trackA + 4x< 0>];
+--:-:3:-:1  @P2 LDG.E.CI.128 loadA4, [trackA + 4x<64>];
+--:-:4:-:1  @P3 LDG.E.CI.128 loadB0, [trackB];
+
+--:-:5:-:2 @!P1 LDS.U.128 loadA0, [addr_zero];
+--:-:5:-:2 @!P2 LDS.U.128 loadA4, [addr_zero];
+--:-:6:-:2 @!P3 LDS.U.128 loadB0, [addr_zero];
+</ORDERED>
+
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:1      LOP.AND.NZ P1, RZ, k, 7;
+
+    // Vec 4 and scalar loads
+    } : q{
+
+// doLoadA = tidY < k && txa < m
+// doLoadB = tidY < k && txb < n
+--:-:-:-:1      IADD x1,  txa, 1;
+--:-:-:-:1      IADD x2,  txa, 2;
+--:-:-:-:1      IADD x3,  txa, 3;
+--:-:-:-:1      IADD x65, txa, 65;
+--:-:-:-:1      IADD x66, txa, 66;
+--:-:-:-:1      IADD x67, txa, 67;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_m, P0;
+
+--:-:2:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x65, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x66, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x67, param_m, P0;
+
+--:-:3:-:1  @P0 LDG.E.CI loadA4, [trackA + 4x<64>];
+--:-:3:-:1  @P1 LDG.E.CI loadA5, [trackA + 4x<65>];
+--:-:3:-:1  @P2 LDG.E.CI loadA6, [trackA + 4x<66>];
+--:-:3:-:1  @P3 LDG.E.CI loadA7, [trackA + 4x<67>];
+
+--:-:-:-:1 @!P0 MOV loadA4, RZ;
+--:-:-:-:1 @!P1 MOV loadA5, RZ;
+--:-:-:-:1 @!P2 MOV loadA6, RZ;
+--:-:-:-:1 @!P3 MOV loadA7, RZ;
+
+--:-:-:-:1      IADD y1, txb, 1;
+--:-:-:-:1      IADD y2, txb, 2;
+--:-:-:-:1      IADD y3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, y1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y3, param_n, P0;
+
+--:-:4:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+    };
+</CODE>
+
+</SCHEDULE_BLOCK>
+# Commit the loaded slice to shared memory, step both track pointers by param_lda8 and param_ldb8, then swap the read and write buffer offsets around the barrier.
+12:-:-:-:1      STS.128 [writeAs + 4x< 0>], loadA0;
+04:-:-:-:1      STS.128 [writeAs + 4x<64>], loadA4;
+
+--:-:-:-:6      IADD   trackA0.CC, trackA0, param_lda8;
+--:-:-:-:0      IADD.X trackA1,    trackA1, RZ;
+
+28:-:-:-:1      STS.128 [writeBs], loadB0;
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:1      IADD.X trackB1,    trackB1, RZ;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+# P1 = k > 8, and when $vec is true also the P1 computed in the vector load path (k & 7 is non-zero). P1 gates the branch back to REMAINDER.
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, P1;
+    } : q{
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+    };
+</CODE>
+# Fills @top and %insert with instruction text keyed by jNcM slot names. Presumably spliced into the main loop by the common file included below - confirm there.
+<CODE>
+    our $vec;
+    my $k_end = $vec ? 16 : 24; # threshold that k is compared against in every ISETP.GE below
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P4;\n");
+
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P5;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, P6;\n",
+
+        ($vec ?
+            (
+        j0c13 => "--:-:2:-:1  \@P2 LDG.E.CI.128 loadA0, [trackA + 4x< 0>];\n",
+        j0c15 => "--:-:3:-:1  \@P3 LDG.E.CI.128 loadA4, [trackA + 4x<64>];\n",
+        j0c33 => "--:-:4:-:1  \@P0 LDG.E.CI.128 loadB0, [trackB];\n",
+            ) :
+            (
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+
+        j0c29 => "--:-:3:-:1  \@P3 LDG.E.CI loadA4, [trackA + 4x<64>];\n",
+        j0c31 => "--:-:3:-:1  \@P3 LDG.E.CI loadA5, [trackA + 4x<65>];\n",
+        j0c33 => "--:-:3:-:1  \@P3 LDG.E.CI loadA6, [trackA + 4x<66>];\n",
+        j0c35 => "--:-:3:-:1  \@P3 LDG.E.CI loadA7, [trackA + 4x<67>];\n",
+
+        j1c29 => "--:-:4:-:1  \@P0 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+        j1c31 => "--:-:4:-:1  \@P0 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+        j1c33 => "--:-:4:-:1  \@P0 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+        j1c35 => "--:-:4:-:1  \@P0 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+        # P0 is recomputed here without the P6 bounds term - it then gates the stores, pointer bumps, barrier and the LOOP branch.
+        j1c37 => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+
+        j1c39 => "--:-:-:-:1      IADD32I k, k, -8;\n",
+
+        j5c31 => "02:-:-:-:1  \@P0 STS.128 [writeAs + 4x< 0>], loadA0;\n",
+        j5c33 => "04:-:-:-:1  \@P0 STS.128 [writeAs + 4x<64>], loadA4;\n",
+
+        j5c46 => "--:-:-:-:1  \@P0 IADD   trackA0.CC, trackA0, param_lda8;\n",
+        j5c54 => "--:-:-:-:1  \@P0 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c39 => "08:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j6c46 => "--:-:-:-:1  \@P0 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j6c54 => "--:-:-:-:1  \@P0 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j6c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+    );
+    return;
+</CODE>
+
+<INCLUDE file="sgemm_common_128x64.sass"/>
diff --git a/Kernel/SGEMM/Maxwell/sgemm_tn_rnn_bprop_128x32.sass b/Kernel/SGEMM/Maxwell/sgemm_tn_rnn_bprop_128x32.sass
new file mode 100644
index 0000000..3db4612
--- /dev/null
+++ b/Kernel/SGEMM/Maxwell/sgemm_tn_rnn_bprop_128x32.sass
@@ -0,0 +1,476 @@
+# Kernel: sgemm_tn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Names for the zero-fill scratch address, the two shared tile sizes (szShareA, szShareB) and the constant-bank operands that the code below refers to.
+<CONSTANT_MAPPING>
+    addr_zero : 4x<128*16*2 + 32*16*2>
+    szShareA  : 128*16
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]        : c[0x0][0x140]
+    param_C[1]        : c[0x0][0x144]
+    param_A[0]        : c[0x0][0x148]
+    param_A[1]        : c[0x0][0x14c]
+    param_B[0]        : c[0x0][0x150]
+    param_B[1]        : c[0x0][0x154]
+    param_H[0]        : c[0x0][0x158]
+    param_H[1]        : c[0x0][0x15c]
+    param_lockAddr[0] : c[0x0][0x160]
+    param_lockAddr[1] : c[0x0][0x164]
+    param_alpha       : c[0x0][0x168]
+    param_beta        : c[0x0][0x16c]
+    param_xcutoff     : c[0x0][0x170]
+    param_flags       : c[0x0][0x174]
+    param_lda8        : c[0x0][0x178]
+    param_ldb8        : c[0x0][0x17c]
+    param_ldc         : c[0x0][0x180]
+    param_ldh         : c[0x0][0x184]
+    param_m           : c[0x0][0x188]
+    param_n           : c[0x0][0x18c]
+    param_k           : c[0x0][0x190]
+    param_ldaz        : c[0x0][0x194]
+    param_ldbz        : c[0x0][0x198]
+    param_ldcz        : c[0x0][0x19c]
+    param_loops       : c[0x0][0x1a0]
+    param_dimB        : c[0x0][0x1a4]
+    param_dimC        : c[0x0][0x1a8]
+    param_dimH        : c[0x0][0x1ac]
+    param_unrolling   : c[0x0][0x1b0]
+    param_numBlks     : c[0x0][0x1b4]
+    param_numAblks    : c[0x0][0x1b8]
+</CONSTANT_MAPPING>
+# Register names. Ranges such as 32-79 and 80-83 are listed more than once, so names on those lines overlap. NOTE(review): the ':' versus '~' distinction is defined by the assembler - confirm there.
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3>, offsetB
+    80-81 : baseB<0-1>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, time_step
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+    48-63 : H00y<0-1>, H04y<0-1>, H08y<0-1>, H12y<0-1>, h0, h1, h2, h3, baseC<0-1>, baseH<0-1>
+    64-68 : blkId, nextBlk, lockAddr<0-1>, lockVal
+   69-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags, xcutoff, offsetC, offsetH, numBlk, predSave, ldh1, ldh4, ldh60
+
+</REGISTER_MAPPING>
+# Executed once, before the RNN_LOOP label: read thread and block ids, clear time_step and copy param_flags into a register.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+
+--:-:-:-:1      MOV time_step, RZ;
+--:-:-:-:1      MOV flags, param_flags;
+# RNN_LOOP: setup that runs from this label. Reloads k and the strides, zeroes czero00..czero31, picks the B base for time_step and rebuilds every track pointer and shared-memory offset.
+RNN_LOOP:
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 5;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL lda16, lda, 6; // lda16 = lda * 64
+--:-:-:-:1      SHL ldb16, ldb, 6; // ldb16 = ldb * 64
+--:-:-:-:1      SHL lda4,  lda, 2; // lda4 = lda * 4
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+--:-:-:-:6      LOP.AND.NZ   P0, RZ, flags, 4;
+--:-:-:-:6  @P0 IADD offsetB, -time_step, param_unrolling;
+--:-:-:-:6  @P0 IADD offsetB, offsetB,    -1;
+--:-:-:-:6 @!P0 MOV  offsetB, time_step;
+// offsetB = time_step, or (param_unrolling - 1 - time_step) when (flags & 4) is set
+// baseB = param_B + (dimB * offsetB) * 4
+--:-:-:-:1      XMAD     offsetB,   offsetB,   param_dimB, RZ;
+--:-:-:-:1      LEA      baseB0.CC, offsetB,   param_B[0],     2;
+--:-:-:-:1      LEA.HI.X baseB1,    offsetB,   param_B[1], RZ, 2;
+
+// tidAX = (tid & 31) << 2
+// tidAY = (tid >> 5)
+01:-:-:-:1      LOP.AND tidAX, tid,   31;
+--:-:-:-:1      SHL     tidAX, tidAX, 2;
+--:-:-:-:1      SHR.U32 tidAY, tid,   5;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// track0A = param_A + (blkA*128 + tidAX + lda*tidAY) * 4 - track1A..track3A each add a further lda4
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO2 ta0, lda,  tidAY, txa;
+08:-:-:-:1      XMAD.LO2 ta0, ldaz, RZ,    ta0; // NOTE(review): multiplier is RZ, so this presumably leaves ta0 unchanged - confirm
+--:-:-:-:1      IADD     ta1, ta0, lda4;
+--:-:-:-:1      IADD     ta2, ta1, lda4;
+--:-:-:-:1      IADD     ta3, ta2, lda4;
+
+--:-:-:-:1      LEA      track0A0.CC, ta0, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track0A1,    ta0, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track1A0.CC, ta1, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track1A1,    ta1, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track2A0.CC, ta2, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track2A1,    ta2, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track3A0.CC, ta3, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track3A1,    ta3, param_A[1], RZ, 2;
+
+// trackB = baseB + (blkB*32 + ldb*tidBY + tidBX) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidBX,  5;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, RZ,    tb; // NOTE(review): multiplier is RZ, so this presumably leaves tb unchanged - confirm
+--:-:-:-:1      LEA      trackB0.CC, tb, baseB0,     2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, baseB1, RZ, 2;
+
+// writeAs = (tidAY*128 + tidAX) * 4 + 4 * (szShareA + szShareB)
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4 + 4 * (szShareA*2 + szShareB)
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4 * szShareA
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>; // sign is flipped on every buffer swap
+</SCHEDULE_BLOCK>
+# REMAINDER: load one k-slice (four A parts and one B part), zero-filling elements whose predicate is false, then set the loop predicates P2, P3, P5, P6 and P1.
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+// tidAY1..tidAY3 = tidAY + 4, 8, 12 - compared against k for the track1A..track3A loads
+--:-:-:-:1      IADD tidAY1, tidAY, 4;
+--:-:-:-:1      IADD tidAY2, tidAY, 8;
+--:-:-:-:1      IADD tidAY3, tidAY, 12;
+// With $vec set the loads are 128-bit with LDS zero-fill - otherwise scalar with per-element bounds checks and MOV zero-fill.
+<CODE>
+    our $vec; # not assigned anywhere in this file
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY,  k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.128 load0A, [track0A];
+--:-:2:-:1  @P1 LDG.E.128 load1A, [track1A];
+--:-:3:-:1  @P2 LDG.E.128 load2A, [track2A];
+--:-:4:-:1  @P3 LDG.E.128 load3A, [track3A];
+--:-:5:-:1  @P4 LDG.E.128 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P0 LDS.U.128 load0A, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 load1A, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.128 load2A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load3A, [addr_zero];
+--:-:6:-:2 @!P4 LDS.U.128 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD txa1,  txa,  1;
+--:-:-:-:1      IADD txa2,  txa,  2;
+--:-:-:-:1      IADD txa3,  txa,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E load0A0, [track0A + 4x<0>];
+--:-:1:-:1  @P1 LDG.E load0A1, [track0A + 4x<1>];
+--:-:1:-:1  @P2 LDG.E load0A2, [track0A + 4x<2>];
+--:-:1:-:1  @P3 LDG.E load0A3, [track0A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY1, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E load1A0, [track1A + 4x<0>];
+--:-:2:-:1  @P1 LDG.E load1A1, [track1A + 4x<1>];
+--:-:2:-:1  @P2 LDG.E load1A2, [track1A + 4x<2>];
+--:-:2:-:1  @P3 LDG.E load1A3, [track1A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY2, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P6;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E load2A0, [track2A + 4x<0>];
+--:-:3:-:1  @P1 LDG.E load2A1, [track2A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E load2A2, [track2A + 4x<2>];
+--:-:3:-:1  @P3 LDG.E load2A3, [track2A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY3, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E load3A0, [track3A + 4x<0>];
+--:-:4:-:1  @P1 LDG.E load3A1, [track3A + 4x<1>];
+--:-:4:-:1  @P2 LDG.E load3A2, [track3A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E load3A3, [track3A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E loadB0, [trackB + 4x<0>];
+--:-:5:-:1  @P1 LDG.E loadB1, [trackB + 4x<1>];
+--:-:5:-:1  @P2 LDG.E loadB2, [trackB + 4x<2>];
+--:-:5:-:1  @P3 LDG.E loadB3, [trackB + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P5; // P2 = k >= 32 and P5 (P5 is txa < param_m on entry)
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P5; // P3 = same condition as P2
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5; // P5 = same condition, overwriting the bounds-only value
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6; // P6 = k >= 32 and P6 (P6 is txb < param_n on entry)
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 15;
+
+</SCHEDULE_BLOCK>
+# Commit the loaded slice to shared memory, step the track pointers by lda16 and ldb16, finish the P1 remainder test and swap the read and write buffer offsets around the barrier.
+21:-:-:-:1      STS.128 [writeAs + 4x<0*128>], load0A;
+--:-:-:-:6      IADD   track0A0.CC, track0A0, lda16;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+02:-:-:-:1      STS.128 [writeAs + 4x<4*128>], load1A;
+--:-:-:-:6      IADD   track1A0.CC, track1A0, lda16;
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS.128 [writeAs + 4x<8*128>], load2A;
+--:-:-:-:6      IADD   track2A0.CC, track2A0, lda16;
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS.128 [writeAs + 4x<12*128>], load3A;
+--:-:-:-:6      IADD   track3A0.CC, track3A0, lda16;
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+10:-:-:-:1      STS.128 [writeBs], loadB;
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16;
+
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P1; // P1 = k > 16 and the earlier P1 (k & 15 is non-zero)
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD.X trackB1,    trackB1, RZ; // presumably consumes the carry of the trackB0.CC add issued before the swap code - confirm
+# Loads for the next slice: A parts are predicated on P5 and B on P6, both of which now include k >= 32. The scalar variant applies that single predicate to all four elements of a part.
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:3:-:1  @P5 LDG.E.128 load0A, [track0A];
+--:-:4:-:1  @P5 LDG.E.128 load1A, [track1A];
+--:-:5:-:1  @P5 LDG.E.128 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.128 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.128 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P5 LDG.E load0A0, [track0A + 4x<0>];
+--:-:3:-:1  @P5 LDG.E load0A1, [track0A + 4x<1>];
+--:-:3:-:1  @P5 LDG.E load0A2, [track0A + 4x<2>];
+--:-:3:-:1  @P5 LDG.E load0A3, [track0A + 4x<3>];
+
+--:-:4:-:1  @P5 LDG.E load1A0, [track1A + 4x<0>];
+--:-:4:-:1  @P5 LDG.E load1A1, [track1A + 4x<1>];
+--:-:4:-:1  @P5 LDG.E load1A2, [track1A + 4x<2>];
+--:-:4:-:1  @P5 LDG.E load1A3, [track1A + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E load2A0, [track2A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E load2A1, [track2A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E load2A2, [track2A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E load2A3, [track2A + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E load3A0, [track3A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E load3A1, [track3A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E load3A2, [track3A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E load3A3, [track3A + 4x<3>];
+
+--:-:6:-:1  @P6 LDG.E loadB0, [trackB + 4x<0>];
+--:-:6:-:1  @P6 LDG.E loadB1, [trackB + 4x<1>];
+--:-:6:-:1  @P6 LDG.E loadB2, [trackB + 4x<2>];
+--:-:6:-:1  @P6 LDG.E loadB3, [trackB + 4x<3>];
+    };
+</CODE>
+# Fills %insert with instruction text keyed by jNcM slot names. Presumably spliced into the main loop by the common file included below - confirm there.
+<CODE>
+    our $vec;
+    our $shiftAX = 0; # not referenced again in this file
+    our $shiftBX = 0; # not referenced again in this file
+    our %insert =
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+        # P0 (k >= 16 after the decrement) gates the stores, the barrier and the LOOP branch below.
+        j3c6   => "04:3:-:-:1  \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n",
+        j5c6   => "08:4:-:-:1  \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n",
+        j7c6   => "10:-:-:-:1  \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n",
+        j9c6   => "--:5:-:-:1  \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n",
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",
+
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, lda16;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, lda16;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P5 IADD   track2A0.CC, track2A0, lda16;\n",
+        j7c13  => "--:-:-:-:1  \@P5 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, lda16;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+        # Each load predicate is re-ANDed with k >= 32 - P5 covers both track2A and track3A.
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.128 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.128 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.128 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.128 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.128 loadB,  [trackB];\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E load0A0, [track0A + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E load0A1, [track0A + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E load0A2, [track0A + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E load0A3, [track0A + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E load1A0, [track1A + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E load1A1, [track1A + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E load1A2, [track1A + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E load1A3, [track1A + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E load2A0, [track2A + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P5 LDG.E load2A1, [track2A + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P5 LDG.E load2A2, [track2A + 4x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P5 LDG.E load2A3, [track2A + 4x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E load3A0, [track3A + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E load3A1, [track3A + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E load3A2, [track3A + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E load3A3, [track3A + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E loadB0, [trackB + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E loadB1, [trackB + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E loadB2, [trackB + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="sgemm_rnn_bprop_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Pascal/hgemm_common_128x128.sass b/Kernel/SGEMM/Pascal/hgemm_common_128x128.sass
new file mode 100644
index 0000000..d699483
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_common_128x128.sass
@@ -0,0 +1,412 @@
+# hgemm_common_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+[-
+
+our $int16;
+
+sub convert_in {
+    # Opcode that widens a loaded C element to fp32 (s16 source if $int16, else fp16).
+    return ('F2F.F32.F16', 'I2F.F32.S16')[$int16 ? 1 : 0];
+}
+
+sub convert_out {
+    # Opcode that narrows an fp32 result for storing (to s16 if $int16, else to fp16).
+    return ('F2F.F16.F32', 'F2I.S16.F32')[$int16 ? 1 : 0];
+}
+
+sub scale_int16 {    # SASS that multiplies c0..c3 by param_scale; empty unless $int16
+    return "" unless $int16;
+    return q{
+--:-:-:-:1      FMUL c0, c0, param_scale;
+--:-:-:-:1      FMUL c1, c1, param_scale;
+--:-:-:-:1      FMUL c2, c2, param_scale;
+--:-:-:-:0      FMUL c3, c3, param_scale;
+    };
+}
+
+sub max_abs1 {    # $int16 only: cs0..cs3 = abs-diff of c against RZ (low half) under P0..P3, else 0
+    return "" unless $int16;
+    return q{
+--:-:-:-:1 @!P0 MOV cs0, RZ;
+--:-:-:-:1 @!P1 MOV cs1, RZ;
+--:-:-:-:1 @!P2 MOV cs2, RZ;
+--:-:-:-:1 @!P3 MOV cs3, RZ;
+
+--:-:-:-:1  @P0 VABSDIFF.S16.S16.MRG_16L cs0, c0, RZ, RZ;
+--:-:-:-:1  @P1 VABSDIFF.S16.S16.MRG_16L cs1, c1, RZ, RZ;
+--:-:-:-:1  @P2 VABSDIFF.S16.S16.MRG_16L cs2, c2, RZ, RZ;
+--:-:-:-:1  @P3 VABSDIFF.S16.S16.MRG_16L cs3, c3, RZ, RZ;
+    };
+}
+
+sub max_abs2 {    # $int16 only: merge the second abs value into cs0..cs3 and fold both halves into maxabs
+    return "" unless $int16;
+    return q{
+<SCHEDULE_BLOCK>
+
+// a = abs(a)
+--:-:-:-:1  @P0 VABSDIFF.S16.S16.MRG_16H cs0, c0, RZ, cs0;
+--:-:-:-:1  @P1 VABSDIFF.S16.S16.MRG_16H cs1, c1, RZ, cs1;
+--:-:-:-:1  @P2 VABSDIFF.S16.S16.MRG_16H cs2, c2, RZ, cs2;
+--:-:-:-:1  @P3 VABSDIFF.S16.S16.MRG_16H cs3, c3, RZ, cs3;
+
+// max = max(c,d,max(a,b,max)) ...
+--:-:-:-:1      VMNMX.UD.U16.U16.MX.MAX maxabs, cs0, cs0.H1, maxabs;
+--:-:-:-:1      VMNMX.UD.U16.U16.MX.MAX maxabs, cs1, cs1.H1, maxabs;
+--:-:-:-:1      VMNMX.UD.U16.U16.MX.MAX maxabs, cs2, cs2.H1, maxabs;
+--:-:-:-:1      VMNMX.UD.U16.U16.MX.MAX maxabs, cs3, cs3.H1, maxabs;
+</SCHEDULE_BLOCK>
+
+    };
+}
+
+sub butterfly {    # $int16 only: SHFL.BFLY/IMNMX reduction of maxabs, then a RED.E.MAX to Stats where (tid & 31) == 0
+    return "" unless $int16;
+    return q{
+--:-:-:-:0      LOP.AND.Z P0, RZ, tid, 31;
+--:-:1:-:2      SHFL.BFLY PT, warp_max, maxabs, 0x10, 0x1f;
+01:-:-:-:4      IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:1:-:2      SHFL.BFLY PT, warp_max, maxabs, 0x8,  0x1f;
+01:-:-:-:4      IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:1:-:2      SHFL.BFLY PT, warp_max, maxabs, 0x4,  0x1f;
+01:-:-:-:4      IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:-:-:0      MOV Stats0, param_Stats[0];
+--:-:1:-:2      SHFL.BFLY PT, warp_max, maxabs, 0x2,  0x1f;
+01:-:-:-:4      IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:-:-:0      MOV Stats1, param_Stats[1];
+--:-:1:-:2      SHFL.BFLY PT, warp_max, maxabs, 0x1,  0x1f;
+01:-:-:-:2      IMNMX maxabs, warp_max, maxabs, !PT;
+--:-:-:-:1  @P0 RED.E.MAX [Stats], maxabs;
+    };
+}
+-]
+
+// Prime the loop: load the line-0 A and B fragments into the j0 registers; all four loads set barrier 1.
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*128 + 64>];
+
+LOOP:
+
+[+
+
+    # Emit the unrolled main loop: @top, then for each of the 8 k-steps 64 FFMAs
+    # (the 8x8 accumulator tile) with the %insert instructions woven in between.
+    our @top;
+    our %insert;
+
+    # Accumulator visit order: a 2x2 swirl per (x,y) base, with the y sweep
+    # reversed for every other pair of x columns.
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @yBase = (0,1,4,5);
+    my @cOrder;
+    foreach my $xBase (0,2,4,6)
+    {
+        push @cOrder, map { my $yb = $_; map { [$xBase + $_->[0], $yb + $_->[1]] } @swirl } @yBase;
+        @yBase = reverse @yBase;
+    }
+
+    # An FFMA followed by an inserted instruction whose first line names one of
+    # these opcodes is emitted with stall count 0 instead of 1.
+    my $noStall = qr/LDS|F2F|I2F|I2I|LDG|STS|BAR|BRA/;
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 7)
+    {
+        my $cur  = $j & 1;                      # j-register set read by this step's FFMAs
+        my $next = 1 - $cur;                    # j-register set refilled for the next step
+        my $line = ($j + 1) % 8;                # shared-memory line of the next step
+        my $pred = $j == 7 ? '@P0' : '   ';     # the wrap-around reload is guarded by P0
+
+        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $pred, $next, $line;
+        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00>];\n", $pred, $next, $line;
+        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $pred, $next, $line;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*128 + 64>];\n", $pred, $next, $line;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            my $ins    = $insert{"j${j}c$c"} || '';
+            my $yield  = $c == 32 ? 'Y' : '-';
+            my $wait   = $c == 0 ? '01' : '--'; # first FFMA waits on barrier 1, set by the LDS loads
+
+            # Test only the first line of $ins. Matching the string itself (rather
+            # than a split result) keeps the regex off undef when $ins is empty.
+            my $stall  = $ins =~ /\A[^\n]*$noStall/ ? 0 : 1;
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", "$wait:-:-:$yield:$stall",  $x,$y,  $cur,$x,  $cur,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
++]
+// After the unrolled loop: bump the pass counter and the A/B offsets, then rebuild the 64-bit track pointers.
+--:-:-:-:1      IADD loop, loop, 1;
+--:-:-:-:1      IADD ta, ta, param_ldaz;
+--:-:-:-:1      IADD tb, tb, param_ldbz;
+--:-:-:-:3      MOV  k, param_k;
+--:-:-:-:1      ISETP.LT.AND P1, PT, loop, param_loops, PT; // P1 = loop < param_loops
+--:-:-:-:6      LEA      trackA0.CC, ta, param_A[0],     1; // trackA = param_A + (ta << 1), low word sets carry
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 1;
+--:-:-:-:6      LEA      trackB0.CC, tb, param_B[0],     1; // trackB = param_B + (tb << 1), low word sets carry
+--:-:-:-:0      LEA.HI.X trackB1,    tb, param_B[1], RZ, 1;
+--:-:-:Y:5  @P1 BRA.U REMAINDER; // taken while P1 holds; the REMAINDER label is not defined in this file
+
+<SCHEDULE_BLOCK>
+// Store-phase setup: shared-memory exchange addresses, output coordinates, C row pointers and scalars.
+// writeCs = (readAs / 4) * 128 + readBs;
+--:-:-:-:1      LOP.AND readAs, readAs, 0xfff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0xfff;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 5;
+
+--:-:-:-:1      LOP.AND tid_31,  tid, 31;
+--:-:-:-:1      LOP.AND tid_96,  tid, 96;
+--:-:-:-:1      LOP.AND tid_128, tid, 128;
+
+// cx = tid31 | (tid_128 >> 2);
+--:-:-:-:1      SHR.U32  cx00, tid_128, 2;
+--:-:-:-:1      LOP.OR   cx00, tid_31,  cx00;
+
+// readCs = ((tid_96 << 4) | cx) << 2;
+--:-:-:-:1      SHL      readCs, tid_96,  4;
+--:-:-:-:1      LOP.OR   readCs, readCs, cx00;
+--:-:-:-:1      SHL      readCs, readCs, 2;
+
+// cx += blkB*128;
+--:-:-:-:1      ISCADD  cx00, blkB, cx00, 7;
+--:-:-:-:1      IADD    cx64, cx00, 64;
+
+// cy = blkA*128 + (tid_96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid_96, 1;
+--:-:-:-:1      ISCADD  cy00, blkA, cy00, 7;
+
+// C += (cy*ldc + cx) * 2;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, cy00, ldc, cx00, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 1;
+// Row strides: ldc1 = ldc*2, ldc4 = ldc*8, ldc60 = ldc*128 - ldc4 = ldc*120.
+--:-:-:-:1      SHL  ldc1, ldc, 1;
+--:-:-:-:1      SHL  ldc4, ldc, 3;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 7;
+// Scalars used by the store phase; maxabs starts at zero.
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+--:-:-:-:1      MOV maxabs, RZ;
+
+--:-:-:-:1      ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0
+
+</SCHEDULE_BLOCK>
+// Row pointers and row indices for rows +4, +8 and +12, each derived from the previous one.
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Emit the store phase. For each accumulator row y (0..7): scale
+    # cx0y..cx7y by alpha into c0..c7, then call STORE_C. The returned text
+    # replaces this section in the assembled kernel.
+    my $out = '';
+
+    foreach my $y (0..7)
+    {
+        if ($y == 4)
+        {
+            # Before row 4, move each of the four C row pointers forward by
+            # ldc60 and the matching row index by 60.
+            foreach my $r (qw(00 04 08 12))
+            {
+                $out .= "--:-:-:-:5      IADD   C${r}y0.CC, C${r}y0, ldc60;\n" .
+                        "--:-:-:-:1      IADD   cy${r},     cy${r},  60;\n" .
+                        "--:-:-:-:1      IADD.X C${r}y1,    C${r}y1, RZ;\n";
+            }
+            $out .= "\n";
+        }
+
+        # One FMUL per column; the last one of the row is emitted with stall
+        # count 0, the others with 1.
+        foreach my $x (0 .. 7)
+        {
+            my $stall = $x == 7 ? 0 : 1;
+            $out .= "--:-:-:-:$stall      FMUL c$x, cx${x}y$y, alpha;\n";
+        }
+
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+// Reduce and publish maxabs (the helper expands to nothing unless $int16 is set), then leave the kernel.
+[+ butterfly() +]
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Stores c0..c7 for one call: rows cy00/cy04 first, then cy08/cy12, at columns cx00 and cx64.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, P6;
+// Load predicates: inside the matrix and P6 (beta != 0 on entry); lanes that skip the load get d = 0.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E.S16 d0, [C00y0 + 2x<00>];
+--:-:2:-:1  @P1 LDG.E.S16 d1, [C00y0 + 2x<64>];
+--:-:3:-:1  @P2 LDG.E.S16 d2, [C04y0 + 2x<00>];
+--:-:4:-:1  @P3 LDG.E.S16 d3, [C04y0 + 2x<64>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Store predicates: the same bounds tests without the beta term.
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+// Apply relu
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c4, c4, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c5, c5, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c6, c6, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c7, c7, RZ, !PT;
+
+--:-:-:-:5      ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0
+</SCHEDULE_BLOCK>
+// Exchange c0..c7 through shared memory: two STS.128 at writeCs, scalar LDS back from readCs.
+--:-:-:-:3      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], c4;
+
+--:-:-:-:0      IADD cy00, cy00, 1;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<0*128 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<0*128 + 64>];
+--:-:-:-:1      LDS c2, [readCs + 4x<1*128 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<1*128 + 64>];
+
+--:-:-:-:0      IADD cy04, cy04, 1;
+// With beta != 0: widen the loaded C values to fp32 and accumulate c = d*beta + c.
+01:-:1:-:1  @P6 [+ convert_in() +] d0, d0;
+02:-:2:-:1  @P6 [+ convert_in() +] d1, d1;
+04:-:3:-:1  @P6 [+ convert_in() +] d2, d2;
+08:-:4:-:1  @P6 [+ convert_in() +] d3, d3;
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:3  @P6 FFMA c3, d3, beta, c3;
+// NOTE(review): the narrowing opcode is fixed to F2F.F16.F32 although the widening above follows $int16 - confirm whether convert_out was intended.
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, P6;
+// Store rows cy00/cy04; the STGs on C00y0+64 and C04y0+64 set read barriers 5 and 6.
+01:-:-:-:1  @P0 STG.E.S16 [C00y0 + 2x<00>], c0;
+02:5:-:-:1  @P1 STG.E.S16 [C00y0 + 2x<64>], c1;
+04:-:-:-:1  @P2 STG.E.S16 [C04y0 + 2x<00>], c2;
+08:6:-:-:1  @P3 STG.E.S16 [C04y0 + 2x<64>], c3;
+
+[+ max_abs1() +]
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E.S16 d0, [C08y0 + 2x<00>];
+--:-:2:-:1  @P1 LDG.E.S16 d1, [C08y0 + 2x<64>];
+--:-:3:-:1  @P2 LDG.E.S16 d2, [C12y0 + 2x<00>];
+--:-:4:-:1  @P3 LDG.E.S16 d3, [C12y0 + 2x<64>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, PT;
+
+--:-:-:-:2      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:2      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+</SCHEDULE_BLOCK>
+// Step the cy00/cy04 row pointers by ldc1; wait masks 10 and 20 cover the STG read barriers 5 and 6.
+10:-:-:-:4      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD   cy08, cy08, 1;
+--:-:-:-:1      IADD   cy12, cy12, 1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+20:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:0      IADD.X C04y1,    C04y1, RZ;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<2*128 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<2*128 + 64>];
+--:-:-:-:1      LDS c2, [readCs + 4x<3*128 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*128 + 64>];
+
+01:-:1:-:4  @P6 [+ convert_in() +] d0, d0;
+02:-:2:-:4  @P6 [+ convert_in() +] d1, d1;
+04:-:3:-:4  @P6 [+ convert_in() +] d2, d2;
+08:-:4:-:1  @P6 [+ convert_in() +] d3, d3;
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:3  @P6 FFMA c3, d3, beta, c3;
+
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+01:-:-:-:1  @P0 STG.E.S16 [C08y0 + 2x<00>], c0;
+02:5:-:-:1  @P1 STG.E.S16 [C08y0 + 2x<64>], c1;
+04:-:-:-:1  @P2 STG.E.S16 [C12y0 + 2x<00>], c2;
+08:6:-:-:1  @P3 STG.E.S16 [C12y0 + 2x<64>], c3;
+
+[+ max_abs2() +]
+// Step the cy08/cy12 row pointers by ldc1 once their STGs have released barriers 5 and 6.
+10:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+20:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/hgemm_common_128x32.sass b/Kernel/SGEMM/Pascal/hgemm_common_128x32.sass
new file mode 100644
index 0000000..9d4860a
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_common_128x32.sass
@@ -0,0 +1,246 @@
+# hgemm_common_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#    http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+// Prime the loop: line 0 goes into the j0 registers (barrier 1), line 1 into the j1 registers (barrier 2).
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>];
+
+LOOP:
+
+<CODE>
+
+    # Emit the unrolled main loop: @top, then for each of the 16 k-steps 32 FFMAs
+    # (a 4x8 accumulator tile) with the %insert instructions woven in between.
+    our @top;
+    our %insert;
+    our $shiftAX;
+    our $shiftBX;
+
+    # Accumulator visit order: a 2x2 swirl per (x,y) base, with the y sweep
+    # reversed for the second pair of x columns.
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @yBase = (0,1,4,5);
+    my @cOrder;
+    foreach my $xBase (0,2)
+    {
+        push @cOrder, map { my $yb = $_; map { [$xBase + $_->[0], $yb + $_->[1]] } @swirl } @yBase;
+        @yBase = reverse @yBase;
+    }
+
+    # An FFMA followed by an inserted instruction whose first line names one of
+    # these opcodes is emitted with stall count 0 instead of 1.
+    my $noStall = qr/LDS|F2F|I2I|LDG|STS|BAR|BRA/;
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 15)
+    {
+        my $barrier   = $j & 1 ? 2 : 1;             # even and odd steps alternate barriers 1 and 2
+        my $rsPred    = $j >= 14 ? '@P0' : '   ';   # the two wrap-around reloads are guarded by P0
+        my $loadReg   = ($j + 2) & 3;               # loads run two steps ahead over four register sets
+        my $shareLine = ($j + 2) & 15;
+        my $shift     = $shareLine >> 2;            # scaled by 8 in the address; used per operand on request
+        my $shiftA    = $shiftAX ? $shift : 0;
+        my $shiftB    = $shiftBX ? $shift : 0;
+        my $compute   = $j & 3;                     # register set read by this step's FFMAs
+
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            my $ins    = $insert{"j${j}c$c"} || '';
+            my $wait   = $c == 0 ? "0$barrier" : '--';  # first FFMA waits on this step's barrier
+
+            # Test only the first line of $ins. Matching the string itself (rather
+            # than a split result) keeps the regex off undef when $ins is empty.
+            my $stall  = $ins =~ /\A[^\n]*$noStall/ ? 0 : 1;
+            my $yield  = $c == 16 && $stall ? 'Y' : '-';
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", "$wait:-:-:$yield:$stall",  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+// Store-phase setup: scalars, shared-memory exchange addresses, output coordinates, C row pointers, predicates.
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 32 + readBs;
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 3;
+
+// readCs = (((tid & 96) << 2) + (tid & 31)) << 2;
+--:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      LOP.AND tid96,  tid,  96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 2;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*32 + tid31;
+--:-:-:-:1      ISCADD cx, blkB, tid31, 5;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+--:-:-:-:1      ISCADD  cy00, blkA, cy00, 7;
+
+// C += (cy*ldc + cx) * 2;  (the LEA pair below shifts ci left by 1)
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, cy00, ldc,    cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 1;
+
+// cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+
+// beta != 0 (combined with P6, i.e. cx < n)
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+// Apply relu
+--:-:-:-:1      LOP.AND.NZ P4, RZ, flags, 2;
+// Row strides: ldc1 = ldc*2, ldc4 = ldc*8, ldc60 = ldc*128 - ldc4 = ldc*120.
+--:-:-:-:1      SHL  ldc1, ldc, 1;
+--:-:-:-:1      SHL  ldc4, ldc, 3;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 7;
+
+</SCHEDULE_BLOCK>
+// Row pointers and row indices for rows +4, +8 and +12, each derived from the previous one.
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Emit the store phase. For each accumulator row y (0..7): scale
+    # cx0y..cx3y by alpha into c0..c3, then call STORE_C.
+    my $out = '';
+    foreach my $y (0..7)
+    {
+        if ($y == 4)
+        {
+            # Before row 4, move each of the four C row pointers forward by
+            # ldc60 and the matching row index by 60.
+            foreach my $r (qw(00 04 08 12))
+            {
+                $out .= "--:-:-:-:5      IADD   C${r}y0.CC, C${r}y0, ldc60;\n" .
+                        "--:-:-:-:1      IADD   cy${r},     cy${r},  60;\n" .
+                        "--:-:-:-:1      IADD.X C${r}y1,    C${r}y1, RZ;\n";
+            }
+            $out .= "\n";
+        }
+        # One FMUL per column; the last one of the row is emitted with stall
+        # count 0, the others with 1.
+        foreach my $x (0 .. 3)
+        {
+            my $stall = $x == 3 ? 0 : 1;
+            $out .= "--:-:-:-:$stall      FMUL c$x, cx${x}y$y, alpha;\n";
+        }
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT; // reached after the eight generated STORE_C calls above
+
+STORE_C:
+// Stores c0..c3 for one call: one element in each of the rows cy00, cy04, cy08 and cy12.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+// C is loaded only where row < m and P5 holds; lanes that skip the load get d = 0.
+--:-:1:-:1  @P0 LDG.E.U16 d0, [C00y];
+--:-:2:-:1  @P1 LDG.E.U16 d1, [C04y];
+--:-:3:-:1  @P2 LDG.E.U16 d2, [C08y];
+--:-:4:-:1  @P3 LDG.E.U16 d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Store predicates: row < m and P6.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+// Advance the row indices by one for the next call.
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:0      IADD cy12, cy12, 1;
+// Under P4, FMNMX against RZ with the !PT selector (the relu clamp).
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+// Exchange c0..c3 through shared memory: one STS.128 at writeCs, four scalar LDS from readCs.
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*32>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*32>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*32>];
+</SCHEDULE_BLOCK>
+// Under P5: widen the loaded C values to fp32 and accumulate c = d*beta + c.
+01:-:1:-:1  @P5 F2F.F32.F16 d0, d0;
+02:-:2:-:1  @P5 F2F.F32.F16 d1, d1;
+04:-:3:-:1  @P5 F2F.F32.F16 d2, d2;
+08:-:4:-:1  @P5 F2F.F32.F16 d3, d3;
+
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P5 FFMA c3, d3, beta, c3;
+// Narrow the results to fp16 and store them; each STG sets a read barrier (1..4).
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+01:1:-:-:1  @P0 STG.E.CG.U16 [C00y], c0;
+02:2:-:-:1  @P1 STG.E.CG.U16 [C04y], c1;
+04:3:-:-:1  @P2 STG.E.CG.U16 [C08y], c2;
+08:4:-:-:1  @P3 STG.E.CG.U16 [C12y], c3;
+// Step every row pointer by ldc1; each IADD first waits on the read barrier of the matching STG.
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/hgemm_common_128x64.sass b/Kernel/SGEMM/Pascal/hgemm_common_128x64.sass
new file mode 100644
index 0000000..a375c03
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_common_128x64.sass
@@ -0,0 +1,318 @@
+# hgemm_common_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+// Prime the loop: load the line-0 A and B fragments into the j0 registers; all four loads set barrier 1.
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*64 +  00>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*64 +  32>];
+
+LOOP:
+
+<CODE>
+
+    # Emit the unrolled main loop: @top, then for each of the 8 k-steps 64 FFMAs
+    # (the 8x8 accumulator tile) with the %insert instructions woven in between.
+    our @top;
+    our %insert;
+
+    # Accumulator visit order: a 2x2 swirl per (x,y) base, with the y sweep
+    # reversed for every other pair of x columns.
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @yBase = (0,1,4,5);
+    my @cOrder;
+    foreach my $xBase (0,2,4,6)
+    {
+        push @cOrder, map { my $yb = $_; map { [$xBase + $_->[0], $yb + $_->[1]] } @swirl } @yBase;
+        @yBase = reverse @yBase;
+    }
+
+    # An FFMA followed by an inserted instruction whose first line names one of
+    # these opcodes is emitted with stall count 0 instead of 1.
+    my $noStall = qr/LDS|F2F|I2I|LDG|STS|BAR|BRA/;
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 7)
+    {
+        my $cur  = $j & 1;                      # j-register set read by this step's FFMAs
+        my $next = 1 - $cur;                    # j-register set refilled for the next step
+        my $line = ($j + 1) % 8;                # shared-memory line of the next step
+        my $pred = $j == 7 ? '@P0' : '   ';     # the wrap-around reload is guarded by P0
+
+        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n", $pred, $next, $line;
+        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 +  00>];\n", $pred, $next, $line;
+        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n", $pred, $next, $line;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 +  32>];\n", $pred, $next, $line;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            my $ins    = $insert{"j${j}c$c"} || '';
+            my $yield  = $c == 32 ? 'Y' : '-';
+            my $wait   = $c == 0 ? '01' : '--'; # first FFMA waits on barrier 1, set by the LDS loads
+
+            # Test only the first line of $ins. Matching the string itself (rather
+            # than a split result) keeps the regex off undef when $ins is empty.
+            my $stall  = $ins =~ /\A[^\n]*$noStall/ ? 0 : 1;
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", "$wait:-:-:$yield:$stall",  $x,$y,  $cur,$x,  $cur,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+--:-:1:-:1      S2R threadId, SR_TID.X;
+--:-:2:-:1      S2R blockA, SR_CTAID.Y;
+--:-:3:-:1      S2R blockB, SR_CTAID.Z;
+--:-:4:-:1      S2R blockZ, SR_CTAID.X;
+// The four S2R reads set barriers 1..4; the wait masks 01/02/04/08 below consume them at first use.
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 64 + readBs;
+--:-:-:-:1      LOP.AND readAs, readAs, 0xff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0xff;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 4;
+
+// readCs = (((threadId & 96) << 3) + (threadId & 31)) << 2;
+01:-:-:-:1      LOP.AND tid31,  threadId,  31;
+01:-:-:-:1      LOP.AND tid96,  threadId,  96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 3;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx00 = blkB*64 + tid31;
+04:-:-:-:1      ISCADD cx00, blockB, tid31, 6;
+--:-:-:-:1      IADD   cx32, cx00, 32;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+02:-:-:-:1      ISCADD  cy00, blockA, cy00, 7;
+
+// C += (cy*ldc + cx00) * 2;  (the LEA pair below shifts ci left by 1)
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, cy00, ldc, cx00, xmad_c;
+08:-:-:-:1      XMAD.LO2 ci, ldcz, blockZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 1;
+
+--:-:-:-:1      ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0
+// Row strides: ldc1 = ldc*2, ldc4 = ldc*8, ldc60 = ldc*128 - ldc4 = ldc*120.
+--:-:-:-:1      SHL  ldc1, ldc, 1;
+--:-:-:-:1      SHL  ldc4, ldc, 3;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 7;
+
+</SCHEDULE_BLOCK>
+
+//--:-:1:-:2      I2F.F32.U32 temp, threadId;
+//01:-:-:-:1      F2F.F16.F32 temp, temp;
+// Row pointers and row indices for rows +4, +8 and +12, each derived from the previous one.
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Emit the store phase. For each accumulator row y (0..7): scale
+    # cx0y..cx7y by alpha into c0..c7, then call STORE_C. The returned text
+    # replaces this section in the assembled kernel.
+    my $out = '';
+
+    foreach my $y (0..7)
+    {
+        if ($y == 4)
+        {
+            # Before row 4, move each of the four C row pointers forward by
+            # ldc60 and the matching row index by 60.
+            foreach my $r (qw(00 04 08 12))
+            {
+                $out .= "--:-:-:-:5      IADD   C${r}y0.CC, C${r}y0, ldc60;\n" .
+                        "--:-:-:-:1      IADD   cy${r},     cy${r},  60;\n" .
+                        "--:-:-:-:1      IADD.X C${r}y1,    C${r}y1, RZ;\n";
+            }
+            $out .= "\n";
+        }
+
+        # One FMUL per column; the last one of the row is emitted with stall
+        # count 0, the others with 1.
+        foreach my $x (0 .. 7)
+        {
+            my $stall = $x == 7 ? 0 : 1;
+            $out .= "--:-:-:-:$stall      FMUL c$x, cx${x}y$y, alpha;\n";
+        }
+
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+
+--:-:-:-:5      EXIT; // reached after the eight generated STORE_C calls above
+
+STORE_C:
+// Stores c0..c7 for one call: rows cy00/cy04 first, then cy08/cy12, at columns cx00 and cx32.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, P6;
+// Load predicates: inside the matrix and P6 (beta != 0 on entry); lanes that skip the load get d = 0.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E.S16 d0, [C00y0 + 2x<00>];
+--:-:2:-:1  @P1 LDG.E.S16 d1, [C00y0 + 2x<32>];
+--:-:3:-:1  @P2 LDG.E.S16 d2, [C04y0 + 2x<00>];
+--:-:4:-:1  @P3 LDG.E.S16 d3, [C04y0 + 2x<32>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Store predicates: the same bounds tests without the beta term.
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+// Apply relu
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c4, c4, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c5, c5, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c6, c6, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c7, c7, RZ, !PT;
+
+--:-:-:-:5      ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0
+</SCHEDULE_BLOCK>
+// Exchange c0..c7 through shared memory: two STS.128 at writeCs, scalar LDS back from readCs.
+--:-:-:-:1      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], c4;
+
+--:-:-:-:0      IADD cy00, cy00, 1;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<0*64 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<1*64 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<1*64 + 32>];
+
+--:-:-:-:0      IADD cy04, cy04, 1;
+// With beta != 0: widen the loaded C values to fp32 and accumulate c = d*beta + c.
+01:-:1:-:1  @P6 F2F.F32.F16 d0, d0;
+02:-:2:-:1  @P6 F2F.F32.F16 d1, d1;
+04:-:3:-:1  @P6 F2F.F32.F16 d2, d2;
+08:-:4:-:1  @P6 F2F.F32.F16 d3, d3;
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P6 FFMA c3, d3, beta, c3;
+
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, P6;
+
+// P6 has to stay (beta != 0) from here on: the predicated convert/FFMA below and
+// the P4/P5 setup at the top of the next call both read it, so it is not reused as a flag.
+
+01:-:-:-:1  @P0 STG.E.S16 [C00y0 + 2x<00>], c0;
+02:5:-:-:1  @P1 STG.E.S16 [C00y0 + 2x<32>], c1;
+04:-:-:-:1  @P2 STG.E.S16 [C04y0 + 2x<00>], c2;
+08:6:-:-:1  @P3 STG.E.S16 [C04y0 + 2x<32>], c3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E.S16 d0, [C08y0 + 2x<00>];
+--:-:2:-:1  @P1 LDG.E.S16 d1, [C08y0 + 2x<32>];
+--:-:3:-:1  @P2 LDG.E.S16 d2, [C12y0 + 2x<00>];
+--:-:4:-:1  @P3 LDG.E.S16 d3, [C12y0 + 2x<32>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, PT;
+
+--:-:-:-:2      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:2      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+</SCHEDULE_BLOCK>
+// Step the cy00/cy04 row pointers by ldc1; wait masks 10 and 20 cover the STG read barriers 5 and 6.
+10:-:-:-:4      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD   cy08, cy08, 1;
+--:-:-:-:1      IADD   cy12, cy12, 1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+20:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:0      IADD.X C04y1,    C04y1, RZ;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<2*64 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<2*64 + 32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<3*64 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*64 + 32>];
+
+01:-:1:-:1  @P6 F2F.F32.F16 d0, d0;
+02:-:2:-:1  @P6 F2F.F32.F16 d1, d1;
+04:-:3:-:1  @P6 F2F.F32.F16 d2, d2;
+08:-:4:-:1  @P6 F2F.F32.F16 d3, d3;
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P6 FFMA c3, d3, beta, c3;
+
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+01:-:-:-:1  @P0 STG.E.S16 [C08y0 + 2x<00>], c0;
+02:5:-:-:1  @P1 STG.E.S16 [C08y0 + 2x<32>], c1;
+04:-:-:-:1  @P2 STG.E.S16 [C12y0 + 2x<00>], c2;
+08:6:-:-:1  @P3 STG.E.S16 [C12y0 + 2x<32>], c3;
+// Step the cy08/cy12 row pointers by ldc1 once their STGs have released barriers 5 and 6.
+10:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+20:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/hgemm_common_32x128.sass b/Kernel/SGEMM/Pascal/hgemm_common_32x128.sass
new file mode 100644
index 0000000..3661b08
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_common_32x128.sass
@@ -0,0 +1,244 @@
+# Kernel: hgemm_common_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#    http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+// Prefetch lines 0 and 1 into the j0/j1 registers; barrier 1 tags the j0 loads and barrier 2 the j1 loads.
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*32  + 16 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*32  + 16 + 0*8>];
+
+LOOP:
+
+<CODE>
+    # Builds the unrolled main loop: 16 steps ($j) of 32 FFMAs ($c), with LDS
+    # prefetches and includer-supplied instructions spliced in after chosen FFMAs.
+    our @top;      # text emitted once, ahead of the first FFMA
+    our %insert;   # "j<step>c<ffma>" => text emitted right after that FFMA
+    our $shiftAX;  # true: A loads get an extra (line >> 2)*8 offset
+    our $shiftBX;  # true: B loads get an extra (line >> 2)*8 offset
+
+    my @cOrder;    # [x, y] accumulator coordinates in FFMA issue order
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;    # second x pair walks the y groups in reverse
+    }
+
+    my $out = join '', @top;
+    foreach my $j (0 .. 15)
+    {
+        my $barrier   = $j & 1 ? 2 : 1;             # barriers 1 and 2 alternate per step
+        my $rsPred    = $j >= 14 ? '@P0' : '   ';   # last two steps prefetch only under P0
+        my $loadReg   = ($j + 2) & 3;               # register set loaded two steps ahead
+        my $shareLine = ($j + 2) & 15;              # line prefetched (wraps to 0 and 1)
+        my $shiftA    = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB    = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute   = $j & 3;                     # register set this step multiplies
+
+        # Slots c0, c2 and c4 of every step are overwritten with the prefetch loads.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32  + 16 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            # Only the first FFMA of a step waits on that step's load barrier.
+            my $wait   = $c == 0 ? "0$barrier" : '--';
+
+            # Stall count drops to 0 when one of these ops follows the FFMA.  With
+            # no insert the split returns nothing, so test for undef before matching.
+            my $head   = (split "\n", $ins)[0];
+            my $stall  = (defined($head) && $head =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/) ? 0 : 1;
+            my $yield  = $c == 16 && $stall ? 'Y' : '-';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+</CODE>
+
+<SCHEDULE_BLOCK>
+// Epilogue setup. readBs drops 4x<szShareA>; when swapBuf > 0 (P0) both read pointers also drop swapBuf.
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+// Scalar kernel parameters used by the store path.
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 128 + readBs;   i.e. (readAs << 5) + readBs
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 5;
+
+// readCs = tid * 4;
+--:-:-:-:1      SHL readCs, tid, 2;
+
+// cx = blkB*128 + tid;
+--:-:-:-:1      ISCADD cx, blkB, tid, 7;
+
+// cy = blkA*32
+--:-:-:-:1      SHL cy00, blkA, 5;
+
+// C += (cy*ldc + cx) * 2;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+// ci = cy00*ldc + cx, then + ldcz*blkZ; C00y0/C00y1 form the 64-bit address param_C + (ci << 1).
+--:-:-:-:1      XMAD.LO  ci, cy00, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 1;
+
+// P6 = cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+
+// P5 = (beta != 0) && P6
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+// P4 = (flags & 2) != 0: apply relu
+--:-:-:-:1      LOP.AND.NZ P4, RZ, flags, 2;
+// ldc1 = ldc*2, ldc4 = ldc*8, ldc12 = ldc*32 - ldc4 = ldc*24 (strides of 1, 4 and 12 rows of 2-byte elements).
+--:-:-:-:1      SHL  ldc1, ldc, 1;
+--:-:-:-:1      SHL  ldc4, ldc, 3;
+--:-:-:-:1      ISCADD ldc12, ldc, -ldc4, 5;
+
+</SCHEDULE_BLOCK>
+// C04y/C08y/C12y = previous pointer + ldc4 (low word with .CC, high word with .X); cy04/cy08/cy12 = cy00 + 4/8/12.
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+// Block-wide barrier; presumably so all warps finish reading the A/B tiles before STORE_C reuses shared memory - confirm.
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+    # Emits eight rounds of "scale four accumulators by alpha, CAL STORE_C".
+    # Ahead of round 4 the four C pointers and row counters jump by ldc12 / 12.
+    my @advance = (
+        "--:-:-:-:5      IADD   C00y0.CC, C00y0, ldc12;\n",
+        "--:-:-:-:1      IADD   cy00,     cy00,  12;\n",
+        "--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;\n",
+        "--:-:-:-:5      IADD   C04y0.CC, C04y0, ldc12;\n",
+        "--:-:-:-:1      IADD   cy04,     cy04,  12;\n",
+        "--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;\n",
+        "--:-:-:-:5      IADD   C08y0.CC, C08y0, ldc12;\n",
+        "--:-:-:-:1      IADD   cy08,     cy08,  12;\n",
+        "--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;\n",
+        "--:-:-:-:5      IADD   C12y0.CC, C12y0, ldc12;\n",
+        "--:-:-:-:1      IADD   cy12,     cy12,  12;\n",
+        "--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;\n\n",
+    );
+    my $scale =
+        "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
+        "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
+        "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
+        "--:-:-:-:0      FMUL c3, cx3y%d, alpha;\n";
+
+    my $out;
+    for my $row (0 .. 7)
+    {
+        $out .= join('', @advance) if $row == 4;
+        $out .= sprintf($scale, ($row) x 4) . "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Subroutine: blend and store one row through each of C00y, C04y, C08y, C12y, then step all four pointers by ldc1.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+// P0-P3 = (cyNN < m) && P5: fetch the current 16-bit C values, or use zero where the predicate is off.
+--:-:1:-:1  @P0 LDG.E.U16 d0, [C00y];
+--:-:2:-:1  @P1 LDG.E.U16 d1, [C04y];
+--:-:3:-:1  @P2 LDG.E.U16 d2, [C08y];
+--:-:4:-:1  @P3 LDG.E.U16 d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// P0-P3 are then redefined as (cyNN < m) && P6 to guard the stores.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+// Advance the four row counters by one row.
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:0      IADD cy12, cy12, 1;
+// When P4 is set, clamp c0-c3 from below at zero (FMNMX against RZ; presumably the relu option).
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+// Stage through shared memory: 128-bit store starting at c0 to writeCs, then reload c0-c3 from readCs + 4x<n*128>.
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*128>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*128>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*128>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*128>];
+</SCHEDULE_BLOCK>
+// With P5 set, widen the fetched C values from fp16 to fp32 (each waits on its LDG barrier, 1-4).
+01:-:1:-:1  @P5 F2F.F32.F16 d0, d0;
+02:-:2:-:1  @P5 F2F.F32.F16 d1, d1;
+04:-:3:-:1  @P5 F2F.F32.F16 d2, d2;
+08:-:4:-:1  @P5 F2F.F32.F16 d3, d3;
+// c += d * beta under P5; wait masks 11 and 24 additionally cover the LDS barriers 5 and 6.
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P5 FFMA c3, d3, beta, c3;
+// Narrow the results back to fp16.
+--:-:1:-:1      F2F.F16.F32 c0, c0;
+--:-:2:-:1      F2F.F16.F32 c1, c1;
+--:-:3:-:1      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+// Guarded 16-bit stores of c0-c3 to the four rows.
+01:1:-:-:1  @P0 STG.E.CG.U16 [C00y], c0;
+02:2:-:-:1  @P1 STG.E.CG.U16 [C04y], c1;
+04:3:-:-:1  @P2 STG.E.CG.U16 [C08y], c2;
+08:4:-:-:1  @P3 STG.E.CG.U16 [C12y], c3;
+// Step each 64-bit pointer by ldc1: low word with carry-out (.CC), then high word with carry-in (.X).
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_128x128.sass b/Kernel/SGEMM/Pascal/hgemm_nn_128x128.sass
new file mode 100644
index 0000000..0b4f460
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_nn_128x128.sass
@@ -0,0 +1,393 @@
+# Kernel: hgemm_nn_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+our $int16;  # build option: selects the I2F.F32.S16 conversion and the extra parameters
+our $convert = ('F2F.F32.F16', 'I2F.F32.S16')[$int16 ? 1 : 0];
+sub convert_in { return $convert; }
+sub int16_params {
+    return "" unless $int16;
+    return q{
+param_Stats[0]  : c[0x0][0x190]
+param_Stats[1]  : c[0x0][0x194]
+param_scale     : c[0x0][0x198]
+    };
+}
+-]
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+    [+ int16_params() +]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-95   ~ tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid7, tid31, tid128, txa, xmad_ta, xmad_tb, k<1-3>, x<1-3>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-105  : loadB<0-3>, loadA<0-5>
+
+    106-109 : trackA<0-1>, trackB<0-1>
+
+    110-118 ~ writeAs, writeBs, k, txb, tidAY, tidBY, ta, tb, loop
+    119-127 ~ readAs, readBs, tid, blkA, blkB, blkZ
+
+    64-75   ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128
+
+    64-79   : c<0-7>, d3, d2, d1, d0, cs<0-3>
+    64-65   : Stats<0-1>
+    80-89   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    90-118  ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs
+
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,   param_k;
+--:-:-:-:1      MOV lda, param_lda;
+--:-:-:-:1      MOV ldb, param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 4; // ldb = param_ldb8 >> 4
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      MOV loop, RZ;
+// Zero 16 bytes at addr_zero and load them into czero00..czero60, clearing registers 0-63 (the cx*y* accumulators).
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);
++]
+// Wait masks 01/02/04/08 below hold the first use of tid/blkA/blkB/blkZ until the matching S2R barrier clears.
+01:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      LOP.AND tid128, tid,  128;
+
+// tidAY  = (tid & 1) << 2
+--:-:-:-:1      LOP.AND tid1,  tid,  1;
+--:-:-:-:1      SHL     tidAY, tid1, 2;
+
+// tidAX = tid >> 1
+--:-:-:-:1      SHR.U32 tidAX, tid,   1;
+
+// trackA += 2 * ((blkA*128 + tidAX) * lda + tidAY + ldaz*blkZ)
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,   tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ,  ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x1;
+// P5 = txa < m
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// tidBX = (tid & 31) << 2
+// tidBY = (tid >> 5) & 7
+--:-:-:-:1      SHL     tidBX, tid31, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid,  0x305; // 3 bits at position 5
+
+// trackB += (blkB*128 + ldb*tidBY + tidBX + ldbz*blkZ) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 7;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x1;
+// P6 = txb < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeAs = 4 * (128 * tidAY + tidAX) + 4x<128*8*2>
+--:-:-:-:1      ISCADD  writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD  writeAs, writeAs, 4x<128*8*2>, 2;
+
+
+// writeBs = (128*tidBY + tidBX) * 4 + 4x<128*8*3>
+--:-:-:-:1      ISCADD  writeBs, tidBY, tidBX, 7;
+--:-:-:-:1      ISCADD  writeBs, writeBs, 4x<128*8*3>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4x<128*8>;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+[+
+    our $vec;  # build option: true emits 64-bit vector loads, false emits four separately guarded 16-bit loads per operand
+    return $vec ? q{
+--:-:-:-:2      ISETP.LT.AND P3, PT, tidBY, k, P6;
+--:-:-:Y:b      ISETP.LT.AND P2, PT, tidAY, k, P5;
+
+--:-:4:-:2  @P3 LDG.E.CI.64 loadB0, [trackB];
+--:-:2:-:1  @P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>];
+--:-:2:-:1  @P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>];
+
+--:-:-:-:0      PSETP.AND.AND P4, PT, PT, PT, PT;
+
+--:-:5:-:1 @!P3 LDS.U.64 loadB0, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.64 loadA0, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.64 loadA4, [addr_zero];
+    } : q{
+
+<SCHEDULE_BLOCK>
+// doLoad0 = tidBY < k
+--:-:-:-:1      IADD x1, txb, 1;
+--:-:-:-:1      IADD x2, txb, 2;
+--:-:-:-:1      IADD x3, txb, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_n, P0;
+
+--:-:4:-:1  @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>];
+--:-:4:-:1  @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>];
+--:-:4:-:1  @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>];
+--:-:4:-:1  @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+
+--:-:-:-:1      IADD k1, tidAY, 1;
+--:-:-:-:1      IADD k2, tidAY, 2;
+--:-:-:-:1      IADD k3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:2:-:1  @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+</SCHEDULE_BLOCK>
+    };
++]
+
+[+
+    our $vec;      # true: inputs arrive as packed halves and are unpacked via .H0/.H1; false: one value per register
+    our $convert;  # conversion opcode text, interpolated into the instruction templates below
+    return $vec ? qq{
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 7;
+
+18:-:-:-:4      $convert loadB3, loadB1.H1;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      $convert loadB2, loadB1.H0;
+--:-:-:-:4      $convert loadB1, loadB0.H1;
+--:-:4:-:2      $convert loadB0, loadB0.H0;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+08:-:-:-:1      STS.128 [writeBs], loadB0;
+
+22:-:-:-:4      $convert loadA3, loadA1.H1;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<16>;
+--:-:2:-:4      $convert loadA2, loadA1.H0;
+--:-:-:-:4      $convert loadA1, loadA0.H1;
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, P1;
+--:-:3:-:1      $convert loadA0, loadA0.H0;
+
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<3*128>], loadA3;
+--:-:-:-:1      STS [writeAs + 4x<2*128>], loadA2;
+04:-:-:-:1      STS [writeAs + 4x<1*128>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<0*128>], loadA0;
+    } : qq{
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+08:-:-:-:4      $convert loadB0, loadB0;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      $convert loadB1, loadB1;
+--:-:-:-:4      $convert loadB2, loadB2;
+--:-:4:-:2      $convert loadB3, loadB3;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+08:-:-:-:1      STS.128 [writeBs], loadB0;
+
+02:-:-:-:4      $convert loadA0, loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<8>;
+--:-:2:-:4      $convert loadA1, loadA1;
+--:-:-:-:4      $convert loadA2, loadA2;
+--:-:3:-:1      $convert loadA3, loadA3;
+
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<0*128>], loadA0;
+--:-:-:-:1      STS [writeAs + 4x<1*128>], loadA1;
+04:-:-:-:1      STS [writeAs + 4x<2*128>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*128>], loadA3;
+    };
++]
+
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      LOP.XOR writeAs, writeAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR writeBs, writeBs, 4x<128*8*2>;
+
+
+
+[+
+    our $vec;
+    our $convert;
+    my $k_end = $vec ? 16 : 24;  # k threshold compared by the ISETP.GE lines below
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P6;\n");  # P3 = (k >= k_end) && P6
+    our %insert =  # "j<N>c<M>" => instruction text; presumably spliced after FFMA <M> of unroll step <N> by the included common file
+    (
+        ($vec ?
+            (
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P4, PT, !P4, PT, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND  P0, PT, k, $k_end, PT;\n",
+        j0c15 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P4, P5;\n",
+        # vector variant: P4 is inverted on every pass, and A is loaded only when P0, P4 and P5 all hold
+        j0c10 => "--:-:2:-:1  \@P3 LDG.E.CI.64 loadB0, [trackB];\n",
+
+        j0c28 => "--:-:5:-:1  \@P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>];\n",
+        j0c30 => "20:4:6:-:1  \@P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>];\n",
+        # with P4 clear the A values are unpacked from loadA4/loadA5, with P4 set from loadA0/loadA1
+        j4c5  => "--:-:-:-:1 \@!P4 $convert loadA3, loadA5.H1;\n",
+        j4c9  => "--:-:-:-:1 \@!P4 $convert loadA2, loadA5.H0;\n",
+        j4c13 => "--:-:-:-:1 \@!P4 $convert loadA1, loadA4.H1;\n",
+        j4c17 => "--:-:-:-:1 \@!P4 $convert loadA0, loadA4.H0;\n",
+
+        j5c5  => "02:-:-:-:1  \@P0 $convert loadB3, loadB1.H1;\n",
+        j5c9  => "--:-:-:-:1  \@P0 $convert loadB2, loadB1.H0;\n",
+        j5c13 => "--:-:-:-:1  \@P0 $convert loadB1, loadB0.H1;\n",
+        j5c17 => "--:-:2:-:1  \@P0 $convert loadB0, loadB0.H0;\n",
+
+        j5c35 => "02:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j6c5  => "10:-:2:-:1  \@P4 $convert loadA3, loadA1.H1;\n",
+        j6c9  => "--:-:3:-:1  \@P4 $convert loadA2, loadA1.H0;\n",
+        j6c13 => "--:-:4:-:1  \@P4 $convert loadA1, loadA0.H1;\n",
+        j6c17 => "--:-:5:-:1  \@P4 $convert loadA0, loadA0.H0;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS [writeAs + 4x<3*128>], loadA3;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS [writeAs + 4x<2*128>], loadA2;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS [writeAs + 4x<1*128>], loadA1;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS [writeAs + 4x<0*128>], loadA0;\n",
+
+        j6c11 => "08:-:-:-:1  \@P4 IADD   trackA0.CC, trackA0, 2x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P4 IADD.X trackA1,    trackA1, RZ;\n",
+            ) :
+            (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P5;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        # scalar variant: four 16-bit loads per operand, B under P3 and A under P2
+        j0c10 => "--:-:2:-:1  \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n",
+
+        j0c29 => "--:-:6:-:1  \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n",
+        j0c31 => "--:-:6:-:1  \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n",
+        j0c33 => "--:-:6:-:1  \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n",
+        j0c35 => "--:-:6:-:1  \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n",
+
+        j5c8  => "02:-:-:-:1  \@P3 $convert loadB0, loadB0;\n",
+        j5c12 => "--:-:-:-:1  \@P3 $convert loadB1, loadB1;\n",
+        j5c16 => "--:-:-:-:1  \@P3 $convert loadB2, loadB2;\n",
+        j5c20 => "--:-:2:-:1  \@P3 $convert loadB3, loadB3;\n",
+
+        j5c39 => "02:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j6c5  => "20:-:2:-:1  \@P2 $convert loadA0, loadA0;\n",
+        j6c9  => "--:-:3:-:1  \@P2 $convert loadA1, loadA1;\n",
+        j6c13 => "--:-:4:-:1  \@P2 $convert loadA2, loadA2;\n",
+        j6c17 => "--:-:5:-:1  \@P2 $convert loadA3, loadA3;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS [writeAs + 4x<0*128>], loadA0;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS [writeAs + 4x<1*128>], loadA1;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS [writeAs + 4x<2*128>], loadA2;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS [writeAs + 4x<3*128>], loadA3;\n",
+
+        j6c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 2x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+            )
+        ),
+        # entries shared by both variants: advance trackB by param_ldb8 while P0 holds
+        j5c46 => "--:-:-:-:1  \@P0 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j5c54 => "--:-:-:-:1  \@P0 IADD.X trackB1,    trackB1, RZ;\n",
+        # k -= 8; under P0: barrier, then flip all four shared-memory pointers by 4x<128*8*2>
+        j6c63 => "--:-:-:-:0      IADD32I k, k, -8;\n" .
+                 "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n",
+        # branch back to LOOP while P0 holds; otherwise P1 sends control to REMAINDER
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+    );
+    return;  # contributes no text of its own; it only populates @top and %insert
++]
+
+<INCLUDE file="hgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_128x32.sass b/Kernel/SGEMM/Pascal/hgemm_nn_128x32.sass
new file mode 100644
index 0000000..33a4a9a
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_nn_128x32.sass
@@ -0,0 +1,590 @@
+# Kernel: hgemm_nn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*2 + 32*16*2>
+    szShareA  : (128*16 + 32)
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, ldb, ldaz, lda32, ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, shiftAX
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 4;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb16, ldb, 5;
+--:-:-:-:1      SHL lda32, lda, 5;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 2
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,  2;
+01:-:-:-:1      LOP.AND tid3,    tid,  3;
+--:-:-:-:1      SHL     tidAY,   tid3, 2;
+--:-:-:-:1      SHL     shiftAX, tid3, 3;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// trackA += 2 * ((blkA*128 + tidAX) * lda + tidAY)
+02:-:-:-:1      ISCADD txa00, blkA, tidAX, 7;
+--:-:-:-:1      IADD   txa32, txa00, 32;
+--:-:-:-:1      IADD   txa64, txa00, 64;
+--:-:-:-:1      IADD   txa96, txa00, 96;
+
+--:-:-:-:1      XMAD.LO  ta00, lda,  txa00,   tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta00, ldaz, blkZ,  ta00;
+--:-:-:-:1      IADD     ta32, ta00, lda32;
+--:-:-:-:1      IADD     ta64, ta32, lda32;
+--:-:-:-:1      IADD     ta96, ta64, lda32;
+
+--:-:-:-:1      LEA      track0A0.CC, ta00, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta00, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track1A0.CC, ta32, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track1A1,    ta32, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track2A0.CC, ta64, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track2A1,    ta64, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track3A0.CC, ta96, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track3A1,    ta96, param_A[1], RZ, 1;
+
+// trackB += (blkB*32 + ldb*tidBY + tidBX) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 5;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 1;
+
+// writeAs = (tidAY*128 + tidAX + shiftAX) * 4 + 4x<szShareA + szShareB>
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY, k, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY, k, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P2 LDG.E.CI.64 load0A, [track0A];
+--:-:2:-:1  @P3 LDG.E.CI.64 load1A, [track1A];
+--:-:3:-:1  @P4 LDG.E.CI.64 load2A, [track2A];
+--:-:4:-:1  @P5 LDG.E.CI.64 load3A, [track3A];
+--:-:5:-:1  @P6 LDG.E.CI.64 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P2 LDS.U.64 load0A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.64 load1A, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.64 load2A, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.64 load3A, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.64 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:1:-:1  @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:1:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:1:-:1  @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];
+--:-:3:-:1  @P1 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];
+--:-:4:-:1  @P1 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];
+--:-:4:-:1  @P2 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];
+--:-:5:-:1  @P1 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];
+--:-:5:-:1  @P2 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P1, RZ, k, 15;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P1;
+
+</SCHEDULE_BLOCK>
+// Emit the fp16-to-fp32 conversions (F2F.F32.F16) for load0A-load3A and loadB. With $vec each group unpacks the .H0/.H1 halves of its first two registers into all four, highest destination first, so no source is overwritten before it is read; without $vec every register is converted in place.
+<CODE>
+    our $vec;    # selects which of the two instruction texts below is returned
+    return $vec ? q{
+21:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:1:-:1      F2F.F32.F16 load0A0, load0A0.H0;
+
+02:-:-:-:1      F2F.F32.F16 load1A3, load1A1.H1;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A1.H0;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A0.H1;
+--:-:2:-:1      F2F.F32.F16 load1A0, load1A0.H0;
+
+04:-:-:-:1      F2F.F32.F16 load2A3, load2A1.H1;
+--:-:-:-:1      F2F.F32.F16 load2A2, load2A1.H0;
+--:-:-:-:1      F2F.F32.F16 load2A1, load2A0.H1;
+--:-:3:-:1      F2F.F32.F16 load2A0, load2A0.H0;
+
+08:-:-:-:1      F2F.F32.F16 load3A3, load3A1.H1;
+--:-:-:-:1      F2F.F32.F16 load3A2, load3A1.H0;
+--:-:-:-:1      F2F.F32.F16 load3A1, load3A0.H1;
+--:-:4:-:1      F2F.F32.F16 load3A0, load3A0.H0;
+
+10:-:-:-:1      F2F.F32.F16 loadB3, loadB1.H1;
+--:-:-:-:1      F2F.F32.F16 loadB2, loadB1.H0;
+--:-:-:-:1      F2F.F32.F16 loadB1, loadB0.H1;
+--:-:5:-:1      F2F.F32.F16 loadB0, loadB0.H0;
+    } : q{
+21:-:-:-:1      F2F.F32.F16 load0A0, load0A0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:1:-:1      F2F.F32.F16 load0A3, load0A3;
+
+02:-:-:-:1      F2F.F32.F16 load1A0, load1A0;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A1;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A2;
+--:-:2:-:1      F2F.F32.F16 load1A3, load1A3;
+
+04:-:-:-:1      F2F.F32.F16 load2A0, load2A0;
+--:-:-:-:1      F2F.F32.F16 load2A1, load2A1;
+--:-:-:-:1      F2F.F32.F16 load2A2, load2A2;
+--:-:3:-:1      F2F.F32.F16 load2A3, load2A3;
+
+08:-:-:-:1      F2F.F32.F16 load3A0, load3A0;
+--:-:-:-:1      F2F.F32.F16 load3A1, load3A1;
+--:-:-:-:1      F2F.F32.F16 load3A2, load3A2;
+--:-:4:-:1      F2F.F32.F16 load3A3, load3A3;
+
+10:-:-:-:1      F2F.F32.F16 loadB0, loadB0;
+--:-:-:-:1      F2F.F32.F16 loadB1, loadB1;
+--:-:-:-:1      F2F.F32.F16 loadB2, loadB2;
+--:-:5:-:1      F2F.F32.F16 loadB3, loadB3;
+    };
+</CODE>
+// Store the converted A groups (four STS each, relative to writeAs) and loadB (STS.128 at writeBs), advance each track pointer with an IADD .CC / IADD.X pair, then shift the read and write offsets by swapBuf around BAR.SYNC and negate swapBuf.
+01:-:-:-:1      STS [writeAs + 4x<0*128 + 0*32>], load0A0;
+--:-:-:-:0      IADD   track0A0.CC, track0A0, 2x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 0*32>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 0*32>], load0A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 0*32>], load0A3;
+
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<0*128 + 1*32>], load1A0;
+--:-:-:-:0      IADD   track1A0.CC, track1A0, 2x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 1*32>], load1A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 1*32>], load1A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 1*32>], load1A3;
+
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS [writeAs + 4x<0*128 + 2*32>], load2A0;
+--:-:-:-:0      IADD   track2A0.CC, track2A0, 2x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 2*32>], load2A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 2*32>], load2A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 2*32>], load2A3;
+
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS [writeAs + 4x<0*128 + 3*32>], load3A0;
+--:-:-:-:0      IADD   track3A0.CC, track3A0, 2x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 3*32>], load3A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 3*32>], load3A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 3*32>], load3A3;
+
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+10:-:-:-:1      STS.128 [writeBs], loadB;
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+// The IADD.X below completes the trackB0.CC add issued before the barrier; none of the IADDs in between carries a .CC suffix.
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+// Issue the LDG loads for load0A-load3A and loadB under predicates P2-P6: one 64-bit load per operand with $vec, four U16 loads per operand otherwise.
+<CODE>
+    our $vec;    # selects the 64-bit (true) or the per-element U16 (false) load text
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.CI.64 load0A, [track0A];
+--:-:4:-:1  @P3 LDG.E.CI.64 load1A, [track1A];
+--:-:5:-:1  @P4 LDG.E.CI.64 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.CI.64 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.CI.64 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];
+    };
+</CODE>
+// Define the package variables $shiftAX, $shiftBX and %insert. Each %insert key has the form jNcM and maps to instruction text; how the keys are consumed is not visible here (presumably by the included hgemm_common_128x32.sass - TODO confirm).
+<CODE>
+    our $vec;
+    our $shiftAX = 1;    # NOTE(review): $shiftAX and $shiftBX are not read in this block - presumably used by the include
+    our $shiftBX = 0;
+    our %insert =
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",    # k -= 16
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",    # P0 = (k >= 16)
+        # Stores of the converted A groups and of loadB, all predicated on P0.
+        j3c6   => "04:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n",
+        j3c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n",
+        j3c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 0*32>], load0A2;\n",
+        j3c12  => "--:3:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n",
+
+        j5c6   => "08:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n",
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n",
+        j5c12  => "--:4:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n",
+
+        j7c6   => "10:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n",
+        j7c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n",
+        j7c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n",
+        j7c12  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n",
+
+        j9c6   => "10:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n",
+        j9c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n",
+        j9c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n",
+        j9c12  => "--:5:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n",
+
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",
+        # Advance each track pointer (IADD .CC / IADD.X pair) under that operand's load predicate.
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 2x<16>;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, 2x<16>;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P4 IADD   track2A0.CC, track2A0, 2x<16>;\n",
+        j7c13  => "--:-:-:-:1  \@P4 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, 2x<16>;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+        # Narrow P2-P6: each one stays set only while k >= 32.
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j7c14  => "--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+        # Barrier followed by the swapBuf buffer swap, all predicated on P0.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Loads and fp16-to-fp32 conversions: 64-bit / packed-half form with $vec, U16 / in-place form otherwise.
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.64 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.64 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.CI.64 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.64 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.64 loadB,  [trackB];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1A3, load1A1.H1;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A2, load1A1.H0;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A1, load1A0.H1;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1A0, load1A0.H0;\n",
+
+                j6c13  => "10:-:-:-:1  \@P4 F2F.F32.F16 load2A3, load2A1.H1;\n",
+                j6c17  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2A2, load2A1.H0;\n",
+                j6c21  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2A1, load2A0.H1;\n",
+                j6c25  => "--:-:5:-:1  \@P4 F2F.F32.F16 load2A0, load2A0.H0;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A3, load3A1.H1;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A2, load3A1.H0;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A1, load3A0.H1;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3A0, load3A0.H0;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];\n",
+                j10c3  => "--:-:5:-:1  \@P4 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0A0, load0A0;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A1, load0A1;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A2, load0A2;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0A3, load0A3;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1A0, load1A0;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A1, load1A1;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A2, load1A2;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1A3, load1A3;\n",
+
+                j6c13  => "10:-:-:-:1  \@P4 F2F.F32.F16 load2A0, load2A0;\n",
+                j6c17  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2A1, load2A1;\n",
+                j6c21  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2A2, load2A2;\n",
+                j6c25  => "--:-:5:-:1  \@P4 F2F.F32.F16 load2A3, load2A3;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A0, load3A0;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A1, load3A1;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A2, load3A2;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3A3, load3A3;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadB3, loadB3;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB2, loadB2;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadB0, loadB0;\n",
+            )
+        ),
+        # Branch to LOOP while P0 is set, otherwise to REMAINDER when P1 is set.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';    # returns the empty string; this block only defines the variables above
+</CODE>
+
+<INCLUDE file="hgemm_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_128x64.sass b/Kernel/SGEMM/Pascal/hgemm_nn_128x64.sass
new file mode 100644
index 0000000..8e6c457
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_nn_128x64.sass
@@ -0,0 +1,438 @@
+# Kernel: hgemm_nn_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Symbolic names used by the kernel: addr_zero is an expression; every other name is bound to a c[0x0][...] operand (grid dimensions and kernel parameters).
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*2 + 64*8*2 + 0>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+// Register names for this kernel. Several ranges overlap (for example 64-95 appears under three different sets of names), so one register number can carry more than one name.
+<REGISTER_MAPPING>
+
+    64-95   ~ tid, blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid7, txa, txa1, ta, xmad_ta, tb, xmad_tb, tidAY, tidBY, k<1-3>, x<1-3>
+
+    0-63    : czero<00-63>
+    // The cx/y rows below give the same registers 0-63 a second set of names arranged as an 8x8 grid.
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-111  : loadA<0-7>, loadAA<0-3>, loadB<0-3>
+
+    112-117 : track0A<0-1>, track1A<0-1>, trackB<0-1>
+
+    118-122 ~ writeAs, writeBs, k, txb, swapBuf
+    123-127 : readAs, readBs
+    // The rows below name registers within 64-107 again (ldc, writeCs, alpha, ...); presumably for the output code in the included common file - TODO confirm.
+    64-83   ~ ldc, ldcz, ci, xmad_c, threadId, tid31, tid96, blockA, blockB, blockZ
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C04y<0-1>, C08y<0-1>, C12y<0-1>, C00y<0-1>
+    86-107  ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+// Kernel setup: read tid and the block indices (blkA from CTAID.Y, blkB from CTAID.Z, blkZ from CTAID.X), then compute the track0A / track1A / trackB pointers, the bounds predicates P4-P6, the initial writeAs / writeBs / readAs / readBs offsets and swapBuf.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,   param_k;
+--:-:-:-:1      MOV lda, param_lda;
+--:-:-:-:1      MOV ldb, param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 4; // ldb = param_ldb8 >> 4
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+// Write 16 zero bytes at addr_zero, then load them back into every czero register.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;    # 16 lines: czero00, czero04, ... czero60
+</CODE>
+
+// tidAX = tid & 0xfe
+// tidAY = (tid & 1) << 2
+01:-:-:-:1      LOP.AND tidAX, tid, 0xfe;
+--:-:-:-:1      LOP.AND tid1,  tid,  1;
+--:-:-:-:1      SHL     tidAY, tid1, 2;
+
+// trackA += 2 * ((blkA*128 + tidAX) * lda + tidAY)
+02:-:-:-:1      ISCADD  txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO ta, lda, txa, tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      track0A0.CC, ta,  param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta,  param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track1A0.CC, lda, track0A0,       1;
+--:-:-:-:1      LEA.HI.X track1A1,    lda, track0A1, RZ,   1;
+// P4 and P5 test txa and txa + 1 against param_m.
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa,  param_m, PT;
+--:-:-:-:1      IADD txa1, txa, 1;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa1, param_m, PT;
+
+// tidBX = (tid & 15) << 2
+// tidBY = (tid >> 4) & 7
+--:-:-:-:1      LOP.AND tidBX, tid,  15;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid,  0x304; // 3 bits at position 4
+
+// trackB += (blkB*64 + ldb*tidBY + tidBX) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 6;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     1;
+--:-:-:-:2      LEA.HI.X trackB1,    tb, param_B[1], RZ, 1;
+// P6 tests txb against param_n.
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// Start the write buffers high
+// writeAs = (128*tidAY + tidAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2;
+// writeBs = (64*tidBY + tidBX) * 4
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2;
+
+// Start the read buffers low
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+// swapBuf starts as the negative of the offset added to writeAs above.
+--:-:-:-:1      MOV32I swapBuf, -4x<64*8 + 128*8>;
+</SCHEDULE_BLOCK>
+// REMAINDER entry: load loadB and the two A rows (track0A, track1A). With $vec: 64-bit LDG loads under P4-P6, with LDS.U.64 from addr_zero into the same registers when the predicate is false. Without $vec: per-element S16 loads, each guarded by a bounds test against k or param_n, with MOV from RZ for the masked elements.
+REMAINDER:
+
+<CODE>
+    our $vec;    # selects the 64-bit (true) or the per-element (false) load text
+    return $vec ? q{
+--:-:6:-:1  @P6 LDG.E.CI.64 loadB0,  [trackB];
+
+--:-:2:-:1  @P5 LDG.E.CI.64 loadA2,  [track1A + 2x<0>];
+--:-:2:-:1  @P5 LDG.E.CI.64 loadAA2, [track1A + 2x<8>];
+
+--:-:3:-:1  @P4 LDG.E.CI.64 loadA0,  [track0A + 2x<0>];
+--:-:3:-:1  @P4 LDG.E.CI.64 loadAA0, [track0A + 2x<8>];
+
+--:-:-:-:0      PSETP.AND.AND P1, PT, PT, PT, PT;
+
+--:-:4:-:1 @!P6 LDS.U.64 loadB0,  [addr_zero];
+--:-:5:-:1 @!P5 LDS.U.64 loadA2,  [addr_zero];
+--:-:5:-:1 @!P4 LDS.U.64 loadA0,  [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.64 loadAA2, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.64 loadAA0, [addr_zero];
+    } : q{
+
+--:-:2:-:2      S2R tid,  SR_TID.X;
+
+<SCHEDULE_BLOCK>
+02:-:-:-:1      LOP.AND tidAY, tid, 1;
+--:-:-:-:1      SHL     tidAY, tidAY, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid, 0x304; // 3 bits at position 4
+
+// doLoad0 = tidBY < k
+--:-:-:-:1      IADD x1, txb, 1;
+--:-:-:-:1      IADD x2, txb, 2;
+--:-:-:-:1      IADD x3, txb, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_n, P0;
+
+--:-:6:-:1  @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];
+--:-:6:-:1  @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];
+--:-:6:-:1  @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];
+--:-:6:-:1  @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+
+--:-:-:-:1      IADD k1, tidAY, 1;
+--:-:-:-:1      IADD k2, tidAY, 2;
+--:-:-:-:1      IADD k3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P4;
+
+--:-:2:-:1  @P0 LDG.E.CI.S16 loadA0, [track0A + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.S16 loadA2, [track0A + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.S16 loadA4, [track0A + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.S16 loadA6, [track0A + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA2, RZ;
+--:-:-:-:1 @!P2 MOV loadA4, RZ;
+--:-:-:-:1 @!P3 MOV loadA6, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:3:-:1  @P0 LDG.E.CI.S16 loadA1, [track1A + 2x<0>];
+--:-:3:-:1  @P1 LDG.E.CI.S16 loadA3, [track1A + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.S16 loadA5, [track1A + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.S16 loadA7, [track1A + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA1, RZ;
+--:-:-:-:1 @!P1 MOV loadA3, RZ;
+--:-:-:-:1 @!P2 MOV loadA5, RZ;
+--:-:-:-:1 @!P3 MOV loadA7, RZ;
+</SCHEDULE_BLOCK>
+    };
+</CODE>
+// Convert the loaded values to fp32 (F2F.F32.F16), advance the track pointers, and store loadB (STS.128 at writeBs) and the A pairs (STS.64 relative to writeAs). With $vec the A pointers advance by 2x<16> and packed .H0/.H1 halves are unpacked; without $vec they advance by 2x<8>, registers are converted in place, and P1 is set to (k > 8).
+<CODE>
+    our $vec;    # selects which of the two instruction texts below is returned
+    return $vec ? q{
+28:-:-:-:4      F2F.F32.F16 loadB3, loadB1.H1;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      F2F.F32.F16 loadB2, loadB1.H0;
+--:-:-:-:4      F2F.F32.F16 loadB1, loadB0.H1;
+--:-:4:-:2      F2F.F32.F16 loadB0, loadB0.H0;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+08:-:-:-:1      STS.128 [writeBs], loadB0;
+
+12:-:-:-:4      F2F.F32.F16 loadA7, loadA3.H1;
+04:-:2:-:4      F2F.F32.F16 loadA6, loadA1.H1;
+--:-:-:-:0      IADD   track0A0.CC, track0A0, 2x<16>;
+--:-:-:-:4      F2F.F32.F16 loadA5, loadA3.H0;
+--:-:3:-:4      F2F.F32.F16 loadA4, loadA1.H0;
+--:-:-:-:0      IADD.X track0A1, track0A1, RZ;
+--:-:-:-:4      F2F.F32.F16 loadA3, loadA2.H1;
+--:-:-:-:0      IADD   track1A0.CC, track1A0, 2x<16>;
+--:-:-:-:4      F2F.F32.F16 loadA1, loadA2.H0;
+--:-:4:-:4      F2F.F32.F16 loadA2, loadA0.H1;
+--:-:5:-:1      F2F.F32.F16 loadA0, loadA0.H0;
+
+--:-:-:-:0      IADD.X track1A1, track1A1, RZ;
+
+02:-:-:-:1      STS.64 [writeAs + 4x<3*128>], loadA6;
+04:-:-:-:1      STS.64 [writeAs + 4x<2*128>], loadA4;
+08:-:-:-:1      STS.64 [writeAs + 4x<1*128>], loadA2;
+10:-:-:-:1      STS.64 [writeAs + 4x<0*128>], loadA0;
+
+    } : q{
+
+20:-:-:-:4      F2F.F32.F16 loadB0, loadB0;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      F2F.F32.F16 loadB1, loadB1;
+--:-:-:-:4      F2F.F32.F16 loadB2, loadB2;
+--:-:6:-:2      F2F.F32.F16 loadB3, loadB3;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+20:-:-:-:1      STS.128 [writeBs], loadB0;
+
+02:-:-:-:4      F2F.F32.F16 loadA0, loadA0;
+04:-:2:-:4      F2F.F32.F16 loadA1, loadA1;
+--:-:-:-:0      IADD   track0A0.CC, track0A0, 2x<8>;
+--:-:-:-:4      F2F.F32.F16 loadA2, loadA2;
+--:-:3:-:4      F2F.F32.F16 loadA3, loadA3;
+--:-:-:-:0      IADD.X track0A1, track0A1, RZ;
+--:-:-:-:4      F2F.F32.F16 loadA4, loadA4;
+--:-:-:-:0      IADD   track1A0.CC, track1A0, 2x<8>;
+--:-:4:-:4      F2F.F32.F16 loadA5, loadA5;
+--:-:-:-:4      F2F.F32.F16 loadA6, loadA6;
+--:-:-:-:0      IADD.X track1A1, track1A1, RZ;
+--:-:5:-:1      F2F.F32.F16 loadA7, loadA7;
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+02:-:-:-:1      STS.64 [writeAs + 4x<0*128>], loadA0;
+04:-:-:-:1      STS.64 [writeAs + 4x<1*128>], loadA2;
+08:-:-:-:1      STS.64 [writeAs + 4x<2*128>], loadA4;
+10:-:-:-:1      STS.64 [writeAs + 4x<3*128>], loadA6;
+    };
+</CODE>
+// Shift the read offsets by -swapBuf before BAR.SYNC and the write offsets by +swapBuf after it, then negate swapBuf.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+
+
+// Define the package variables @top and %insert. $k_end is 16 with $vec and 24 without; %insert keys have the form jNcM and map to instruction text. How @top and %insert are consumed is not visible here (presumably by the included hgemm_common_128x64.sass - TODO confirm).
+<CODE>
+    our $vec;
+    my $k_end = $vec ? 16 : 24;    # threshold compared against k in the ISETP.GE lines below
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P6;\n");
+    our %insert =
+    (
+        ($vec ?
+            (
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P1, PT, !P1, PT, PT;\n",    # P1 = !P1
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND  P0, PT, k, $k_end, PT;\n",    # P0 = (k >= $k_end)
+        j0c15 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P5;\n",
+        # Loads: loadB0 under P3; the A loads under P2, which is recomputed (with P5, then P4) between the two rows.
+        j0c10 => "--:-:2:-:1  \@P3 LDG.E.CI.64 loadB0, [trackB];\n",
+
+        j0c28 => "--:-:4:-:1  \@P2 LDG.E.CI.64 loadA2,  [track1A + 2x<0>];\n",
+        j0c30 => "--:-:4:-:1  \@P2 LDG.E.CI.64 loadAA2, [track1A + 2x<8>];\n",
+
+        j0c31 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P4;\n",
+
+        j0c44 => "--:-:5:-:1  \@P2 LDG.E.CI.64 loadA0,  [track0A + 2x<0>];\n",
+        j0c46 => "--:-:6:-:1  \@P2 LDG.E.CI.64 loadAA0, [track0A + 2x<8>];\n",
+        # Conversions that read the loadAA registers, executed when P1 is clear.
+        j3c53 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA7, loadAA3.H1;\n",
+        j3c57 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA6, loadAA1.H1;\n",
+        j3c61 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA5, loadAA3.H0;\n",
+        j4c1  => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA4, loadAA1.H0;\n",
+        j4c5  => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA3, loadAA2.H1;\n",
+        j4c9  => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA1, loadAA2.H0;\n",
+        j4c13 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA2, loadAA0.H1;\n",
+        j4c17 => "--:-:-:-:1 \@!P1 F2F.F32.F16 loadA0, loadAA0.H0;\n",
+        # Conversions of loadB under P3, followed by its store under P0.
+        j5c5  => "02:-:-:-:1  \@P3 F2F.F32.F16 loadB3, loadB1.H1;\n",
+        j5c9  => "--:-:-:-:1  \@P3 F2F.F32.F16 loadB2, loadB1.H0;\n",
+        j5c13 => "--:-:-:-:1  \@P3 F2F.F32.F16 loadB1, loadB0.H1;\n",
+        j5c17 => "--:-:2:-:1  \@P3 F2F.F32.F16 loadB0, loadB0.H0;\n",
+
+        j5c35 => "02:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+        # Conversions that read the loadA registers, executed when P1 is set.
+        j5c53 => "08:-:-:-:1  \@P1 F2F.F32.F16 loadA7, loadA3.H1;\n",
+        j5c57 => "10:-:2:-:1  \@P1 F2F.F32.F16 loadA6, loadA1.H1;\n",
+        j5c61 => "--:-:-:-:1  \@P1 F2F.F32.F16 loadA5, loadA3.H0;\n",
+        j6c1  => "--:-:3:-:1  \@P1 F2F.F32.F16 loadA4, loadA1.H0;\n",
+        j6c5  => "--:-:-:-:1  \@P1 F2F.F32.F16 loadA3, loadA2.H1;\n",
+        j6c9  => "--:-:-:-:1  \@P1 F2F.F32.F16 loadA1, loadA2.H0;\n",
+        j6c13 => "--:-:4:-:1  \@P1 F2F.F32.F16 loadA2, loadA0.H1;\n",
+        j6c17 => "--:-:5:-:1  \@P1 F2F.F32.F16 loadA0, loadA0.H0;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS.64 [writeAs + 4x<3*128>], loadA6;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS.64 [writeAs + 4x<2*128>], loadA4;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS.64 [writeAs + 4x<1*128>], loadA2;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS.64 [writeAs + 4x<0*128>], loadA0;\n",
+
+        j6c46 => "--:-:-:-:1  \@P1 IADD   track1A0.CC, track1A0, 2x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P1 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c55 => "20:-:-:-:1  \@P1 IADD   track0A0.CC, track0A0, 2x<16>;\n",
+        j7c61 => "--:-:-:-:1  \@P1 IADD.X track0A1,    track0A1, RZ;\n",
+
+            ) :
+            (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P4;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+
+        j0c10 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n",
+        j0c12 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n",
+        j0c14 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n",
+        j0c16 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n",
+
+        j0c33 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA0, [track0A + 2x<0>];\n",
+        j0c35 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA2, [track0A + 2x<1>];\n",
+        j0c37 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA4, [track0A + 2x<2>];\n",
+        j0c39 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA6, [track0A + 2x<3>];\n",
+
+        j0c41 => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P5;\n",
+
+        j1c29 => "--:-:3:-:1  \@P2 LDG.E.CI.S16 loadA1, [track1A + 2x<0>];\n",
+        j1c31 => "--:-:3:-:1  \@P2 LDG.E.CI.S16 loadA3, [track1A + 2x<1>];\n",
+        j1c33 => "--:-:3:-:1  \@P2 LDG.E.CI.S16 loadA5, [track1A + 2x<2>];\n",
+        j1c35 => "--:-:3:-:1  \@P2 LDG.E.CI.S16 loadA7, [track1A + 2x<3>];\n",
+
+        j5c8  => "20:-:-:-:1  \@P3 F2F.F32.F16 loadB0, loadB0;\n",
+        j5c12 => "--:-:-:-:1  \@P3 F2F.F32.F16 loadB1, loadB1;\n",
+        j5c16 => "--:-:-:-:1  \@P3 F2F.F32.F16 loadB2, loadB2;\n",
+        j5c20 => "--:-:6:-:1  \@P3 F2F.F32.F16 loadB3, loadB3;\n",
+
+        j5c39 => "20:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j5c53 => "02:-:-:-:1  \@P0 F2F.F32.F16 loadA0, loadA0;\n",
+        j5c57 => "04:-:2:-:1  \@P0 F2F.F32.F16 loadA1, loadA1;\n",
+        j5c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 loadA2, loadA2;\n",
+        j6c1  => "--:-:3:-:1  \@P0 F2F.F32.F16 loadA3, loadA3;\n",
+        j6c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 loadA4, loadA4;\n",
+        j6c9  => "--:-:4:-:1  \@P0 F2F.F32.F16 loadA5, loadA5;\n",
+        j6c13 => "--:-:-:-:1  \@P0 F2F.F32.F16 loadA6, loadA6;\n",
+        j6c17 => "--:-:5:-:1  \@P0 F2F.F32.F16 loadA7, loadA7;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS.64 [writeAs + 4x<0*128>], loadA0;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS.64 [writeAs + 4x<1*128>], loadA2;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS.64 [writeAs + 4x<2*128>], loadA4;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS.64 [writeAs + 4x<3*128>], loadA6;\n",
+
+        j6c46 => "--:-:-:-:1  \@P0 IADD   track0A0.CC, track0A0, 2x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P0 IADD.X track0A1,    track0A1, RZ;\n",
+        j6c55 => "--:-:-:-:1  \@P0 IADD   track1A0.CC, track1A0, 2x<8>;\n",
+        j6c61 => "--:-:-:-:1  \@P0 IADD.X track1A1,    track1A1, RZ;\n",
+            )
+        ),
+        # Entries shared by both variants: trackB advance, k -= 8, barrier and buffer swap (all but the k update predicated on P0).
+        j5c46 => "--:-:-:-:1  \@P0 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j5c54 => "--:-:-:-:1  \@P0 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j6c63 => "--:-:-:-:0      IADD32I k, k, -8;\n" .
+                 "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Loop branch under P0; only the non-$vec variant also branches to REMAINDER under P1.
+        ($vec ?
+            (j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n") :
+            (j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n")
+        ),
+    );
+    return;    # bare return; this block only defines @top and %insert
+</CODE>
+
+<INCLUDE file="hgemm_common_128x64.sass"/>
diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_16x64.sass b/Kernel/SGEMM/Pascal/hgemm_nn_16x64.sass
new file mode 100644
index 0000000..1dfb949
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_nn_16x64.sass
@@ -0,0 +1,1171 @@
+# Kernel: hgemm_nn_16x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<(16*64 + 32)*2 + 64*64*2>
+    szShareA   : (16*64 + 32)
+    szShareB   : (64*64)
+    // addr_zero expands to 4x of (2*szShareA + 2*szShareB), which is twice the stride the kernel body loads into swapBuf.
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+    // The [0]/[1] suffixes name adjacent 4-byte slots; for param_A and param_B the kernel body passes [0] to LEA and [1] to LEA.HI.X (low and high address words).
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // Registers 0-63 are named cx0..7 y0..7 (an 8x8 grid), in the interleaved order listed on each row.
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+    // czero00..63 renames the same registers 0-63; the setup code fills them from addr_zero, one LDS.U.128 per group of four.
+       0-63 : czero<00-63>
+      64-79 : j0Ay<0-7>, j0Bx<0-7>
+      80-95 : j1Ay<0-7>, j1Bx<0-7>
+    // The names below share registers 64-95 with j0*/j1* above. NOTE(review) the tilde form seems to leave the exact register choice to the assembler, unlike the colon form - confirm.
+      64-95 ~ lda, ldb, ldb8, tidAX, tidAY, tidBX, tidBY, tidAY<1-3>, tidBY<8|16|24|32|40|48|56>, tid16_8, tb, shiftAX, partialK, partialB, ldaz, ldbz, ta, txa, txb, txb<1-3>, xmad_ta
+
+     96-135 :  load0A<0-7>,  load0B<0-3>,  load1B<0-3>,  load2B<0-3>,  load3B<0-3>,  load4B<0-3>,  load5B<0-3>,  load6B<0-3>,  load7B<0-3>
+    136-153 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>, track4B<0-1>, track5B<0-1>, track6B<0-1>, track7B<0-1>
+
+    154-161 ~ swapBuf, readAs, readBs, writeAs, writeBs, k, ldb64
+    162-171 ~ tid, blkA, blkB, blkZ, writeCs, preds, tid16
+    // The remaining entries rename registers already assigned above (0-31, 64-95, 96-161). NOTE(review) presumably for the output stage, which is not visible here - confirm.
+       0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3>
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+      96-99 : loadC<0-3>
+    100-103 : b<0-3>
+    104-107 : c<0-3>
+    108-109 : C<0-1>
+    110-161 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X; // thread index; readers below wait with mask 01 (e.g. SHR.U32 tidAX)
+--:-:2:-:1      S2R blkA, SR_CTAID.Y; // read by ISCADD txa, which waits with mask 02
+--:-:3:-:1      S2R blkB, SR_CTAID.Z; // read by ISCADD txb, which waits with mask 04
+--:-:4:-:1      S2R blkZ, SR_CTAID.X; // read by the XMAD.LO2 ldaz/ldbz lines, which wait with mask 08
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb8,  ldb, 3; // ldb8 = ldb * 8
+--:-:-:-:1      SHL ldb64, ldb, 7; // ldb64 = ldb * 128; the loop adds it directly to the trackB addresses
+// Store zeros at addr_zero, then read them back into czero00..czero60 (one LDS.U.128 per group of four).
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// tidAX   = tid >> 3
+// tidAY   = (tid & 7) << 3
+// shiftAX = (tid & 7) << 2
+01:-:-:-:1      SHR.U32 tidAX,   tid,   3;
+--:-:-:-:1      LOP.AND tidAY,   tid,   7;
+--:-:-:-:1      SHL     shiftAX, tidAY, 2;
+--:-:-:-:1      SHL     tidAY,   tidAY, 3;
+
+// tidBX   = (tid & 15) << 2
+// tidBY   = tid >> 4
+01:-:-:-:1      LOP.AND tidBX, tid,  15;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+01:-:-:-:1      SHR.U32 tidBY, tid,   4;
+
+--:-:-:-:1      IADD tidBY8,  tidBY, 8;
+--:-:-:-:1      IADD tidBY16, tidBY, 16;
+--:-:-:-:1      IADD tidBY24, tidBY, 24;
+--:-:-:-:1      IADD tidBY32, tidBY, 32;
+--:-:-:-:1      IADD tidBY40, tidBY, 40;
+--:-:-:-:1      IADD tidBY48, tidBY, 48;
+--:-:-:-:1      IADD tidBY56, tidBY, 56;
+
+// track0A = param_A + ((blkA*16 + tidAX) * lda + tidAY + blkZ*ldaz) * 2
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 4;
+--:-:-:-:1      XMAD.LO  ta, lda,  txa, tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ, ta;
+--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 1;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa, param_m, PT;
+
+// track0B = param_B + (blkB*64 + tidBX + ldb*tidBY + blkZ*ldbz) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 6;
+--:-:-:-:1      XMAD.LO2 tb, ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb, ldbz, blkZ,  tb;
+
+--:-:-:-:1      LEA      track0B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8; // each following track starts ldb*8 elements after the previous one
+--:-:-:-:1      LEA      track1B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track2B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track2B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track3B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track3B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track4B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track4B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track5B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track5B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track6B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track6B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb, tb, ldb8;
+--:-:-:-:1      LEA      track7B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track7B1,    tb, param_B[1], RZ, 1;
+
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb, param_n, PT;
+[+
+    our $vec; # when $vec is false, also build bounds predicates P4-P6 for columns txb+1 .. txb+3
+    return $vec ? '' : q{
+--:-:-:-:1      IADD txb1, txb, 1;
+--:-:-:-:1      IADD txb2, txb, 2;
+--:-:-:-:1      IADD txb3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb3, param_n, PT;
+    };
++]
+--:-:-:-:1      P2R preds, PR, RZ, 0x7c; // stash the predicate bits selected by mask 0x7c in preds; R2P restores them later
+
+// writeAs = (tidAY*16 + tidAX + shiftAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 4;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      SHL    writeAs, writeAs, 2;
+
+// writeBs = (tidBY*64 + tidBX) * 4 + 4x<szShareA>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA>, 2;
+
+// readAs = (tid & 1) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = ((tid >> 1) & 7) << 4
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHL     readBs, readBs, 4;
+
+// tid16 = tid & -16
+// tid16_8 = tid16 / 2 * 4
+--:-:-:-:1      LOP.AND tid16,   tid,  -16;
+--:-:-:-:1      SHL     tid16_8, tid16, 1;
+
+// writeCs = (readAs + tid16*2) * 64 + readBs;
+--:-:-:-:1      ISCADD writeCs, tid16,   readAs, 1;
+--:-:-:-:1      ISCADD writeCs, writeCs, readBs, 6;
+
+// Each block of 16 threads works on 8 lines, readAs is also shifted over by 4
+// readAs += tid16_8 * 16 + tid16
+// readBs += tid16_8 * 64 + 4x<szShareA>
+--:-:-:-:1      ISCADD readAs, tid16_8, readAs, 4;
+--:-:-:-:1      ISCADD readBs, tid16_8, readBs, 6;
+--:-:-:-:1      IADD   readAs, tid16, readAs;
+--:-:-:-:1      IADD   readBs, readBs, 4x<szShareA>;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareA + szShareB>; // signed stride between the two shared buffers; negated after every swap
+
+// If k is not a multiple of 64 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 64 then make a full 64 line fetch.
+--:-:-:-:1      LOP.AND.Z P0, partialK, k, 63; // partialK = k & 63; P0 is set when that is zero
+--:-:-:-:1  @P0 MOV partialK, 64;
+--:-:-:-:1      IADD k, k, -partialK;
+[+
+    our $vec; # true: 128/64-bit LDG with LDS-from-addr_zero fallback; false: per-element U16 LDG with MOV RZ fallback
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY,   partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY,   partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY8,  partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY16, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY24, partialK, P3;
+<ORDERED>
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P1 LDG.E.CI.64  load0B, [track0B];
+--:-:3:-:1  @P4 LDG.E.CI.64  load1B, [track1B];
+--:-:4:-:1  @P5 LDG.E.CI.64  load2B, [track2B];
+--:-:4:-:1  @P6 LDG.E.CI.64  load3B, [track3B];
+</ORDERED>
+<ORDERED>
+--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:-:-:1 @!P1 LDS.U.64  load0B, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.64  load1B, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.64  load2B, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.64  load3B, [addr_zero];
+</ORDERED>
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY32, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY40, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY48, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY56, partialK, P3;
+<ORDERED>
+--:-:5:-:1  @P1 LDG.E.CI.64  load4B, [track4B];
+--:-:5:-:1  @P4 LDG.E.CI.64  load5B, [track5B];
+--:-:6:-:1  @P5 LDG.E.CI.64  load6B, [track6B];
+--:-:6:-:1  @P6 LDG.E.CI.64  load7B, [track7B];
+</ORDERED>
+<ORDERED>
+--:-:-:-:1 @!P1 LDS.U.64  load4B, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.64  load5B, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.64  load6B, [addr_zero];
+--:-:1:-:1 @!P6 LDS.U.64  load7B, [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY,  partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY1, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY2, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY3, partialK, P2;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:2:-:1  @P6 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0A0, RZ;
+--:-:-:-:1 @!P4 MOV load0A1, RZ;
+--:-:-:-:1 @!P5 MOV load0A2, RZ;
+--:-:-:-:1 @!P6 MOV load0A3, RZ;
+
+--:-:-:-:1      IADD tidAY,  tidAY,  4;
+--:-:-:-:1      IADD tidAY1, tidAY1, 4;
+--:-:-:-:1      IADD tidAY2, tidAY2, 4;
+--:-:-:-:1      IADD tidAY3, tidAY3, 4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY,  partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY1, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY2, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY3, partialK, P2;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P6 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0A4, RZ;
+--:-:-:-:1 @!P4 MOV load0A5, RZ;
+--:-:-:-:1 @!P5 MOV load0A6, RZ;
+--:-:-:-:1 @!P6 MOV load0A7, RZ;
+
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, partialK, PT;
+--:-:-:-:1  @P0 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0B0, RZ;
+--:-:-:-:1 @!P4 MOV load0B1, RZ;
+--:-:-:-:1 @!P5 MOV load0B2, RZ;
+--:-:-:-:1 @!P6 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY8, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P1 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load1B0, RZ;
+--:-:-:-:1 @!P4 MOV load1B1, RZ;
+--:-:-:-:1 @!P5 MOV load1B2, RZ;
+--:-:-:-:1 @!P6 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY16, partialK, PT;
+--:-:-:-:1  @P2 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P2 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load2B0, RZ;
+--:-:-:-:1 @!P4 MOV load2B1, RZ;
+--:-:-:-:1 @!P5 MOV load2B2, RZ;
+--:-:-:-:1 @!P6 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY24, partialK, PT;
+--:-:-:-:1  @P0 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load3B0, RZ;
+--:-:-:-:1 @!P4 MOV load3B1, RZ;
+--:-:-:-:1 @!P5 MOV load3B2, RZ;
+--:-:-:-:1 @!P6 MOV load3B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY32, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P1 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load4B0, RZ;
+--:-:-:-:1 @!P4 MOV load4B1, RZ;
+--:-:-:-:1 @!P5 MOV load4B2, RZ;
+--:-:-:-:1 @!P6 MOV load4B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY40, partialK, PT;
+--:-:-:-:1  @P2 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P2 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load5B3, [track5B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load5B0, RZ;
+--:-:-:-:1 @!P4 MOV load5B1, RZ;
+--:-:-:-:1 @!P5 MOV load5B2, RZ;
+--:-:-:-:1 @!P6 MOV load5B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY48, partialK, PT;
+--:-:-:-:1  @P0 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load6B1, [track6B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load6B0, RZ;
+--:-:-:-:1 @!P4 MOV load6B1, RZ;
+--:-:-:-:1 @!P5 MOV load6B2, RZ;
+--:-:-:-:1 @!P6 MOV load6B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY56, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P1 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load7B0, RZ;
+--:-:-:-:1 @!P4 MOV load7B1, RZ;
+--:-:-:-:1 @!P5 MOV load7B2, RZ;
+--:-:-:-:1 @!P6 MOV load7B3, RZ;
+    };
++]
+// partialB = partialK * ldb
+--:-:-:-:1      XMAD.LO2 partialB, ldb, partialK, RZ;
+
+--:-:-:-:1      ISETP.GE.AND P1, PT, k, 64, PT;
+--:-:-:-:1      IADD k, k, -64;
+--:-:-:-:1  @P1 R2P PR, preds, 0x7c; // P1 set: reload the predicates saved in preds under mask 0x7c
+--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c; // P1 clear: load zeros into those predicates instead
+</SCHEDULE_BLOCK>
+
+[+
+    our $vec; # true: unpack the .H0/.H1 halves of load0A0-3 into load0A0-7; false: convert load0A0-7 in place
+    return $vec ? q{
+03:-:-:-:1      F2F.F32.F16 load0A7, load0A3.H1;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A3.H0;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A2.H1;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A2.H0;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0.H0;
+    } : q{
+02:-:-:-:1      F2F.F32.F16 load0A7, load0A7;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A6;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A5;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A4;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A3;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0;
+    };
++]
+--:-:-:-:0      LEA track0A0.CC, partialK, track0A0, 1; // track0A += partialK * 2; the carry is consumed by the IADD.X below
+01:-:-:-:1      STS [writeAs + 4x<7*16>], load0A7;
+--:-:-:-:1      STS [writeAs + 4x<6*16>], load0A6;
+--:-:-:-:1      STS [writeAs + 4x<5*16>], load0A5;
+--:-:-:-:1      STS [writeAs + 4x<4*16>], load0A4;
+02:-:-:-:1      STS [writeAs + 4x<3*16>], load0A3;
+--:-:-:-:1      STS [writeAs + 4x<2*16>], load0A2;
+--:-:-:-:1      STS [writeAs + 4x<1*16>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<0*16>], load0A0;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+[+
+    our $vec; # true: unpack the .H0/.H1 halves of loadNB0-1 into loadNB0-3; false: convert loadNB0-3 in place
+    return $vec ? q{
+04:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:1:-:1      F2F.F32.F16 load0B0, load0B0.H0;
+--:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+    } : q{
+04:-:-:-:1      F2F.F32.F16 load0B0, load0B0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:1:-:1      F2F.F32.F16 load0B3, load0B3;
+--:-:-:-:1      F2F.F32.F16 load1B0, load1B0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:2:-:1      F2F.F32.F16 load1B3, load1B3;
+    };
++]
+--:-:-:-:0      LEA track0B0.CC, partialB, track0B0, 1; // track0B += partialB * 2; the carry is consumed by the IADD.X below
+01:-:-:-:6      STS.128 [writeBs + 4x<0*64>], load0B;
+--:-:-:-:1      IADD.X track0B1, track0B1, RZ;
+
+--:-:-:-:0      LEA track1B0.CC, partialB, track1B0, 1; // track1B += partialB * 2
+02:-:-:-:6      STS.128 [writeBs + 4x<8*64>], load1B;
+--:-:-:-:0      IADD.X track1B1, track1B1, RZ;
+
+[+
+    our $vec; # true: unpack the .H0/.H1 halves of loadNB0-1 into loadNB0-3; false: convert loadNB0-3 in place
+    return $vec ? q{
+08:-:-:-:1      F2F.F32.F16 load2B3, load2B1.H1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B1.H0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B0.H1;
+--:-:1:-:1      F2F.F32.F16 load2B0, load2B0.H0;
+--:-:-:-:1      F2F.F32.F16 load3B3, load3B1.H1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B1.H0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B0.H1;
+--:-:2:-:1      F2F.F32.F16 load3B0, load3B0.H0;
+    } : q{
+08:-:-:-:1      F2F.F32.F16 load2B0, load2B0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B2;
+--:-:1:-:1      F2F.F32.F16 load2B3, load2B3;
+--:-:-:-:1      F2F.F32.F16 load3B0, load3B0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B2;
+--:-:2:-:1      F2F.F32.F16 load3B3, load3B3;
+    };
++]
+--:-:-:-:0      LEA track2B0.CC, partialB, track2B0, 1; // track2B += partialB * 2; the carry is consumed by the IADD.X below
+01:-:-:-:6      STS.128 [writeBs + 4x<16*64>], load2B;
+--:-:-:-:1      IADD.X track2B1, track2B1, RZ;
+
+--:-:-:-:0      LEA track3B0.CC, partialB, track3B0, 1; // track3B += partialB * 2
+02:-:-:-:6      STS.128 [writeBs + 4x<24*64>], load3B;
+--:-:-:-:0      IADD.X track3B1, track3B1, RZ;
+
+[+
+    our $vec; # true: unpack the .H0/.H1 halves of loadNB0-1 into loadNB0-3; false: convert loadNB0-3 in place
+    return $vec ? q{
+10:-:-:-:1      F2F.F32.F16 load4B3, load4B1.H1;
+--:-:-:-:1      F2F.F32.F16 load4B2, load4B1.H0;
+--:-:-:-:1      F2F.F32.F16 load4B1, load4B0.H1;
+--:-:1:-:1      F2F.F32.F16 load4B0, load4B0.H0;
+--:-:-:-:1      F2F.F32.F16 load5B3, load5B1.H1;
+--:-:-:-:1      F2F.F32.F16 load5B2, load5B1.H0;
+--:-:-:-:1      F2F.F32.F16 load5B1, load5B0.H1;
+--:-:2:-:1      F2F.F32.F16 load5B0, load5B0.H0;
+    } : q{
+10:-:-:-:1      F2F.F32.F16 load4B0, load4B0;
+--:-:-:-:1      F2F.F32.F16 load4B1, load4B1;
+--:-:-:-:1      F2F.F32.F16 load4B2, load4B2;
+--:-:1:-:1      F2F.F32.F16 load4B3, load4B3;
+--:-:-:-:1      F2F.F32.F16 load5B0, load5B0;
+--:-:-:-:1      F2F.F32.F16 load5B1, load5B1;
+--:-:-:-:1      F2F.F32.F16 load5B2, load5B2;
+--:-:2:-:1      F2F.F32.F16 load5B3, load5B3;
+    };
++]
+--:-:-:-:0      LEA track4B0.CC, partialB, track4B0, 1; // track4B += partialB * 2; the carry is consumed by the IADD.X below
+01:-:-:-:6      STS.128 [writeBs + 4x<32*64>], load4B;
+--:-:-:-:1      IADD.X track4B1, track4B1, RZ;
+
+--:-:-:-:0      LEA track5B0.CC, partialB, track5B0, 1; // track5B += partialB * 2
+02:-:-:-:6      STS.128 [writeBs + 4x<40*64>], load5B;
+--:-:-:-:0      IADD.X track5B1, track5B1, RZ;
+
+[+
+    our $vec; # true: unpack the .H0/.H1 halves of loadNB0-1 into loadNB0-3; false: convert loadNB0-3 in place
+    return $vec ? q{
+20:-:-:-:1      F2F.F32.F16 load6B3, load6B1.H1;
+--:-:-:-:1      F2F.F32.F16 load6B2, load6B1.H0;
+--:-:-:-:1      F2F.F32.F16 load6B1, load6B0.H1;
+--:-:1:-:1      F2F.F32.F16 load6B0, load6B0.H0;
+--:-:-:-:1      F2F.F32.F16 load7B3, load7B1.H1;
+--:-:-:-:1      F2F.F32.F16 load7B2, load7B1.H0;
+--:-:-:-:1      F2F.F32.F16 load7B1, load7B0.H1;
+--:-:2:-:1      F2F.F32.F16 load7B0, load7B0.H0;
+    } : q{
+20:-:-:-:1      F2F.F32.F16 load6B0, load6B0;
+--:-:-:-:1      F2F.F32.F16 load6B1, load6B1;
+--:-:-:-:1      F2F.F32.F16 load6B2, load6B2;
+--:-:1:-:1      F2F.F32.F16 load6B3, load6B3;
+--:-:-:-:1      F2F.F32.F16 load7B0, load7B0;
+--:-:-:-:1      F2F.F32.F16 load7B1, load7B1;
+--:-:-:-:1      F2F.F32.F16 load7B2, load7B2;
+--:-:2:-:1      F2F.F32.F16 load7B3, load7B3;
+    };
++]
+--:-:-:-:0      LEA track6B0.CC, partialB, track6B0, 1; // track6B += partialB * 2; the carry is consumed by the IADD.X below
+01:-:-:-:6      STS.128 [writeBs + 4x<48*64>], load6B;
+--:-:-:-:1      IADD.X track6B1, track6B1, RZ;
+
+--:-:-:-:0      LEA track7B0.CC, partialB, track7B0, 1; // track7B += partialB * 2
+02:-:-:-:6      STS.128 [writeBs + 4x<56*64>], load7B;
+--:-:-:-:0      IADD.X track7B1, track7B1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0; // barrier between the shared-memory stores above and the LDS reads below
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf; // move both write pointers by swapBuf
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf; // negate swapBuf so the next swap moves the pointers back
+
+--:-:-:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*16 + 00>];
+--:-:-:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*16 + 08>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>];
+
+[+
+    our $vec; # loads issued before LOOP from the advanced track pointers: A under P2; B under P3 when $vec, else P3-P6 per column
+    return $vec ? q{
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.64  load0B, [track0B];
+--:-:3:-:1  @P3 LDG.E.CI.64  load1B, [track1B];
+--:-:4:-:1  @P3 LDG.E.CI.64  load2B, [track2B];
+--:-:4:-:1  @P3 LDG.E.CI.64  load3B, [track3B];
+--:-:5:-:1  @P3 LDG.E.CI.64  load4B, [track4B];
+--:-:5:-:1  @P3 LDG.E.CI.64  load5B, [track5B];
+--:-:6:-:1  @P3 LDG.E.CI.64  load6B, [track6B];
+--:-:6:-:1  @P3 LDG.E.CI.64  load7B, [track7B];
+    } : q{
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load5B3, [track5B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load6B1, [track6B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>];
+    };
++]
+
+LOOP:
+
+[+
+    our $vec;
+    our %insert =
+    (
+        j0c8   => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, RZ, PT;\n",
+        j0c10  => "--:-:-:-:1      ISETP.GE.AND P1, PT, k, 64, PT;\n" .
+                  "--:-:-:-:1      IADD k, k, -64;\n",
+
+        j0c23  => "--:-:-:-:1  \@P1 R2P PR, preds, 0x7c;\n",
+        j0c24  => "--:-:-:-:1 \@!P1 R2P PR, RZ,    0x7c;\n",
+
+        j2c32  => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 2x<64>;\n",
+        j2c37  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+
+        j3c25  => "--:-:-:-:1  \@P3 IADD   track0B0.CC, track0B0, ldb64;\n",
+        j3c30  => "--:-:-:-:1  \@P3 IADD.X track0B1,    track0B1, RZ;\n",
+        j3c32  => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, ldb64;\n",
+        j3c37  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+
+        j4c25  => "--:-:-:-:1  \@P3 IADD   track2B0.CC, track2B0, ldb64;\n",
+        j4c30  => "--:-:-:-:1  \@P3 IADD.X track2B1,    track2B1, RZ;\n",
+        j4c32  => "--:-:-:-:1  \@P3 IADD   track3B0.CC, track3B0, ldb64;\n",
+        j4c37  => "--:-:-:-:1  \@P3 IADD.X track3B1,    track3B1, RZ;\n",
+
+        j5c25  => "--:-:-:-:1  \@P3 IADD   track4B0.CC, track4B0, ldb64;\n",
+        j5c30  => "--:-:-:-:1  \@P3 IADD.X track4B1,    track4B1, RZ;\n",
+        j5c32  => "--:-:-:-:1  \@P3 IADD   track5B0.CC, track5B0, ldb64;\n",
+        j5c37  => "--:-:-:-:1  \@P3 IADD.X track5B1,    track5B1, RZ;\n",
+
+        j6c25  => "--:-:-:-:1  \@P3 IADD   track6B0.CC, track6B0, ldb64;\n",
+        j6c30  => "--:-:-:-:1  \@P3 IADD.X track6B1,    track6B1, RZ;\n",
+        j6c32  => "--:-:-:-:1  \@P3 IADD   track7B0.CC, track7B0, ldb64;\n",
+        j6c37  => "--:-:-:-:1  \@P3 IADD.X track7B1,    track7B1, RZ;\n",
+
+        j6c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j2c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<7*16>], load0A7;\n",
+        j2c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*16>], load0A6;\n",
+        j2c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*16>], load0A5;\n",
+        j2c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*16>], load0A4;\n",
+        j2c24  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*16>], load0A3;\n",
+        j2c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*16>], load0A2;\n",
+        j2c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*16>], load0A1;\n",
+        j2c30  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<0*16>], load0A0;\n",
+
+        j3c16  => "04:-:-:-:1  \@P0 STS.128 [writeBs + 4x< 0*64>], load0B;\n",
+        j3c20  => "--:3:-:-:1  \@P0 STS.128 [writeBs + 4x< 8*64>], load1B;\n",
+
+        j4c16  => "08:-:-:-:1  \@P0 STS.128 [writeBs + 4x<16*64>], load2B;\n",
+        j4c20  => "--:4:-:-:1  \@P0 STS.128 [writeBs + 4x<24*64>], load3B;\n",
+
+        j5c16  => "10:-:-:-:1  \@P0 STS.128 [writeBs + 4x<32*64>], load4B;\n",
+        j5c20  => "--:5:-:-:1  \@P0 STS.128 [writeBs + 4x<40*64>], load5B;\n",
+
+        j6c16  => "20:-:-:-:1  \@P0 STS.128 [writeBs + 4x<48*64>], load6B;\n",
+        j6c20  => "--:6:-:-:1  \@P0 STS.128 [writeBs + 4x<56*64>], load7B;\n",
+
+        ($vec ?
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j1c63 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j2c36 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j2c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j2c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j2c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n",
+                j2c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j3c36 => "08:-:-:-:1  \@P0 F2F.F32.F16 load2B3, load2B1.H1;\n",
+                j3c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n",
+                j3c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B0.H1;\n",
+                j3c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n",
+                j3c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n",
+
+                j4c36 => "10:-:-:-:1  \@P0 F2F.F32.F16 load4B3, load4B1.H1;\n",
+                j4c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B2, load4B1.H0;\n",
+                j4c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B1, load4B0.H1;\n",
+                j4c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B0, load4B0.H0;\n",
+                j4c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B3, load5B1.H1;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B2, load5B1.H0;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B1, load5B0.H1;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load5B0, load5B0.H0;\n",
+
+                j5c36 => "20:-:-:-:1  \@P0 F2F.F32.F16 load6B3, load6B1.H1;\n",
+                j5c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B2, load6B1.H0;\n",
+                j5c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B1, load6B0.H1;\n",
+                j5c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B0, load6B0.H0;\n",
+                j5c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B3, load7B1.H1;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B2, load7B1.H0;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B1, load7B0.H1;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load7B0, load7B0.H0;\n",
+
+                j2c61 => "02:-:2:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j3c60 => "04:-:-:-:1  \@P3 LDG.E.CI.64  load0B, [track0B];\n",
+                j3c62 => "--:-:3:-:1  \@P3 LDG.E.CI.64  load1B, [track1B];\n",
+                j4c60 => "08:-:-:-:1  \@P3 LDG.E.CI.64  load2B, [track2B];\n",
+                j4c62 => "--:-:4:-:1  \@P3 LDG.E.CI.64  load3B, [track3B];\n",
+                j5c60 => "10:-:-:-:1  \@P3 LDG.E.CI.64  load4B, [track4B];\n",
+                j5c62 => "--:-:5:-:1  \@P3 LDG.E.CI.64  load5B, [track5B];\n",
+                j6c60 => "20:-:-:-:1  \@P3 LDG.E.CI.64  load6B, [track6B];\n",
+                j6c62 => "--:-:6:-:1  \@P3 LDG.E.CI.64  load7B, [track7B];\n",
+            ) :
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A0, load0A0;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A1;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A2;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A3;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A4;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A5;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A6;\n",
+                j1c63 => "--:2:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A7;\n",
+
+                j2c36 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B0, load0B0;\n",
+                j2c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B1;\n",
+                j2c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B2;\n",
+                j2c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B3;\n",
+                j2c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B0, load1B0;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B1;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B2;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load1B3, load1B3;\n",
+
+                j3c36 => "08:-:-:-:1  \@P0 F2F.F32.F16 load2B0, load2B0;\n",
+                j3c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B1;\n",
+                j3c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B2;\n",
+                j3c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B3, load2B3;\n",
+                j3c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B0, load3B0;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B1;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B2;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load3B3, load3B3;\n",
+
+                j4c36 => "10:-:-:-:1  \@P0 F2F.F32.F16 load4B0, load4B0;\n",
+                j4c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B1, load4B1;\n",
+                j4c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B2, load4B2;\n",
+                j4c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load4B3, load4B3;\n",
+                j4c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B0, load5B0;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B1, load5B1;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load5B2, load5B2;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load5B3, load5B3;\n",
+
+                j5c36 => "20:-:-:-:1  \@P0 F2F.F32.F16 load6B0, load6B0;\n",
+                j5c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B1, load6B1;\n",
+                j5c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B2, load6B2;\n",
+                j5c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load6B3, load6B3;\n",
+                j5c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B0, load7B0;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B1, load7B1;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load7B2, load7B2;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load7B3, load7B3;\n",
+
+                j2c48 => "02:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j2c50 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j2c52 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j2c54 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+                j2c56 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n",
+                j2c58 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n",
+                j2c60 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n",
+                j2c62 => "--:-:2:-:1  \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n",
+
+                j3c48 => "04:-:-:-:1  \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
+                j3c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j3c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j3c54 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+                j3c56 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j3c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j3c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j3c62 => "--:-:3:-:1  \@P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+
+                j4c48 => "08:-:-:-:1  \@P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n",
+                j4c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n",
+                j4c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n",
+                j4c54 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n",
+                j4c56 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n",
+                j4c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n",
+                j4c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n",
+                j4c62 => "--:-:4:-:1  \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n",
+
+                j5c48 => "10:-:-:-:1  \@P3 LDG.E.CI.U16 load4B0, [track4B + 2x<0>];\n",
+                j5c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load4B1, [track4B + 2x<1>];\n",
+                j5c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load4B2, [track4B + 2x<2>];\n",
+                j5c54 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load4B3, [track4B + 2x<3>];\n",
+                j5c56 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load5B0, [track5B + 2x<0>];\n",
+                j5c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load5B1, [track5B + 2x<1>];\n",
+                j5c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load5B2, [track5B + 2x<2>];\n",
+                j5c62 => "--:-:5:-:1  \@P6 LDG.E.CI.U16 load5B3, [track5B + 2x<3>];\n",
+
+                j6c48 => "20:-:-:-:1  \@P3 LDG.E.CI.U16 load6B0, [track6B + 2x<0>];\n",
+                j6c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load6B1, [track6B + 2x<1>];\n",
+                j6c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load6B2, [track6B + 2x<2>];\n",
+                j6c54 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load6B3, [track6B + 2x<3>];\n",
+                j6c56 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load7B0, [track7B + 2x<0>];\n",
+                j6c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load7B1, [track7B + 2x<1>];\n",
+                j6c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load7B2, [track7B + 2x<2>];\n",
+                j6c62 => "--:-:6:-:1  \@P6 LDG.E.CI.U16 load7B3, [track7B + 2x<3>];\n",
+            )
+        ),
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out = '';
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*16 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*16 + 08>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha, param_alpha; // alpha scales the accumulators right after this block
+--:-:-:-:1      MOV beta,  param_beta;  // beta feeds P5 below and the FFMA in STORE_C
+--:-:-:-:1      MOV flags, param_flags; // tested for P6 below
+
+// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4
+--:-:-:-:1      LOP.AND tid15, tid, 15;
+--:-:-:-:1      SHR.U32 tid16, tid, 4;
+--:-:-:-:1      SHL     tid15, tid15, 2; // tid15 now holds (tid & 15) * 4
+--:-:-:-:1      ISCADD readCs, tid16, tid15, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*64 + tid15, where tid15 is already (tid & 15) * 4; cx1..cx3 = cx + 1..3
+--:-:-:-:1      ISCADD cx, blkB, tid15, 6;
+--:-:-:-:1      IADD   cx1, cx, 1;
+--:-:-:-:1      IADD   cx2, cx, 2;
+--:-:-:-:1      IADD   cx3, cx, 3;
+
+// cy = blkA*16 + tid16
+--:-:-:-:1      ISCADD cy, blkA, tid16, 4;
+
+// C = param_C + (cy*ldc + cx) * 2, built in ci and formed by the LEA pair (shift 1)
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      SHL  ldc8, ldc, 4; // ldc8 = ldc << 4; STORE_C adds it to C on every call
+
+--:-:-:-:1      XMAD.LO  ci, cy, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci; // NOTE(review): looks like ci += ldcz*blkZ; confirm the XMAD.LO2 macro semantics
+--:-:-:-:1      LEA      C0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C1,    ci, param_C[1], RZ, 1;
+
+// P0..P3 = cx..cx3 < n
+--:-:-:-:1      ISETP.LT.AND P0, PT, cx,  param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cx1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cx2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cx3, param_n, PT;
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f; // stash P0-P3 (mask 0x0f) in preds; re-applied with R2P here and in STORE_C
+
+// P4 = cy < m
+--:-:-:-:1      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+// P5 = beta != 0 && P4
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P4;
+
+// P6 = Apply relu (flags & 2 is non-zero)
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+
+// Init beta preds: P0-P3 come from preds when P5 holds, otherwise from RZ
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha; // shuffle_* = accumulator * alpha; rows y0-y3 are handled first
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0; // stage scaled values in shared memory; stores are interleaved with the next row's FMULs
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:0      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:0      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3;
+--:-:-:-:5      BAR.SYNC 0; // barrier between the STS writes above and the LDS reads in STORE_C
+
+--:-:-:-:5      CAL STORE_C; // STORE_C is the subroutine defined after EXIT
+
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha; // second half: rows y4-y7, same pattern as above
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:0      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:5      BAR.SYNC 0; // NOTE(review): presumably stops any thread overwriting the staging area while others still read it in STORE_C
+--:-:-:-:0      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:0      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7;
+--:-:-:-:5      BAR.SYNC 0; // same barrier as for the first half, before the second STORE_C call
+
+--:-:-:-:5      CAL STORE_C;
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+[+
+    our $vec;   # defined outside this snippet; picks which load template is emitted
+    return (q{
+--:-:1:-:1  @P0 LDG.E.64 loadC, [C];
+    }, q{
+--:-:-:-:0 @!P0 MOV loadC0, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>];
+--:-:-:-:0 @!P1 MOV loadC1, RZ;
+--:-:-:-:1  @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>];
+--:-:-:-:0 @!P2 MOV loadC2, RZ;
+--:-:-:-:1  @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>];
+--:-:-:-:0 @!P3 MOV loadC3, RZ;
+--:-:1:-:1  @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>];
+    })[ $vec ? 0 : 1 ];   # true: one 64-bit load under P0; false: four U16 loads under P0..P3
++]
+
+// Restore output preds: P0-P3 come from preds when P4 holds, otherwise from RZ
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,    0x0f;
+
+--:-:-:-:1      LDS.U.128 part0C, [readCs + 4x<0*8*64>]; // read eight 128-bit parts, 8*64 words apart, from shared memory
+--:-:2:-:1      LDS.U.128 part1C, [readCs + 4x<1*8*64>];
+--:-:-:-:1      LDS.U.128 part2C, [readCs + 4x<2*8*64>];
+--:-:3:-:1      LDS.U.128 part3C, [readCs + 4x<3*8*64>];
+--:-:-:-:1      LDS.U.128 part4C, [readCs + 4x<4*8*64>];
+--:-:4:-:1      LDS.U.128 part5C, [readCs + 4x<5*8*64>];
+--:-:-:-:1      LDS.U.128 part6C, [readCs + 4x<6*8*64>];
+--:-:5:-:1      LDS.U.128 part7C, [readCs + 4x<7*8*64>];
+
+<SCHEDULE_BLOCK>
+02:-:-:-:1  @P0 FADD part0C0, part0C0, part1C0; // first level: part0 += part1, part2 += part3, part4 += part5, part6 += part7
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part1C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part1C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part1C3;
+
+04:-:-:-:1  @P0 FADD part2C0, part2C0, part3C0;
+--:-:-:-:1  @P1 FADD part2C1, part2C1, part3C1;
+--:-:-:-:1  @P2 FADD part2C2, part2C2, part3C2;
+--:-:-:-:1  @P3 FADD part2C3, part2C3, part3C3;
+
+08:-:-:-:1  @P0 FADD part4C0, part4C0, part5C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part5C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part5C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part5C3;
+
+10:-:-:-:1  @P0 FADD part6C0, part6C0, part7C0;
+--:-:-:-:1  @P1 FADD part6C1, part6C1, part7C1;
+--:-:-:-:1  @P2 FADD part6C2, part6C2, part7C2;
+--:-:-:-:1  @P3 FADD part6C3, part6C3, part7C3;
+
+--:-:-:-:1  @P0 FADD part0C0, part0C0, part2C0; // second level: part0 += part2, part4 += part6
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part2C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part2C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part2C3;
+
+--:-:-:-:1  @P0 FADD part4C0, part4C0, part6C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part6C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part6C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part6C3;
+
+--:-:-:-:1  @P0 FADD c0, part0C0, part4C0; // last level: c0..c3 = sum of all eight parts
+--:-:-:-:1  @P1 FADD c1, part0C1, part4C1;
+--:-:-:-:1  @P2 FADD c2, part0C2, part4C2;
+--:-:-:-:1  @P3 FADD c3, part0C3, part4C3;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:0      IADD cy, cy, 8; // advance the row index; P4/P5 are re-tested against param_m further down
+
+[+
+    our $vec;   # defined outside this snippet; picks which conversion template is emitted
+    return (q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0.H0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC0.H1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC1.H0;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC1.H1;
+    }, q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC2;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC3;
+    })[ $vec ? 0 : 1 ];   # true: unpack .H0/.H1 halves of loadC0/loadC1; false: one register per value
++]
+
+01:-:-:-:1  @P5 FFMA c0, b0, beta, c0; // c += b * beta, only when P5 holds
+02:-:-:-:1  @P5 FFMA c1, b1, beta, c1;
+04:-:-:-:1  @P5 FFMA c2, b2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, b3, beta, c3;
+
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT; // relu step, applied only when P6 holds
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:0      ISETP.LT.AND P5, PT, cy, param_m, P5; // P5 &&= cy < m, using the cy already advanced by 8
+
+--:-:1:-:1  @P0 F2F.F16.F32 c0, c0; // convert results to F16 for the store
+--:-:2:-:1  @P1 F2F.F16.F32 c1, c1;
+
+--:-:-:-:0      ISETP.LT.AND P4, PT, cy, param_m, PT; // P4 = cy < m for the next call
+
+--:-:3:-:1  @P2 F2F.F16.F32 c2, c2;
+--:-:4:-:1  @P3 F2F.F16.F32 c3, c3;
+
+[+
+    our $vec;   # defined outside this snippet; picks which store template is emitted
+    return (q{
+03:-:-:-:2  @P0 BFI c0, c1, 0x1010, c0;
+0c:-:-:-:2  @P0 BFI c1, c3, 0x1010, c2;
+
+--:1:-:-:1  @P0 STG.E.CG.64 [C], c;
+    }, q{
+01:-:-:-:1  @P0 STG.E.U16 [C + 2x<0>], c0;
+02:-:-:-:1  @P1 STG.E.U16 [C + 2x<1>], c1;
+04:-:-:-:1  @P2 STG.E.U16 [C + 2x<2>], c2;
+08:1:-:-:1  @P3 STG.E.U16 [C + 2x<3>], c3;
+    })[ $vec ? 0 : 1 ];   # true: BFI-pack pairs, then one 64-bit store; false: four predicated U16 stores
++]
+
+// Restore beta preds: P0-P3 come from preds when the updated P5 holds, otherwise from RZ
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+01:-:-:-:6      IADD   C0.CC, C0, ldc8; // C += ldc8 as a two-word add: the low word writes CC here
+--:-:-:-:0      IADD.X C1,    C1, RZ;   // high word of the same add (the .X form takes the carry)
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_32x128.sass b/Kernel/SGEMM/Pascal/hgemm_nn_32x128.sass
new file mode 100644
index 0000000..8c4510d
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_nn_32x128.sass
@@ -0,0 +1,562 @@
+# Kernel: hgemm_nn_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*16*2 + (32*16 + 32)*2>
+    szShareA : (32*16 + 32)
+    szShareB : (128*16)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    32-79 ~ tidAX, tidBX, lda, ldb, ldb4, ldaz, ldbz, tid1, tid3, tid96, ta, tb0, tb1, tb2, tb3, xmad_ta, xmad_tb, shiftAX, tidAY<1-3>, tidBY<1-3>, txb<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadA<0-3>
+      84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3>
+
+    100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+
+    110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txa, txb
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;   // each S2R uses its own barrier slot (1-4); the 01/02/04/08 wait masks below appear to pair with them
+--:-:2:-:1      S2R blkB, SR_CTAID.Z; // note the axis assignment: blkB <- CTAID.Z
+--:-:3:-:1      S2R blkA, SR_CTAID.Y; // blkA <- CTAID.Y
+--:-:4:-:1      S2R blkZ, SR_CTAID.X; // blkZ <- CTAID.X
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 4; // ldb = param_ldb8 >> 4
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb4,  ldb, 2; // ldb4 = ldb * 4: spacing between tb0..tb3 below
+--:-:-:-:1      SHL ldb16, ldb, 5; // ldb16 = ldb << 5: added to the B track pointers after each tile is stored
+
+--:-:-:-:1      STS.128 [addr_zero], RZ; // write RZ to the 16 bytes at addr_zero; read back below for czero* and for masked-off loads
+<CODE>
+    return join '', map { my $first_reg = 4 * $_; sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $first_reg) } 0 .. 7;  # eight 128-bit loads: czero00, 04, ..., 28
+</CODE>
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 2
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,  2;
+01:-:-:-:1      LOP.AND tid3,    tid,  3;
+--:-:-:-:1      SHL     tidAY,   tid3, 2;
+--:-:-:-:1      SHL     shiftAX, tid3, 3;
+
+// tidBX = (tid & 31) << 2
+// tidBY = (tid >> 5)
+01:-:-:-:1      LOP.AND tidBX, tid,   31;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   5;
+
+// trackA += ((blkA*32 + tidAX) * lda + tidAY) * 2
+04:-:-:-:1      ISCADD   txa, blkA, tidAX, 5;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,  tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta; // NOTE(review): looks like ta += ldaz*blkZ; confirm the XMAD.LO2 macro semantics
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 1;
+
+// trackB += (blkB*128 + tidBX + ldb*tidBY) * 2   (LEA shift is 1, as for trackA; tb1..tb3 follow at ldb4 steps)
+02:-:-:-:1      ISCADD   txb, blkB, tidBX, 7;
+--:-:-:-:1      XMAD.LO2 tb0, ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb0, ldbz, blkZ,  tb0;
+--:-:-:-:1      IADD     tb1, tb0, ldb4;
+--:-:-:-:1      IADD     tb2, tb1, ldb4;
+--:-:-:-:1      IADD     tb3, tb2, ldb4;
+
+--:-:-:-:1      LEA      track0B0.CC, tb0, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb0, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track1B0.CC, tb1, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb1, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track2B0.CC, tb2, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track2B1,    tb2, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track3B0.CC, tb3, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track3B1,    tb3, param_B[1], RZ, 1;
+
+// writeAs = (tidAY*32 + tidAX + shiftAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2; // scale by 4, then add the constant 4x<szShareA + szShareB>
+
+// writeBs = (tidBY*128 + tidBX) * 4
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 7;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2; // scale by 4, then add the constant 4x<szShareA*2 + szShareB>
+
+// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    16;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4
+01:-:-:-:1      LOP.AND tid96,  tid,    96;
+--:-:-:-:1      SHR.U32 tid96,  tid96,  2;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readBs, readBs, tid96;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4; // shift left by 4, then add the constant 4x<szShareA>
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>; // starts negative; negated after every buffer swap further down
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      IADD tidBY1, tidBY, 4;  // tidBY1..3 = tidBY + 4/8/12; compared against k for the track1B..track3B loads
+--:-:-:-:1      IADD tidBY2, tidBY, 8;
+--:-:-:-:1      IADD tidBY3, tidBY, 12;
+
+<CODE>
+    our $vec;   # defined outside this snippet; picks which load template is emitted
+    return (q{
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidBY3, k, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY,  k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.64 load0B, [track0B];
+--:-:2:-:1  @P1 LDG.E.CI.64 load1B, [track1B];
+--:-:3:-:1  @P2 LDG.E.CI.64 load2B, [track2B];
+--:-:4:-:1  @P3 LDG.E.CI.64 load3B, [track3B];
+--:-:5:-:1  @P4 LDG.E.CI.64 loadA,  [trackA];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P0 LDS.U.64 load0B, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.64 load1B, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.64 load2B, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.64 load3B, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.64 loadA,  [addr_zero];
+</ORDERED>
+
+    }, q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:1:-:1  @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:1:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:1:-:1  @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY1, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY2, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P6;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:3:-:1  @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B0, RZ;
+--:-:-:-:1 @!P1 MOV load2B1, RZ;
+--:-:-:-:1 @!P2 MOV load2B2, RZ;
+--:-:-:-:1 @!P3 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY3, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:4:-:1  @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:4:-:1  @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B0, RZ;
+--:-:-:-:1 @!P1 MOV load3B1, RZ;
+--:-:-:-:1 @!P2 MOV load3B2, RZ;
+--:-:-:-:1 @!P3 MOV load3B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];
+--:-:5:-:1  @P1 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];
+--:-:5:-:1  @P2 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+    })[ $vec ? 0 : 1 ];   # true: 64-bit loads with zero fill from addr_zero; false: per-element U16 loads with MOV RZ fill
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P5; // P2 = P3 = P5 = (k >= 32) && P5; P5 was last set to txb < n in both load templates
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6; // P6 = (k >= 32) && P6; P6 was last set to txa < m
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P0, RZ, k, 15;       // P0 = (k & 15) != 0
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 16, P0; // P1 = bDoRemainder
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;   # defined outside this snippet; picks which conversion template is emitted
+    return (q{
+21:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:1:-:1      F2F.F32.F16 load0B0, load0B0.H0;
+
+02:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+
+04:-:-:-:1      F2F.F32.F16 load2B3, load2B1.H1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B1.H0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B0.H1;
+--:-:3:-:1      F2F.F32.F16 load2B0, load2B0.H0;
+
+08:-:-:-:1      F2F.F32.F16 load3B3, load3B1.H1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B1.H0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B0.H1;
+--:-:4:-:1      F2F.F32.F16 load3B0, load3B0.H0;
+
+10:-:-:-:1      F2F.F32.F16 loadA3, loadA1.H1;
+--:-:-:-:1      F2F.F32.F16 loadA2, loadA1.H0;
+--:-:-:-:1      F2F.F32.F16 loadA1, loadA0.H1;
+--:-:5:-:1      F2F.F32.F16 loadA0, loadA0.H0;
+    }, q{
+21:-:-:-:1      F2F.F32.F16 load0B0, load0B0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:1:-:1      F2F.F32.F16 load0B3, load0B3;
+
+02:-:-:-:1      F2F.F32.F16 load1B0, load1B0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:2:-:1      F2F.F32.F16 load1B3, load1B3;
+
+04:-:-:-:1      F2F.F32.F16 load2B0, load2B0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B2;
+--:-:3:-:1      F2F.F32.F16 load2B3, load2B3;
+
+08:-:-:-:1      F2F.F32.F16 load3B0, load3B0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B2;
+--:-:4:-:1      F2F.F32.F16 load3B3, load3B3;
+
+10:-:-:-:1      F2F.F32.F16 loadA0, loadA0;
+--:-:-:-:1      F2F.F32.F16 loadA1, loadA1;
+--:-:-:-:1      F2F.F32.F16 loadA2, loadA2;
+--:-:5:-:1      F2F.F32.F16 loadA3, loadA3;
+    })[ $vec ? 0 : 1 ];   # true: expand .H0/.H1 halves, highest destination first; false: convert each register in place
+</CODE>
+
+// Store each converted B group to shared memory and step its 64-bit track
+// pointer by ldb16: IADD on the low word produces the carry (.CC) that the
+// following IADD.X folds into the high word.
+01:-:-:-:1      STS.128 [writeBs + 4x<0*128>], load0B;
+--:-:-:-:6      IADD   track0B0.CC, track0B0, ldb16;
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ;
+
+02:-:-:-:1      STS.128 [writeBs + 4x<4*128>], load1B;
+--:-:-:-:6      IADD   track1B0.CC, track1B0, ldb16;
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;
+
+04:-:-:-:1      STS.128 [writeBs + 4x<8*128>], load2B;
+--:-:-:-:6      IADD   track2B0.CC, track2B0, ldb16;
+--:-:-:-:0      IADD.X track2B1,    track2B1, RZ;
+
+08:-:-:-:1      STS.128 [writeBs + 4x<12*128>], load3B;
+--:-:-:-:6      IADD   track3B0.CC, track3B0, ldb16;
+--:-:-:-:0      IADD.X track3B1,    track3B1, RZ;
+
+// Store the four A values and step the low word of trackA by 2x<16>.
+// The matching IADD.X on trackA1 is issued after the buffer swap below;
+// none of the IADDs in between write the carry flag (no .CC).
+10:-:-:-:1      STS [writeAs + 4x<0*32>], loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*32>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*32>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*32>], loadA3;
+
+// Move the read offsets by -swapBuf and the write offsets by +swapBuf around
+// the barrier, then negate swapBuf so the next swap moves them back.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+// High word of the trackA advance started above.
+--:-:-:-:0      IADD.X trackA1,    trackA1, RZ;
+
+<CODE>
+    # Emits the predicated global loads for the next tile. The vector form
+    # issues one 64-bit load per group; the scalar form issues four U16 loads
+    # per group at element offsets 0..3. Predicates P2, P3, P5 and P6 gate
+    # the groups exactly as in the instruction text below.
+    our $vec;
+    my $prefetch;
+    if ($vec) {
+        $prefetch = q{
+--:-:3:-:1  @P2 LDG.E.CI.64 load0B, [track0B];
+--:-:4:-:1  @P3 LDG.E.CI.64 load1B, [track1B];
+--:-:5:-:1  @P5 LDG.E.CI.64 load2B, [track2B];
+--:-:5:-:1  @P5 LDG.E.CI.64 load3B, [track3B];
+--:-:6:-:1  @P6 LDG.E.CI.64 loadA,  [trackA];
+    };
+    }
+    else {
+        $prefetch = q{
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];
+    };
+    }
+    return $prefetch;
+</CODE>
+
+<CODE>
+    # Builds %insert: instruction text keyed by "j<N>c<M>" strings, and returns
+    # an empty string so this section itself contributes no instructions.
+    # NOTE(review): presumably the included hgemm_common_32x128.sass splices
+    # each entry into the main loop at the slot its key names, and also reads
+    # $shiftAX / $shiftBX - confirm against that file.
+    our $vec;
+    our $shiftAX = 1;
+    our $shiftBX = 0;
+    our %insert =
+    (
+        # Loop counter: consume 16 of k and set P0 while at least 16 remain.
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        # Stores of the previously loaded values to shared memory, only when P0.
+        j3c6   => "04:3:-:-:1  \@P0 STS.128 [writeBs + 4x< 0*128>], load0B;\n",
+        j5c6   => "08:4:-:-:1  \@P0 STS.128 [writeBs + 4x< 4*128>], load1B;\n",
+        j7c6   => "10:-:-:-:1  \@P0 STS.128 [writeBs + 4x< 8*128>], load2B;\n",
+        j9c6   => "10:5:-:-:1  \@P0 STS.128 [writeBs + 4x<12*128>], load3B;\n",
+        j11c6  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<3*32>], loadA3;\n",
+        j11c8  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], loadA2;\n",
+        j11c10 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], loadA1;\n",
+        j11c12 => "--:6:-:-:1  \@P0 STS [writeAs + 4x<0*32>], loadA0;\n",
+
+        # 64-bit track pointer advances (low word with carry, then high word),
+        # each gated by the predicate of the load it feeds.
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0B0.CC, track0B0, ldb16;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0B1,    track0B1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, ldb16;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P5 IADD   track2B0.CC, track2B0, ldb16;\n",
+        j7c13  => "--:-:-:-:1  \@P5 IADD.X track2B1,    track2B1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3B0.CC, track3B0, ldb16;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3B1,    track3B1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackA0.CC,  trackA0, 2x<16>;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackA1,     trackA1, RZ;\n",
+
+        # Each load predicate stays set only while k >= 32 (ANDed with itself).
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        # Barrier, then flip read/write shared offsets and negate swapBuf.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        # Global loads and F2F conversions: vector (64-bit, .H0/.H1 halves)
+        # when $vec is set, otherwise scalar U16 loads converted in place.
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.64 load0B, [track0B];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.64 load1B, [track1B];\n",
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.64 load2B, [track2B];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.64 load3B, [track3B];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.64 loadA,  [trackA];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0B0, load0B0.H0;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j6c13  => "10:-:-:-:1  \@P5 F2F.F32.F16 load2B3, load2B1.H1;\n",
+                j6c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2B2, load2B1.H0;\n",
+                j6c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2B1, load2B0.H1;\n",
+                j6c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load2B0, load2B0.H0;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B3, load3B1.H1;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B2, load3B1.H0;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B1, load3B0.H1;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3B0, load3B0.H0;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadA3, loadA1.H1;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA2, loadA1.H0;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA1, loadA0.H1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadA0, loadA0.H0;\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0B0, load0B0;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B1, load0B1;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B2, load0B2;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0B3, load0B3;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1B0, load1B0;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B1, load1B1;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B2, load1B2;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1B3, load1B3;\n",
+
+                j6c13  => "10:-:-:-:1  \@P5 F2F.F32.F16 load2B0, load2B0;\n",
+                j6c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2B1, load2B1;\n",
+                j6c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2B2, load2B2;\n",
+                j6c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load2B3, load2B3;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B0, load3B0;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B1, load3B1;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B2, load3B2;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3B3, load3B3;\n",
+
+                # NOTE(review): j10c17 sets barrier 6 in the middle of the
+                # group and j10c25 sets it again; every other group here sets
+                # its barrier only on the last conversion. Looks like a
+                # copy/paste leftover - confirm before changing the schedule.
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadA3, loadA3;\n",
+                j10c17 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadA2, loadA2;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA1, loadA1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadA0, loadA0;\n",
+            )
+        ),
+
+        # Loop back while P0 holds; otherwise branch to REMAINDER when P1 holds.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="hgemm_common_32x128.sass"/>
diff --git a/Kernel/SGEMM/Pascal/hgemm_nn_32x64.sass b/Kernel/SGEMM/Pascal/hgemm_nn_32x64.sass
new file mode 100644
index 0000000..56b813f
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_nn_32x64.sass
@@ -0,0 +1,913 @@
+# Kernel: hgemm_nn_32x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+// Symbolic names used by the kernel body: the shared-memory sizes
+// (szShareA, szShareB) and the addr_zero offset just past two copies of both,
+// followed by the constant-bank slots c[0x0][...] that hold the grid
+// dimensions and the kernel parameters. 64-bit pointer parameters occupy two
+// consecutive 4-byte slots ([0] and [1]).
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<32*33*2 + 64*32*2>
+    szShareA   : (32*33)
+    szShareB   : (64*32)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+// Register allocation. Several ranges are given more than one set of names
+// (0-63 for the cx<..>y<..> accumulators, czero and part?C; 64-95 for the
+// j0/j1 operands, the setup scalars and shuffle_x; 96-137 for the load/track
+// registers and the output-phase names), so those names alias the same
+// registers.
+// NOTE(review): presumably the aliased groups are live in different phases of
+// the kernel - verify before adding a name to any of these ranges.
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+      64-79 : j0Ay<0-7>, j0Bx<0-7>
+      80-95 : j1Ay<0-7>, j1Bx<0-7>
+
+      64-95 ~ lda, ldb, ldb8, tidAX, tidAY, tidBX, tidBY, tidAY<1-3>, tidBY<8|16|24>, tid1, tid32, tb, shiftAX, partialK, partialB, ldaz, ldbz, ta, txa, txb, txb<1-3>, xmad_ta, xmad_tb
+
+     96-119 :  load0A<0-7>,  load0B<0-3>,  load1B<0-3>,  load2B<0-3>,  load3B<0-3>
+    120-129 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+
+    130-137 ~ swapBuf, readAs, readBs, writeAs, writeBs, k, ldb32
+    138-144 ~ tid, blkA, blkB, blkZ, writeCs, preds
+
+       0-15 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+      96-99 : loadC<0-3>
+    100-103 : b<0-3>
+    104-107 : c<0-3>
+    108-109 : C<0-1>
+    110-137 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15, tid16
+
+</REGISTER_MAPPING>
+
+// Read the thread index and the three block indices. Note the axis mapping:
+// blkA comes from CTAID.Y, blkB from CTAID.Z and blkZ from CTAID.X.
+// Each S2R sets its own barrier (1-4 in the control field); the first users
+// of each value in the schedule block below carry the matching wait mask.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+// Kernel setup: load parameters, zero the accumulators, derive the per-thread
+// indices and build the global track pointers for A and B.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+// ldb8 = ldb * 8 is added to the element index tb below.
+// ldb32 = ldb << 6; presumably 32 rows of ldb in bytes (2 bytes per element),
+// since it is added directly to the byte pointers in the loop - confirm.
+--:-:-:-:1      SHL ldb8,  ldb, 3;
+--:-:-:-:1      SHL ldb32, ldb, 6;
+
+// Store RZ to shared memory at addr_zero and read it back into czero00,
+// czero04, ... czero60, four registers per 128-bit load.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 3
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,   2;
+--:-:-:-:1      LOP.AND tidAY,   tid,   3;
+--:-:-:-:1      SHL     shiftAX, tidAY, 3;
+--:-:-:-:1      SHL     tidAY,   tidAY, 3;
+
+// tidBX   = (tid & 15) << 2
+// tidBY   = tid >> 4
+01:-:-:-:1      LOP.AND tidBX, tid,  15;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   4;
+
+// Row indices of the three further B row groups (tidBY + 8, 16, 24).
+--:-:-:-:1      IADD tidBY8,  tidBY, 8;
+--:-:-:-:1      IADD tidBY16, tidBY, 16;
+--:-:-:-:1      IADD tidBY24, tidBY, 24;
+
+// trackA += ((blkA*32 + tidAX) * lda + tidAY) * 2
+// (plus ldaz * blkZ, added by the XMAD.LO2 before the scale by 2)
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 5;
+--:-:-:-:1      XMAD.LO  ta, lda,  txa, tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ, ta;
+--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 1;
+
+// P2 = this thread's A index txa is below m.
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa, param_m, PT;
+
+// trackB += (blkB*64 + tidBX + ldb*tidBY) * 2
+// (plus ldbz * blkZ); tracks 1-3 are each a further ldb8 elements on.
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 6;
+--:-:-:-:1      XMAD.LO2 tb, ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb, ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      track0B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb,  tb,  ldb8;
+--:-:-:-:1      LEA      track1B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb,  tb,  ldb8;
+--:-:-:-:1      LEA      track2B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track2B1,    tb, param_B[1], RZ, 1;
+--:-:-:-:1      IADD     tb,  tb,  ldb8;
+--:-:-:-:1      LEA      track3B0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track3B1,    tb, param_B[1], RZ, 1;
+
+// P3 = this thread's B index txb is below n.
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb, param_n, PT;
+[+
+    # Scalar path only: bounds predicates P4-P6 for the three columns after
+    # txb (txb + 1, + 2, + 3 compared against param_n). The vector path emits
+    # nothing here.
+    our $vec;
+    return '' if $vec;
+    return q{
+--:-:-:-:1      IADD txb1, txb, 1;
+--:-:-:-:1      IADD txb2, txb, 2;
+--:-:-:-:1      IADD txb3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb3, param_n, PT;
+    };
++]
+// Save predicates P2-P6 (mask 0x7c) into preds so they can be restored later.
+--:-:-:-:1      P2R preds, PR, RZ, 0x7c;
+
+// writeAs = (tidAY*32 + tidAX + shiftAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      SHL    writeAs, writeAs, 2;
+
+// writeBs = (tidBY*64 + tidBX) * 4 + 4x<szShareA>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA>, 2;
+
+// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,   16;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = ((tid >> 1) & 7) << 4
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHL     readBs, readBs, 4;
+
+// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5)
+// tid32 = tid & -32
+--:-:-:-:1      LOP.AND tid32, tid, -32;
+
+// Write out the 4 groups of 32 rows 16 at a time
+// writeCs = (readAs + tid32/2*4) * 64 + readBs
+--:-:-:-:1      ISCADD writeCs, tid32,   readAs, 1;
+--:-:-:-:1      ISCADD writeCs, writeCs, readBs, 6;
+
+// Each block of 32 threads works on 8 lines,
+// readAs is also shifted over by 8 for each group of 32 threads
+// readAs += tid32/4 * 32 * 4 + tid32/4 * 4
+// readBs += tid32/4 * 64 * 4 + 4x<szShareA>
+--:-:-:-:1      ISCADD readAs, tid32,  readAs, 5;
+--:-:-:-:1      ISCADD readBs, tid32,  readBs, 6;
+--:-:-:-:1      IADD   readAs, tid32,  readAs;
+--:-:-:-:1      IADD   readBs, readBs, 4x<szShareA>;
+
+// swapBuf is the byte size of one A+B buffer pair; it is added to or
+// subtracted from the read/write offsets and negated at each swap.
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareA + szShareB>;
+
+// If k is not a multiple of 32 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 32 then make a full 32 line fetch.
+// partialK = k & 31, replaced by 32 when that is zero; k is then reduced by it.
+--:-:-:-:1      LOP.AND.Z P0, partialK, k, 31;
+--:-:-:-:1  @P0 MOV partialK, 32;
+--:-:-:-:1      IADD k, k, -partialK;
+[+
+    # First fetch, bounded by partialK.
+    # Vector path: one predicate per load (row index below partialK, ANDed
+    # with the m/n bounds predicate); registers whose load is predicated off
+    # are filled from addr_zero instead.
+    # Scalar path: A uses P3-P6 for elements 0-3 and again for 4-7 after
+    # stepping tidAY by 4. For each B row group, P3-P6 are reloaded from
+    # preds (mask 0x78) when the row is below partialK and cleared otherwise;
+    # elements whose load is predicated off are set to RZ.
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY,   partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY8,  partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY16, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY24, partialK, P3;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidBY,   partialK, P3;
+<ORDERED>
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.64  load0B, [track0B];
+--:-:4:-:1  @P4 LDG.E.CI.64  load1B, [track1B];
+--:-:5:-:1  @P5 LDG.E.CI.64  load2B, [track2B];
+--:-:6:-:1  @P6 LDG.E.CI.64  load3B, [track3B];
+</ORDERED>
+<ORDERED>
+--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.64  load0B, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.64  load1B, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.64  load2B, [addr_zero];
+--:-:1:-:1 @!P6 LDS.U.64  load3B, [addr_zero];
+</ORDERED>
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY,  partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY1, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY2, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY3, partialK, P2;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:2:-:1  @P6 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0A0, RZ;
+--:-:-:-:1 @!P4 MOV load0A1, RZ;
+--:-:-:-:1 @!P5 MOV load0A2, RZ;
+--:-:-:-:1 @!P6 MOV load0A3, RZ;
+
+--:-:-:-:1      IADD tidAY,  tidAY,  4;
+--:-:-:-:1      IADD tidAY1, tidAY1, 4;
+--:-:-:-:1      IADD tidAY2, tidAY2, 4;
+--:-:-:-:1      IADD tidAY3, tidAY3, 4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY,  partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY1, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY2, partialK, P2;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY3, partialK, P2;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P6 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0A4, RZ;
+--:-:-:-:1 @!P4 MOV load0A5, RZ;
+--:-:-:-:1 @!P5 MOV load0A6, RZ;
+--:-:-:-:1 @!P6 MOV load0A7, RZ;
+
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, partialK, PT;
+--:-:-:-:1  @P0 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load0B0, RZ;
+--:-:-:-:1 @!P4 MOV load0B1, RZ;
+--:-:-:-:1 @!P5 MOV load0B2, RZ;
+--:-:-:-:1 @!P6 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY8, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P1 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load1B0, RZ;
+--:-:-:-:1 @!P4 MOV load1B1, RZ;
+--:-:-:-:1 @!P5 MOV load1B2, RZ;
+--:-:-:-:1 @!P6 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY16, partialK, PT;
+--:-:-:-:1  @P2 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P2 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load2B0, RZ;
+--:-:-:-:1 @!P4 MOV load2B1, RZ;
+--:-:-:-:1 @!P5 MOV load2B2, RZ;
+--:-:-:-:1 @!P6 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY24, partialK, PT;
+--:-:-:-:1  @P0 R2P PR, preds, 0x78;
+--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
+<ORDERED>
+--:-:-:-:1  @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P3 MOV load3B0, RZ;
+--:-:-:-:1 @!P4 MOV load3B1, RZ;
+--:-:-:-:1 @!P5 MOV load3B2, RZ;
+--:-:-:-:1 @!P6 MOV load3B3, RZ;
+
+    };
++]
+// partialB = partialK * ldb
+--:-:-:-:1      XMAD.LO2 partialB, ldb, partialK, RZ;
+
+// P1 = at least 32 of k remain after the partial fetch. When it holds,
+// P2-P6 are restored from preds (mask 0x7c); otherwise they are cleared.
+--:-:-:-:1      ISETP.GE.AND P1, PT, k, 32, PT;
+--:-:-:-:1      IADD k, k, -32;
+--:-:-:-:1  @P1 R2P PR, preds, 0x7c;
+--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c;
+</SCHEDULE_BLOCK>
+
+[+
+    # Converts the eight A values of the first fetch with F2F.F32.F16.
+    # Vector path: load0A0-3 are split into .H0/.H1 halves, highest
+    # destination first so each source is read before it is overwritten.
+    # Scalar path: each register is converted onto itself.
+    our $vec;
+    return $vec ? q{
+03:-:-:-:1      F2F.F32.F16 load0A7, load0A3.H1;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A3.H0;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A2.H1;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A2.H0;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0.H0;
+    } : q{
+02:-:-:-:1      F2F.F32.F16 load0A7, load0A7;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A6;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A5;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A4;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A3;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0;
+    };
++]
+// Advance track0A by partialK elements (LEA shift of 1 = 2 bytes each); the
+// carry from the low word is folded into track0A1 by the IADD.X after the
+// stores, none of which touch the carry flag.
+--:-:-:-:0      LEA track0A0.CC, partialK, track0A0, 1;
+// Store the upper four A values once barrier 1 clears, the lower four once
+// barrier 2 clears (wait masks 01 and 02).
+01:-:-:-:1      STS [writeAs + 4x<7*32>], load0A7;
+--:-:-:-:1      STS [writeAs + 4x<6*32>], load0A6;
+--:-:-:-:1      STS [writeAs + 4x<5*32>], load0A5;
+--:-:-:-:1      STS [writeAs + 4x<4*32>], load0A4;
+02:-:-:-:1      STS [writeAs + 4x<3*32>], load0A3;
+--:-:-:-:1      STS [writeAs + 4x<2*32>], load0A2;
+--:-:-:-:1      STS [writeAs + 4x<1*32>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<0*32>], load0A0;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+[+
+    # Converts the four B groups of the first fetch with F2F.F32.F16; each
+    # group waits on the barrier of its own load (masks 04, 08, 10, 20) and
+    # sets that barrier again on its last conversion.
+    # Vector path: load?B0/load?B1 are split into .H0/.H1 halves, highest
+    # destination first. Scalar path: each register is converted onto itself.
+    our $vec;
+    return $vec ? q{
+04:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:3:-:1      F2F.F32.F16 load0B0, load0B0.H0;
+
+08:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:4:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+
+10:-:-:-:1      F2F.F32.F16 load2B3, load2B1.H1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B1.H0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B0.H1;
+--:-:5:-:1      F2F.F32.F16 load2B0, load2B0.H0;
+
+20:-:-:-:1      F2F.F32.F16 load3B3, load3B1.H1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B1.H0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B0.H1;
+--:-:6:-:1      F2F.F32.F16 load3B0, load3B0.H0;
+    } : q{
+04:-:-:-:1      F2F.F32.F16 load0B0, load0B0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:3:-:1      F2F.F32.F16 load0B3, load0B3;
+
+08:-:-:-:1      F2F.F32.F16 load1B0, load1B0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:4:-:1      F2F.F32.F16 load1B3, load1B3;
+
+10:-:-:-:1      F2F.F32.F16 load2B0, load2B0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B2;
+--:-:5:-:1      F2F.F32.F16 load2B3, load2B3;
+
+20:-:-:-:1      F2F.F32.F16 load3B0, load3B0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B2;
+--:-:6:-:1      F2F.F32.F16 load3B3, load3B3;
+    };
++]
+
+// For each B group: advance the track pointer by partialB elements (LEA
+// shift of 1), store the converted group to shared memory, then fold the
+// carry into the high word.
+--:-:-:-:0      LEA track0B0.CC, partialB, track0B0, 1;
+04:-:-:-:6      STS.128 [writeBs + 4x<0*64>], load0B;
+--:-:-:-:1      IADD.X track0B1, track0B1, RZ;
+
+--:-:-:-:0      LEA track1B0.CC, partialB, track1B0, 1;
+08:-:-:-:6      STS.128 [writeBs + 4x<8*64>], load1B;
+--:-:-:-:1      IADD.X track1B1, track1B1, RZ;
+
+--:-:-:-:0      LEA track2B0.CC, partialB, track2B0, 1;
+10:-:-:-:6      STS.128 [writeBs + 4x<16*64>], load2B;
+--:-:-:-:1      IADD.X track2B1, track2B1, RZ;
+
+--:-:-:-:0      LEA track3B0.CC, partialB, track3B0, 1;
+20:-:-:-:6      STS.128 [writeBs + 4x<24*64>], load3B;
+--:-:-:-:0      IADD.X track3B1, track3B1, RZ;
+
+// Barrier after the stores, then point the write offsets at the other
+// buffer and negate swapBuf for the next swap.
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+
+// Read the first (j0) A and B operands from shared memory; the last load
+// sets barrier 1.
+--:-:-:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>];
+
+[+
+    # Global loads issued just before LOOP. The vector form uses one
+    # 128-bit A load under P2 and four 64-bit B loads under P3; the scalar
+    # form uses eight U16 A loads under P2 and, per B group, four U16 loads
+    # under P3-P6.
+    our $vec;
+    if ($vec) {
+        return q{
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.64  load0B, [track0B];
+--:-:4:-:1  @P3 LDG.E.CI.64  load1B, [track1B];
+--:-:5:-:1  @P3 LDG.E.CI.64  load2B, [track2B];
+--:-:6:-:1  @P3 LDG.E.CI.64  load3B, [track3B];
+    };
+    }
+    return q{
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:4:-:1  @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:5:-:1  @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+    };
++]
+
+LOOP:
+
+[+
+    our $vec;
+    our %insert =
+    (
+        j0c8   => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, RZ, PT;\n",
+        j0c10  => "--:-:-:-:1      ISETP.GE.AND P1, PT, k, 32, PT;\n" .
+                  "--:-:-:-:1      IADD k, k, -32;\n",
+
+        j0c23  => "--:-:-:-:1  \@P1 R2P PR, preds, 0x7c;\n",
+        j0c24  => "--:-:-:-:1 \@!P1 R2P PR, RZ,    0x7c;\n",
+
+        j2c32  => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 2x<32>;\n",
+        j2c37  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j3c32  => "--:-:-:-:1  \@P3 IADD   track0B0.CC, track0B0, ldb32;\n",
+        j3c37  => "--:-:-:-:1  \@P3 IADD.X track0B1,    track0B1, RZ;\n",
+        j4c32  => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, ldb32;\n",
+        j4c37  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+        j5c32  => "--:-:-:-:1  \@P3 IADD   track2B0.CC, track2B0, ldb32;\n",
+        j5c37  => "--:-:-:-:1  \@P3 IADD.X track2B1,    track2B1, RZ;\n",
+        j6c32  => "--:-:-:-:1  \@P3 IADD   track3B0.CC, track3B0, ldb32;\n",
+        j6c37  => "--:-:-:-:1  \@P3 IADD.X track3B1,    track3B1, RZ;\n",
+
+        j6c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j2c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<7*32>], load0A7;\n",
+        j2c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*32>], load0A6;\n",
+        j2c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*32>], load0A5;\n",
+        j2c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*32>], load0A4;\n",
+        j2c24  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*32>], load0A3;\n",
+        j2c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], load0A2;\n",
+        j2c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], load0A1;\n",
+        j2c30  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<0*32>], load0A0;\n",
+
+        j3c16  => "04:3:-:-:1  \@P0 STS.128 [writeBs + 4x< 0*64>], load0B;\n",
+        j4c16  => "08:4:-:-:1  \@P0 STS.128 [writeBs + 4x< 8*64>], load1B;\n",
+        j5c16  => "10:5:-:-:1  \@P0 STS.128 [writeBs + 4x<16*64>], load2B;\n",
+        j6c16  => "20:6:-:-:1  \@P0 STS.128 [writeBs + 4x<24*64>], load3B;\n",
+
+        ($vec ?
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j1c63 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j2c51 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n",
+
+                j3c51 => "08:-:-:-:1  \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j4c51 => "10:-:-:-:1  \@P0 F2F.F32.F16 load2B3, load2B1.H1;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B0.H1;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n",
+
+                j5c51 => "20:-:-:-:1  \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n",
+
+                j2c61 => "02:-:2:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j3c61 => "04:-:3:-:1  \@P3 LDG.E.CI.64  load0B, [track0B];\n",
+                j4c61 => "08:-:4:-:1  \@P3 LDG.E.CI.64  load1B, [track1B];\n",
+                j5c61 => "10:-:5:-:1  \@P3 LDG.E.CI.64  load2B, [track2B];\n",
+                j6c61 => "20:-:6:-:1  \@P3 LDG.E.CI.64  load3B, [track3B];\n",
+            ) :
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A0, load0A0;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A1;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A2;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A3;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A4;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A5;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A6;\n",
+                j1c63 => "--:2:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A7;\n",
+
+                j2c51 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B0, load0B0;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B1;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B2;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load0B3, load0B3;\n",
+
+                j3c51 => "08:-:-:-:1  \@P0 F2F.F32.F16 load1B0, load1B0;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B1;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B2;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load1B3, load1B3;\n",
+
+                j4c51 => "10:-:-:-:1  \@P0 F2F.F32.F16 load2B0, load2B0;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B1;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B2;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load2B3, load2B3;\n",
+
+                j5c51 => "20:-:-:-:1  \@P0 F2F.F32.F16 load3B0, load3B0;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B1;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B2;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load3B3, load3B3;\n",
+
+                j2c48 => "02:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j2c50 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j2c52 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j2c54 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+                j2c56 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n",
+                j2c58 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n",
+                j2c60 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n",
+                j2c62 => "--:-:2:-:1  \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n",
+
+                j3c56 => "04:-:-:-:1  \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
+                j3c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j3c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j3c62 => "--:-:3:-:1  \@P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+
+                j4c56 => "08:-:-:-:1  \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j4c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j4c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j4c62 => "--:-:4:-:1  \@P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+
+                j5c56 => "10:-:-:-:1  \@P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n",
+                j5c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n",
+                j5c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n",
+                j5c62 => "--:-:5:-:1  \@P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n",
+
+                j6c56 => "20:-:-:-:1  \@P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n",
+                j6c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n",
+                j6c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n",
+                j6c62 => "--:-:6:-:1  \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n",
+            )
+        ),
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out = '';
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha, param_alpha; // copy the alpha, beta and flags parameters into alpha, beta, flags
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4
+--:-:-:-:1      LOP.AND tid15, tid,   15;
+--:-:-:-:1      SHR.U32 tid16, tid,    4;
+--:-:-:-:1      SHL     tid15, tid15,  2;
+--:-:-:-:1      ISCADD readCs, tid16,  tid15, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*64 + tid15, where tid15 already holds (tid & 15) * 4
+--:-:-:-:1      ISCADD cx, blkB, tid15, 6;
+--:-:-:-:1      IADD   cx1, cx, 1; // cx1..cx3 = cx + 1..3, the other three columns tested against n below
+--:-:-:-:1      IADD   cx2, cx, 2;
+--:-:-:-:1      IADD   cx3, cx, 3;
+
+// cy = blkA*32 + tid16
+--:-:-:-:1      ISCADD cy, blkA, tid16, 5;
+
+// C += (cy*ldc + cx) * 2; NOTE(review): the XMAD.LO2 presumably also folds ldcz*blkZ into ci - confirm
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      SHL  ldc8, ldc, 4; // ldc8 = ldc * 16; STORE_C adds it to C0 at the end of each call
+
+--:-:-:-:1      XMAD.LO  ci, cy, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C1,    ci, param_C[1], RZ, 1;
+
+// P0..P3 = (cx + 0..3) < n; the P2R then saves them in preds (mask 0x0f)
+--:-:-:-:1      ISETP.LT.AND P0, PT, cx,  param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cx1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cx2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cx3, param_n, PT;
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+
+// P4 = cy < m
+--:-:-:-:1      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+// P5 = beta != 0 && P4
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P4;
+
+// P6 = Apply relu
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+
+// Init beta preds: P0-P3 = preds when P5, otherwise loaded from RZ (all clear)
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha; // rows y0-y3: shuffle_xNyM = cxNyM * alpha
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0; // NOTE(review): presumably stores x0-x3 of row y0 as one 128-bit vector - register layout not visible here
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:0      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:0      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3;
+--:-:-:-:5      BAR.SYNC 0; // sync after the STS group; STORE_C then reads back through readCs (presumably the same staging area)
+
+--:-:-:-:5      CAL STORE_C; // each STORE_C call ends by adding 8 to cy and ldc8 to C
+--:-:-:-:5      CAL STORE_C;
+
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha; // rows y4-y7: same pattern, reusing the same writeCs offsets
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:0      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:5      BAR.SYNC 0; // NOTE(review): presumably keeps the STS below from overwriting data the previous STORE_C calls were still reading - confirm
+--:-:-:-:0      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:0      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_C;
+--:-:-:-:5      CAL STORE_C;
+
+--:-:-:-:5      EXIT; // kernel ends after the four STORE_C calls
+
+STORE_C:
+
+[+
+    our $vec;  # fetch the existing C values under the beta preds: one 64-bit load when $vec, else four U16 loads (zeroed when the pred is off)
+    return $vec ? q{
+--:-:1:-:1  @P0 LDG.E.64 loadC, [C];
+    } : q{
+--:-:-:-:0 @!P0 MOV loadC0, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>];
+--:-:-:-:0 @!P1 MOV loadC1, RZ;
+--:-:-:-:1  @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>];
+--:-:-:-:0 @!P2 MOV loadC2, RZ;
+--:-:-:-:1  @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>];
+--:-:-:-:0 @!P3 MOV loadC3, RZ;
+--:-:1:-:1  @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>];
+    };
++]
+
+// Restore output preds: P0-P3 = preds when P4 (cy < m), otherwise loaded from RZ (all clear)
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,    0x0f;
+
+--:-:-:-:1      LDS.U.128 part0C, [readCs + 4x< 0*64>]; // part0C..part3C are summed element-wise below
+--:-:2:-:1      LDS.U.128 part1C, [readCs + 4x<16*64>];
+--:-:-:-:1      LDS.U.128 part2C, [readCs + 4x<32*64>];
+--:-:3:-:1      LDS.U.128 part3C, [readCs + 4x<48*64>];
+
+<SCHEDULE_BLOCK>
+02:-:-:-:1  @P0 FADD part0C0, part0C0, part1C0;
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part1C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part1C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part1C3;
+
+04:-:-:-:1  @P0 FADD part2C0, part2C0, part3C0;
+--:-:-:-:1  @P1 FADD part2C1, part2C1, part3C1;
+--:-:-:-:1  @P2 FADD part2C2, part2C2, part3C2;
+--:-:-:-:1  @P3 FADD part2C3, part2C3, part3C3;
+
+--:-:-:-:1  @P0 FADD c0, part0C0, part2C0; // cN = part0CN + part1CN + part2CN + part3CN
+--:-:-:-:1  @P1 FADD c1, part0C1, part2C1;
+--:-:-:-:1  @P2 FADD c2, part0C2, part2C2;
+--:-:-:-:1  @P3 FADD c3, part0C3, part2C3;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:0      IADD cy, cy, 8; // advance cy now; the ISETP updates of P5 and P4 below test this new row
+
+[+
+    our $vec;  # widen the loaded C values to F32 in b0..b3: packed .H0/.H1 halves when $vec, else one value per loadC register
+    return $vec ? q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0.H0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC0.H1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC1.H0;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC1.H1;
+    } : q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC2;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC3;
+    };
++]
+
+01:-:-:-:1  @P5 FFMA c0, b0, beta, c0; // cN += bN * beta, only when P5
+02:-:-:-:1  @P5 FFMA c1, b1, beta, c1;
+04:-:-:-:1  @P5 FFMA c2, b2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, b3, beta, c3;
+
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT; // relu step, only when P6 (flags & 2)
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:0      ISETP.LT.AND P5, PT, cy, param_m, P5; // P5 = P5 && (cy < m) using the already advanced cy
+
+--:-:1:-:1  @P0 F2F.F16.F32 c0, c0; // convert the results back to F16 in place
+--:-:2:-:1  @P1 F2F.F16.F32 c1, c1;
+
+--:-:-:-:0      ISETP.LT.AND P4, PT, cy, param_m, PT; // P4 = cy < m for the next call
+
+--:-:3:-:1  @P2 F2F.F16.F32 c2, c2;
+
+--:-:-:-:0      LOP.XOR readCs, readCs, 4x<8*64>; // flip this offset bit in readCs for the next call
+
+--:-:4:-:1  @P3 F2F.F16.F32 c3, c3;
+
+[+
+    our $vec;  # write the F16 results to C: BFI-packed pairs and one 64-bit store when $vec, else four predicated U16 stores
+    return $vec ? q{
+03:-:-:-:2  @P0 BFI c0, c1, 0x1010, c0;
+0c:-:-:-:2  @P0 BFI c1, c3, 0x1010, c2;
+
+--:1:-:-:1  @P0 STG.E.CG.64 [C], c;
+    } : q{
+01:-:-:-:1  @P0 STG.E.U16 [C + 2x<0>], c0;
+02:-:-:-:1  @P1 STG.E.U16 [C + 2x<1>], c1;
+04:-:-:-:1  @P2 STG.E.U16 [C + 2x<2>], c2;
+08:1:-:-:1  @P3 STG.E.U16 [C + 2x<3>], c3;
+    };
++]
+
+// Restore beta preds for the next call's loads of C: P0-P3 = preds when P5, otherwise all clear
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+01:-:-:-:6      IADD   C0.CC, C0, ldc8; // C += ldc8: low word with carry-out, then carry into the high word
+--:-:-:-:0      IADD.X C1,    C1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/hgemm_nt_128x128.sass b/Kernel/SGEMM/Pascal/hgemm_nt_128x128.sass
new file mode 100644
index 0000000..29a50f0
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_nt_128x128.sass
@@ -0,0 +1,400 @@
+# Kernel: hgemm_nt_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[-
+# Build switch ($int16) and the input-conversion opcode that later template blocks interpolate.
+our $int16;
+our $convert = 'F2F.F32.F16';
+$convert = 'I2F.F32.S16' if $int16;
+
+sub convert_in { return $convert; }    # accessor for the opcode chosen above
+# Extra CONSTANT_MAPPING entries for the int16 build; empty string otherwise.
+sub int16_params {
+    return "" unless $int16;
+    return q{
+param_Stats[0]  : c[0x0][0x190]
+param_Stats[1]  : c[0x0][0x194]
+param_scale     : c[0x0][0x198]
+    };
+}
+-]
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+    [+ int16_params() +]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-95   ~ tidX, blk, lda, ldb, ldaz, ldbz, tid1, tid7, tid128, tid127, txa, txb, xmad_ta, xmad_tb, k1, k2, k3
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-107  : loadA<0-5>, loadB<0-5>
+
+    108-111 : trackA<0-1>, trackB<0-1>
+
+    112-118 ~ writeS, k, tidY, ta, tb, loop
+    119-127 ~ readAs, readBs, tid, blkA, blkB, blkZ
+
+    64-75   ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128
+
+    64-79   : c<0-7>, d3, d2, d1, d0, cs<0-3>
+    64-65   : Stats<0-1>
+    80-89   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    90-118  ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X; // the S2R results carry barrier numbers 1-4; the 01/02/04/08 wait masks on their first users below line up with them
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,  param_k;
+--:-:-:-:1      LOP.AND tid1, tid,  1;
+--:-:-:-:1      LOP.AND tid128, tid,  128;
+--:-:-:-:1      MOV loop, RZ;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ; // write zeros at addr_zero; the LDS.U.128 loads generated below read them back into czero00..czero63, which share registers 0-63 with the cx*y* accumulators
+[+
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);  # emits 16 loads: czero00, czero04, ..., czero60
++]
+
+--:-:-:-:1      MOV lda, param_lda;
+--:-:-:-:1      MOV ldb, param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// tidY = tid1 << 2
+--:-:-:-:1      SHL tidY, tid1, 2;
+
+// tidX = tid >> 1
+01:-:-:-:1      SHR.U32 tidX,  tid,   1;
+
+// trackA += 2 * ((blkA*128 + tidX) * lda + tidY); NOTE(review): the XMAD.LO2 presumably also adds ldaz*blkZ - confirm
+02:-:-:-:1      ISCADD  txa, blkA, tidX, 7;
+--:-:-:-:1      XMAD.LO  ta, lda,  txa,  tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x1;
+
+// trackB += 2 * ((blkB*128 + tidX) * ldb + tidY); NOTE(review): the XMAD.LO2 presumably also adds ldbz*blkZ - confirm
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 7;
+--:-:-:-:1      XMAD.LO  tb,  ldb,  txb,  tidY, xmad_tb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x1;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT; // P5 = txa < m (guards the loads through trackA)
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT; // P6 = txb < n (guards the loads through trackB)
+
+// writeS = 4 * (128 * tidY + tidX) + 4*128*8*2
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      ISCADD  writeS, writeS, 4x<128*8*2>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4096;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+[+
+    our $vec;  # picks the load template: $vec => four 64-bit loads guarded by P5/P6 with addr_zero fallbacks; otherwise per-element S16 loads guarded by k and P5/P6, zero-filled when off
+    return $vec ? q{
+// k must be multiple of 8
+--:-:-:-:0      PSETP.AND.AND P1, PT, PT, PT, PT;
+
+--:-:2:-:1  @P5 LDG.E.CI.64 loadA0, [trackA + 2x<0>];
+--:-:2:-:1  @P5 LDG.E.CI.64 loadA4, [trackA + 2x<8>];
+--:-:4:-:1  @P6 LDG.E.CI.64 loadB0, [trackB + 2x<0>];
+--:5:6:-:1  @P6 LDG.E.CI.64 loadB4, [trackB + 2x<8>];
+
+--:-:3:-:1 @!P5 LDS.U.64    loadA0, [addr_zero];
+--:-:3:-:1 @!P5 LDS.U.64    loadA4, [addr_zero];
+--:-:3:-:1 @!P6 LDS.U.64    loadB0, [addr_zero];
+--:-:3:-:2 @!P6 LDS.U.64    loadB4, [addr_zero];
+
+    // Vec 4 and scalar loads
+    } : q{
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD k1, tidY, 1;
+--:-:-:-:1      IADD k2, tidY, 2;
+--:-:-:-:1      IADD k3, tidY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:2:-:1  @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P6;
+
+--:-:4:-:1  @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];
+--:-:4:-:1  @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];
+--:-:4:-:1  @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:1      LOP.AND.NZ P4, RZ, k, 7;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 8, P4;
+</SCHEDULE_BLOCK>
+    };  # both branches leave P1 set for the main loop: always true when $vec, the remainder test otherwise
++]
+
+[+
+    our $vec;      # $vec => unpack the .H0/.H1 halves of the 64-bit loads; otherwise convert each loaded register in place
+    our $convert;  # conversion opcode interpolated into the qq{} templates below (I2F.F32.S16 or F2F.F32.F16)
+    return $vec ? qq{
+
+06:-:1:-:4      $convert loadA3, loadA1.H1;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<16>;
+--:-:2:-:4      $convert loadA2, loadA1.H0;
+--:-:-:-:4      $convert loadA1, loadA0.H1;
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+--:-:3:-:1      $convert loadA0, loadA0.H0;
+
+01:-:-:-:1      STS [writeS + 4x<3*128>], loadA3;
+02:-:-:-:1      STS [writeS + 4x<2*128>], loadA2;
+04:-:-:-:1      STS [writeS + 4x<1*128>], loadA1;
+--:-:-:-:1      STS [writeS + 4x<0*128>], loadA0;
+
+08:-:1:-:4      $convert loadB3, loadB1.H1;
+10:-:-:-:0      IADD   trackB0.CC, trackB0, 2x<16>;
+--:-:2:-:4      $convert loadB2, loadB1.H0;
+--:-:3:-:4      $convert loadB1, loadB0.H1;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+--:-:4:-:1      $convert loadB0, loadB0.H0;
+
+01:-:-:-:1      STS [writeS + 4x<11*128>], loadB3;
+02:-:-:-:1      STS [writeS + 4x<10*128>], loadB2;
+04:-:-:-:1      STS [writeS + 4x< 9*128>], loadB1;
+08:-:-:-:1      STS [writeS + 4x< 8*128>], loadB0;
+
+    // scalar loads
+    } : qq{
+
+02:-:-:-:4      $convert loadA0, loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<8>;
+--:-:2:-:4      $convert loadA1, loadA1;
+--:-:-:-:4      $convert loadA2, loadA2;
+--:-:3:-:1      $convert loadA3, loadA3;
+
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+02:-:-:-:1      STS [writeS + 4x<0*128>], loadA0;
+--:-:-:-:1      STS [writeS + 4x<1*128>], loadA1;
+04:-:-:-:1      STS [writeS + 4x<2*128>], loadA2;
+--:-:-:-:1      STS [writeS + 4x<3*128>], loadA3;
+
+08:-:-:-:4      $convert loadB0, loadB0;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, 2x<8>;
+--:-:2:-:4      $convert loadB1, loadB1;
+--:-:-:-:4      $convert loadB2, loadB2;
+--:-:3:-:1      $convert loadB3, loadB3;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+02:-:-:-:1      STS [writeS + 4x< 8*128>], loadB0;
+--:-:-:-:1      STS [writeS + 4x< 9*128>], loadB1;
+04:-:-:-:1      STS [writeS + 4x<10*128>], loadB2;
+--:-:-:-:1      STS [writeS + 4x<11*128>], loadB3;
+    };  # the track pointers advance by 2x<16> in the $vec template and 2x<8> in the scalar one
++]
+
+
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>; // XOR flips the same offset bit in readAs, readBs and writeS; NOTE(review): presumably switches between two halves of a double buffer - confirm
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0; // sync after the STS group emitted just above
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+
+[+
+    our $vec;
+    our $convert;
+    our @top = $vec ?  # one ISETP on k >= 16: sets P0 when $vec, or P2 (also requiring P5) otherwise
+        ("--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n") :
+        ("--:-:-:-:1      ISETP.GE.AND P2, PT, k, 16, P5;\n");
+    our %insert =  # instructions keyed "j<N>c<M>"; NOTE(review): presumably spliced into the FFMA loop by the included hgemm_common_128x128.sass - confirm
+    (
+        ($vec ?
+            (
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P1, PT, !P1, PT, PT;\n",  # P1 = !P1
+        j0c13 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P5;\n",  # P2 = P0 && P1 && P5 (guards the trackA loads)
+        j0c15 => "--:-:-:-:1      PSETP.AND.AND P3, PT, P0, P1, P6;\n",  # P3 = P0 && P1 && P6 (guards the trackB loads)
+
+        j0c27 => "--:-:2:-:1  \@P2 LDG.E.CI.64 loadA0, [trackA + 2x<0>];\n",
+        j0c29 => "--:-:2:-:1  \@P2 LDG.E.CI.64 loadA4, [trackA + 2x<8>];\n",
+        j0c31 => "--:-:4:-:1  \@P3 LDG.E.CI.64 loadB0, [trackB + 2x<0>];\n",
+        j0c33 => "20:5:6:-:1  \@P3 LDG.E.CI.64 loadB4, [trackB + 2x<8>];\n",
+
+        j3c5  => "--:-:-:-:1 \@!P1 $convert loadA3, loadA5.H1;\n",  # when !P1 the sources are loadA4/loadA5 (and loadB4/loadB5 below)
+        j3c9  => "--:-:-:-:1 \@!P1 $convert loadA2, loadA5.H0;\n",
+        j3c13 => "--:-:-:-:1 \@!P1 $convert loadA1, loadA4.H1;\n",
+        j3c17 => "--:-:-:-:1 \@!P1 $convert loadA0, loadA4.H0;\n",
+
+        j4c5  => "--:-:-:-:1 \@!P1 $convert loadB3, loadB5.H1;\n",
+        j4c9  => "--:-:-:-:1 \@!P1 $convert loadB2, loadB5.H0;\n",
+        j4c13 => "--:-:-:-:1 \@!P1 $convert loadB1, loadB4.H1;\n",
+        j4c17 => "--:-:-:-:1 \@!P1 $convert loadB0, loadB4.H0;\n",
+
+        j5c5  => "02:-:-:-:1  \@P1 $convert loadA3, loadA1.H1;\n",  # when P1 the sources are loadA0/loadA1 (and loadB0/loadB1 below)
+        j5c9  => "--:-:2:-:1  \@P1 $convert loadA2, loadA1.H0;\n",
+        j5c13 => "--:-:-:-:1  \@P1 $convert loadA1, loadA0.H1;\n",
+        j5c17 => "--:-:3:-:1  \@P1 $convert loadA0, loadA0.H0;\n",
+
+        j5c29 => "02:-:-:-:1  \@P0 STS [writeS + 4x<3*128>], loadA3;\n",
+        j5c31 => "--:-:-:-:1  \@P0 STS [writeS + 4x<2*128>], loadA2;\n",
+        j5c33 => "04:-:-:-:1  \@P0 STS [writeS + 4x<1*128>], loadA1;\n",
+        j5c35 => "--:-:-:-:1  \@P0 STS [writeS + 4x<0*128>], loadA0;\n",
+
+        j6c5  => "08:-:-:-:1  \@P1 $convert loadB3, loadB1.H1;\n",
+        j6c9  => "--:-:2:-:1  \@P1 $convert loadB2, loadB1.H0;\n",
+        j6c13 => "--:-:3:-:1  \@P1 $convert loadB1, loadB0.H1;\n",
+        j6c17 => "--:-:4:-:1  \@P1 $convert loadB0, loadB0.H0;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS [writeS + 4x<11*128>], loadB3;\n",
+        j6c31 => "--:-:-:-:1  \@P0 STS [writeS + 4x<10*128>], loadB2;\n",
+        j6c33 => "04:-:-:-:1  \@P0 STS [writeS + 4x< 9*128>], loadB1;\n",
+        j6c35 => "08:-:-:-:1  \@P0 STS [writeS + 4x< 8*128>], loadB0;\n",
+
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 2x<16>;\n",  # 64-bit pointer bump: low word with carry-out here, carry-in at j5c54
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c46 => "10:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, 2x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",  # branch back to LOOP while P0 (k >= 16)
+            ) :
+            (
+        j0c7  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 16, P6;\n",  # P3 = k >= 16 && P6 (guards the trackB loads)
+        j0c8  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",  # P0 = k >= 16 (guards the stores, sync and loop branch)
+
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n",
+
+        j0c29 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n",
+        j0c31 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n",
+        j0c33 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n",
+        j0c35 => "--:-:6:-:1  \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n",
+
+        j5c5  => "02:-:2:-:1  \@P2 $convert loadA0, loadA0;\n",
+        j5c9  => "--:-:3:-:1  \@P2 $convert loadA1, loadA1;\n",
+        j5c13 => "--:-:4:-:1  \@P2 $convert loadA2, loadA2;\n",
+        j5c17 => "--:-:5:-:1  \@P2 $convert loadA3, loadA3;\n",
+
+        j5c29 => "02:-:-:-:1  \@P0 STS [writeS + 4x<0*128>], loadA0;\n",
+        j5c31 => "04:-:-:-:1  \@P0 STS [writeS + 4x<1*128>], loadA1;\n",
+        j5c33 => "08:-:-:-:1  \@P0 STS [writeS + 4x<2*128>], loadA2;\n",
+        j5c35 => "10:-:-:-:1  \@P0 STS [writeS + 4x<3*128>], loadA3;\n",
+
+        j6c5  => "20:-:2:-:1  \@P3 $convert loadB0, loadB0;\n",
+        j6c9  => "--:-:3:-:1  \@P3 $convert loadB1, loadB1;\n",
+        j6c13 => "--:-:4:-:1  \@P3 $convert loadB2, loadB2;\n",
+        j6c17 => "--:-:5:-:1  \@P3 $convert loadB3, loadB3;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS [writeS + 4x< 8*128>], loadB0;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS [writeS + 4x< 9*128>], loadB1;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS [writeS + 4x<10*128>], loadB2;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS [writeS + 4x<11*128>], loadB3;\n",
+
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 2x<8>;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c46 => "10:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, 2x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .  # loop while P0; otherwise branch to REMAINDER when P1 (set by the scalar REMAINDER template)
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+            )
+        ),
+
+        j6c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .  # shared by both variants: sync and flip the three offsets when P0, then k -= 8 unconditionally
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1      IADD32I k, k, -8;\n",
+    );
+    return;  # returns nothing: this block only populates @top and %insert
++]
+
+<INCLUDE file="hgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Pascal/hgemm_nt_16x64.sass b/Kernel/SGEMM/Pascal/hgemm_nt_16x64.sass
new file mode 100644
index 0000000..ce5e6ef
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_nt_16x64.sass
@@ -0,0 +1,1185 @@
+# Kernel: hgemm_nt_16x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Named constants: addr_zero and szShare* feed the STS/LDS addressing below; the rest name c[0x0] constant-bank slots (gridDim* at 0x14/0x18, param_* at 0x140-0x188).
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<(16*64 + 32)*2 + (64*64 + 32)*2>
+    szShareA   : (16*64 + 32)
+    szShareB   : (64*64 + 32)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // cx<0-7>y<0-7>: 64 names spread over registers 0-63 in the interleaved order listed.
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+    // czero<00-63> names registers 0-63 again (same numbers as cx); j0*/j1* take 64-79 and 80-95.
+       0-63 : czero<00-63>
+      64-79 : j0Ay<0-7>, j0Bx<0-7>
+      80-95 : j1Ay<0-7>, j1Bx<0-7>
+    // Setup-only names placed in 64-95, the same numbers as j0*/j1* above; they are used before the first j0 loads.
+      64-95 ~ tidX, tidY, tidY<1-3>, lda, ldb, ldaz, ldbz, ldb16, tid16_8, ta, txa, tb<00|16|32|48>, txb<00|16|32|48>, xmad_ta, xmad_tb, shiftX, predsY0, predsY4, partialK
+    // load*: destinations of the LDG fetches; track*<0-1>: low/high halves of the address each LDG reads from.
+     96-135 :  load0A<0-7>,  load0B<0-7>,  load1B<0-7>,  load2B<0-7>,  load3B<0-7>
+    136-145 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+    // NOTE(review): '~' presumably lets the assembler choose registers within the range, ':' assigns in listed order - confirm.
+    146-152 ~ swapBuf, readAs, readBs, writeAs, writeBs, k
+    153-159 ~ tid, blkA, blkB, blkZ, writeCs, preds, tid16
+    // Names below reuse register numbers 0-152 assigned above; NOTE(review): presumably for the C output stage - confirm.
+       0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3>
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+      96-99 : loadC<0-3>
+    100-103 : b<0-3>
+    104-107 : c<0-3>
+    108-109 : C<0-1>
+    110-152 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15
+
+</REGISTER_MAPPING>
+// Read tid <- TID.X, blkA <- CTAID.Y, blkB <- CTAID.Z, blkZ <- CTAID.X; each S2R signals barrier 1-4, which its first consumer below waits on (01/02/04/08).
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb16, ldb, 4;
+// Write zeros at addr_zero, then read them back to clear czero00-63 (16 x 128-bit LDS).
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// tidX   = tid >> 3
+// tidY   = (tid & 7) << 3
+// shiftX = (tid & 7) << 2
+01:-:-:-:1      SHR.U32 tidX, tid,  3;
+--:-:-:-:1      LOP.AND tidY, tid,  7;
+--:-:-:-:1      SHL     shiftX, tidY, 2;
+--:-:-:-:1      SHL     tidY,   tidY, 3;
+
+// track0A = param_A + ((blkA*16 + tidX) * lda + tidY + blkZ*ldaz) * 2
+02:-:-:-:1      ISCADD   txa, blkA, tidX, 4;
+--:-:-:-:1      XMAD.LO  ta, lda,  txa, tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ, ta;
+--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 1;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa, param_m, PT;
+
+// track<i>B = param_B + ((blkB*64 + tidX + 16*i) * ldb + tidY + blkZ*ldbz) * 2, for i = 0..3
+04:-:-:-:1      ISCADD   txb00, blkB, tidX, 6;
+--:-:-:-:1      IADD     txb16, txb00, 16;
+--:-:-:-:1      IADD     txb32, txb00, 32;
+--:-:-:-:1      IADD     txb48, txb00, 48;
+--:-:-:-:1      XMAD.LO  tb00, ldb,  txb00, tidY, xmad_tb;
+08:-:-:-:1      XMAD.LO2 tb00, ldbz, blkZ, tb00;
+--:-:-:-:1      IADD     tb16, tb00, ldb16;
+--:-:-:-:1      IADD     tb32, tb16, ldb16;
+--:-:-:-:1      IADD     tb48, tb32, ldb16;
+--:-:-:-:1      LEA      track0B0.CC, tb00, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb00, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track1B0.CC, tb16, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb16, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track2B0.CC, tb32, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track2B1,    tb32, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track3B0.CC, tb48, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track3B1,    tb48, param_B[1], RZ, 1;
+
+// P2 = txa < m (above); P3-P6 = txb00/16/32/48 < n. P2R then saves P2-P6 (mask 0x7c) in preds.
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb16, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb32, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb48, param_n, PT;
+
+--:-:-:-:1      P2R preds, PR, RZ, 0x7c;
+
+// writeAs = (tidY*16 + tidX + shiftX) * 4
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 4;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftX;
+--:-:-:-:1      SHL    writeAs, writeAs, 2;
+
+// writeBs = (tidY*64 + tidX + shiftX) * 4 + 4x<szShareA>
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 6;
+--:-:-:-:1      IADD   writeBs, writeBs, shiftX;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA>, 2;
+
+// readAs = (tid & 1) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((tid >> 1) & 7) << 4
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHL     readBs, readBs, 4;
+
+// tid16 = tid & -16
+// tid16_8 = tid16 / 2 * 4
+--:-:-:-:1      LOP.AND tid16, tid, -16;
+--:-:-:-:1      SHL     tid16_8, tid16, 1;
+
+// writeCs = (readAs + tid16*2) * 64 + readBs;
+--:-:-:-:1      ISCADD writeCs, tid16,   readAs, 1;
+--:-:-:-:1      ISCADD writeCs, writeCs, readBs, 6;
+
+// Each block of 16 threads works on 8 lines, shifted over by 4
+// readAs += tid16_8 * 16 + tid16
+// readBs += tid16_8 * 64 + tid16 + 4x<szShareA>
+--:-:-:-:1      ISCADD readAs, tid16_8, readAs, 4;
+--:-:-:-:1      ISCADD readBs, tid16_8, readBs, 6;
+--:-:-:-:1      IADD   readAs, tid16, readAs;
+--:-:-:-:1      IADD3  readBs, tid16, 4x<szShareA>, readBs;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareA + szShareB>;
+
+// If k is not a multiple of 64 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 64 then make a full 64 line fetch.
+--:-:-:-:1      LOP.AND.Z P0, partialK, k, 63;
+--:-:-:-:1  @P0 MOV partialK, 64;
+--:-:-:-:1      IADD k, k, -partialK;
+[+
+    our $vec;  # set outside this block; true: one 128-bit LDG per tile gated by tidY < partialK, false: eight U16 LDGs per tile with per-element bounds (predsY0/predsY4); skipped elements are zero-filled
+    return $vec ? q{
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x7c;
+--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c;
+
+<ORDERED>
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.128 load0B, [track0B];
+--:-:4:-:1  @P4 LDG.E.CI.128 load1B, [track1B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load2B, [track2B];
+--:-:6:-:1  @P6 LDG.E.CI.128 load3B, [track3B];
+</ORDERED>
+
+<ORDERED>
+--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.128 load0B, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128 load1B, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.128 load2B, [addr_zero];
+--:-:1:-:1 @!P6 LDS.U.128 load3B, [addr_zero];
+</ORDERED>
+
+    } : q{
+--:-:-:-:1      IADD tidY1, tidY, 1;
+--:-:-:-:1      IADD tidY2, tidY, 2;
+--:-:-:-:1      IADD tidY3, tidY, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, partialK, PT;
+--:-:-:-:1      P2R predsY0, PR, RZ, 0x0f;
+
+--:-:-:-:1      IADD tidY,  tidY,  4;
+--:-:-:-:1      IADD tidY1, tidY1, 4;
+--:-:-:-:1      IADD tidY2, tidY2, 4;
+--:-:-:-:1      IADD tidY3, tidY3, 4;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, partialK, PT;
+--:-:-:-:1      P2R predsY4, PR, RZ, 0x0f;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa, param_m, PT;
+--:-:-:-:1  @P4 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1  @P4 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A4, RZ;
+--:-:-:-:1 @!P1 MOV load0A5, RZ;
+--:-:-:-:1 @!P2 MOV load0A6, RZ;
+--:-:-:-:1 @!P3 MOV load0A7, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb00, param_n, PT;
+--:-:-:-:1  @P5 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1  @P5 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B4, RZ;
+--:-:-:-:1 @!P1 MOV load0B5, RZ;
+--:-:-:-:1 @!P2 MOV load0B6, RZ;
+--:-:-:-:1 @!P3 MOV load0B7, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb16, param_n, PT;
+--:-:-:-:1  @P6 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1  @P6 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B4, RZ;
+--:-:-:-:1 @!P1 MOV load1B5, RZ;
+--:-:-:-:1 @!P2 MOV load1B6, RZ;
+--:-:-:-:1 @!P3 MOV load1B7, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb32, param_n, PT;
+--:-:-:-:1  @P4 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B0, RZ;
+--:-:-:-:1 @!P1 MOV load2B1, RZ;
+--:-:-:-:1 @!P2 MOV load2B2, RZ;
+--:-:-:-:1 @!P3 MOV load2B3, RZ;
+
+--:-:-:-:1  @P4 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load2B4, [track2B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load2B5, [track2B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load2B6, [track2B + 2x<6>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 load2B7, [track2B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B4, RZ;
+--:-:-:-:1 @!P1 MOV load2B5, RZ;
+--:-:-:-:1 @!P2 MOV load2B6, RZ;
+--:-:-:-:1 @!P3 MOV load2B7, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb48, param_n, PT;
+--:-:-:-:1  @P6 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:6:-:1  @P3 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B0, RZ;
+--:-:-:-:1 @!P1 MOV load3B1, RZ;
+--:-:-:-:1 @!P2 MOV load3B2, RZ;
+--:-:-:-:1 @!P3 MOV load3B3, RZ;
+
+--:-:-:-:1  @P6 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load3B4, [track3B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load3B5, [track3B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load3B6, [track3B + 2x<6>];
+--:-:6:-:1  @P3 LDG.E.CI.U16 load3B7, [track3B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B4, RZ;
+--:-:-:-:1 @!P1 MOV load3B5, RZ;
+--:-:-:-:1 @!P2 MOV load3B6, RZ;
+--:-:-:-:1 @!P3 MOV load3B7, RZ;
+    };
++]
+--:-:-:-:1      SHL partialK, partialK, 1; // partialK *= 2; it is added to the track pointers after this block
+// If k >= 64 restore P2-P6 from preds (mask 0x7c), otherwise clear them; k is decremented by 64 either way.
+--:-:-:-:1      ISETP.GE.AND P0, PT, k, 64, PT;
+--:-:-:-:1      IADD k, k, -64;
+--:-:-:-:1  @P0 R2P PR, preds, 0x7c;
+--:-:-:-:1 @!P0 R2P PR, RZ, 0x7c;
+</SCHEDULE_BLOCK>
+// Convert the first A fetch from F16 to F32, store it at writeAs, and advance track0A by partialK (carry into track0A1).
+[+
+    our $vec;  # true: unpack .H0/.H1 halves of load0A0-3 into load0A0-7, highest first so each source is read before it is overwritten; false: convert each register in place
+    return $vec ? q{
+03:-:-:-:1      F2F.F32.F16 load0A7, load0A3.H1;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A3.H0;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A2.H1;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A2.H0;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0.H0;
+    } : q{
+02:-:-:-:1      F2F.F32.F16 load0A7, load0A7;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A6;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A5;
+--:-:1:-:1      F2F.F32.F16 load0A4, load0A4;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A3;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0;
+    };
++]
+--:-:-:-:0      IADD   track0A0.CC, track0A0, partialK;
+01:-:-:-:1      STS [writeAs + 4x<7*16>], load0A7;
+--:-:-:-:1      STS [writeAs + 4x<6*16>], load0A6;
+--:-:-:-:1      STS [writeAs + 4x<5*16>], load0A5;
+--:-:-:-:1      STS [writeAs + 4x<4*16>], load0A4;
+02:-:-:-:1      STS [writeAs + 4x<3*16>], load0A3;
+--:-:-:-:1      STS [writeAs + 4x<2*16>], load0A2;
+--:-:-:-:1      STS [writeAs + 4x<1*16>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<0*16>], load0A0;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+// Convert the first load0B fetch from F16 to F32, store it at writeBs (+0*16), and advance track0B by partialK (carry into track0B1).
+[+
+    our $vec;  # true: unpack .H0/.H1 halves of load0B0-3 into load0B0-7, highest first so each source is read before it is overwritten; false: convert each register in place
+    return $vec ? q{
+04:-:-:-:1      F2F.F32.F16 load0B7, load0B3.H1;
+--:-:-:-:1      F2F.F32.F16 load0B6, load0B3.H0;
+--:-:-:-:1      F2F.F32.F16 load0B5, load0B2.H1;
+--:-:1:-:1      F2F.F32.F16 load0B4, load0B2.H0;
+--:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:2:-:1      F2F.F32.F16 load0B0, load0B0.H0;
+    } : q{
+04:-:-:-:1      F2F.F32.F16 load0B7, load0B7;
+--:-:-:-:1      F2F.F32.F16 load0B6, load0B6;
+--:-:-:-:1      F2F.F32.F16 load0B5, load0B5;
+--:-:1:-:1      F2F.F32.F16 load0B4, load0B4;
+--:-:-:-:1      F2F.F32.F16 load0B3, load0B3;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:2:-:1      F2F.F32.F16 load0B0, load0B0;
+    };
++]
+--:-:-:-:0      IADD   track0B0.CC, track0B0, partialK;
+01:-:-:-:1      STS [writeBs + 4x<7*64 + 0*16>], load0B7;
+--:-:-:-:1      STS [writeBs + 4x<6*64 + 0*16>], load0B6;
+--:-:-:-:1      STS [writeBs + 4x<5*64 + 0*16>], load0B5;
+--:-:-:-:1      STS [writeBs + 4x<4*64 + 0*16>], load0B4;
+02:-:-:-:1      STS [writeBs + 4x<3*64 + 0*16>], load0B3;
+--:-:-:-:1      STS [writeBs + 4x<2*64 + 0*16>], load0B2;
+--:-:-:-:1      STS [writeBs + 4x<1*64 + 0*16>], load0B1;
+--:-:-:-:1      STS [writeBs + 4x<0*64 + 0*16>], load0B0;
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ;
+// Convert the first load1B fetch from F16 to F32, store it at writeBs (+1*16), and advance track1B by partialK (carry into track1B1).
+[+
+    our $vec;  # true: unpack .H0/.H1 halves of load1B0-3 into load1B0-7, highest first so each source is read before it is overwritten; false: convert each register in place
+    return $vec ? q{
+08:-:-:-:1      F2F.F32.F16 load1B7, load1B3.H1;
+--:-:-:-:1      F2F.F32.F16 load1B6, load1B3.H0;
+--:-:-:-:1      F2F.F32.F16 load1B5, load1B2.H1;
+--:-:1:-:1      F2F.F32.F16 load1B4, load1B2.H0;
+--:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+    } : q{
+08:-:-:-:1      F2F.F32.F16 load1B7, load1B7;
+--:-:-:-:1      F2F.F32.F16 load1B6, load1B6;
+--:-:-:-:1      F2F.F32.F16 load1B5, load1B5;
+--:-:1:-:1      F2F.F32.F16 load1B4, load1B4;
+--:-:-:-:1      F2F.F32.F16 load1B3, load1B3;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0;
+    };
++]
+--:-:-:-:0      IADD   track1B0.CC, track1B0, partialK;
+01:-:-:-:1      STS [writeBs + 4x<7*64 + 1*16>], load1B7;
+--:-:-:-:1      STS [writeBs + 4x<6*64 + 1*16>], load1B6;
+--:-:-:-:1      STS [writeBs + 4x<5*64 + 1*16>], load1B5;
+--:-:-:-:1      STS [writeBs + 4x<4*64 + 1*16>], load1B4;
+02:-:-:-:1      STS [writeBs + 4x<3*64 + 1*16>], load1B3;
+--:-:-:-:1      STS [writeBs + 4x<2*64 + 1*16>], load1B2;
+--:-:-:-:1      STS [writeBs + 4x<1*64 + 1*16>], load1B1;
+--:-:-:-:1      STS [writeBs + 4x<0*64 + 1*16>], load1B0;
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;
+// Convert the first load2B fetch from F16 to F32, store it at writeBs (+2*16), and advance track2B by partialK (carry into track2B1).
+[+
+    our $vec;  # true: unpack .H0/.H1 halves of load2B0-3 into load2B0-7, highest first so each source is read before it is overwritten; false: convert each register in place
+    return $vec ? q{
+10:-:-:-:1      F2F.F32.F16 load2B7, load2B3.H1;
+--:-:-:-:1      F2F.F32.F16 load2B6, load2B3.H0;
+--:-:-:-:1      F2F.F32.F16 load2B5, load2B2.H1;
+--:-:1:-:1      F2F.F32.F16 load2B4, load2B2.H0;
+--:-:-:-:1      F2F.F32.F16 load2B3, load2B1.H1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B1.H0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B0.H1;
+--:-:2:-:1      F2F.F32.F16 load2B0, load2B0.H0;
+    } : q{
+10:-:-:-:1      F2F.F32.F16 load2B7, load2B7;
+--:-:-:-:1      F2F.F32.F16 load2B6, load2B6;
+--:-:-:-:1      F2F.F32.F16 load2B5, load2B5;
+--:-:1:-:1      F2F.F32.F16 load2B4, load2B4;
+--:-:-:-:1      F2F.F32.F16 load2B3, load2B3;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B2;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B1;
+--:-:2:-:1      F2F.F32.F16 load2B0, load2B0;
+    };
++]
+--:-:-:-:0      IADD   track2B0.CC, track2B0, partialK;
+01:-:-:-:1      STS [writeBs + 4x<7*64 + 2*16>], load2B7;
+--:-:-:-:1      STS [writeBs + 4x<6*64 + 2*16>], load2B6;
+--:-:-:-:1      STS [writeBs + 4x<5*64 + 2*16>], load2B5;
+--:-:-:-:1      STS [writeBs + 4x<4*64 + 2*16>], load2B4;
+02:-:-:-:1      STS [writeBs + 4x<3*64 + 2*16>], load2B3;
+--:-:-:-:1      STS [writeBs + 4x<2*64 + 2*16>], load2B2;
+--:-:-:-:1      STS [writeBs + 4x<1*64 + 2*16>], load2B1;
+--:-:-:-:1      STS [writeBs + 4x<0*64 + 2*16>], load2B0;
+--:-:-:-:0      IADD.X track2B1,    track2B1, RZ;
+// Convert the first load3B fetch from F16 to F32, store it at writeBs (+3*16), and advance track3B by partialK (carry into track3B1).
+[+
+    our $vec;  # true: unpack .H0/.H1 halves of load3B0-3 into load3B0-7, highest first so each source is read before it is overwritten; false: convert each register in place
+    return $vec ? q{
+20:-:-:-:1      F2F.F32.F16 load3B7, load3B3.H1;
+--:-:-:-:1      F2F.F32.F16 load3B6, load3B3.H0;
+--:-:-:-:1      F2F.F32.F16 load3B5, load3B2.H1;
+--:-:1:-:1      F2F.F32.F16 load3B4, load3B2.H0;
+--:-:-:-:1      F2F.F32.F16 load3B3, load3B1.H1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B1.H0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B0.H1;
+--:-:2:-:1      F2F.F32.F16 load3B0, load3B0.H0;
+    } : q{
+20:-:-:-:1      F2F.F32.F16 load3B7, load3B7;
+--:-:-:-:1      F2F.F32.F16 load3B6, load3B6;
+--:-:-:-:1      F2F.F32.F16 load3B5, load3B5;
+--:-:1:-:1      F2F.F32.F16 load3B4, load3B4;
+--:-:-:-:1      F2F.F32.F16 load3B3, load3B3;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B2;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B1;
+--:-:2:-:1      F2F.F32.F16 load3B0, load3B0;
+    };
++]
+--:-:-:-:0      IADD   track3B0.CC, track3B0, partialK;
+01:-:-:-:1      STS [writeBs + 4x<7*64 + 3*16>], load3B7;
+--:-:-:-:1      STS [writeBs + 4x<6*64 + 3*16>], load3B6;
+--:-:-:-:1      STS [writeBs + 4x<5*64 + 3*16>], load3B5;
+--:-:-:-:1      STS [writeBs + 4x<4*64 + 3*16>], load3B4;
+02:-:-:-:1      STS [writeBs + 4x<3*64 + 3*16>], load3B3;
+--:-:-:-:1      STS [writeBs + 4x<2*64 + 3*16>], load3B2;
+--:-:-:-:1      STS [writeBs + 4x<1*64 + 3*16>], load3B1;
+--:-:-:-:1      STS [writeBs + 4x<0*64 + 3*16>], load3B0;
+--:-:-:-:0      IADD.X track3B1,    track3B1, RZ;
+// After the barrier, advance both write pointers by swapBuf and negate swapBuf; then issue the first j0 loads from readAs/readBs.
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*16 + 00>];
+--:-:-:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*16 + 08>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>];
+// Issue the next global loads under P2-P6, which the end of the SCHEDULE_BLOCK restored from preds only when k >= 64.
+[+
+    our $vec;  # true: one 128-bit LDG per tile; false: eight U16 LDGs per tile, all under the tile's single predicate
+    return $vec ? q{
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.128 load0B, [track0B];
+--:-:4:-:1  @P4 LDG.E.CI.128 load1B, [track1B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load2B, [track2B];
+--:-:6:-:1  @P6 LDG.E.CI.128 load3B, [track3B];
+    } : q{
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];
+
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];
+--:-:4:-:1  @P4 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];
+
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B4, [track2B + 2x<4>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B5, [track2B + 2x<5>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load2B6, [track2B + 2x<6>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2B7, [track2B + 2x<7>];
+
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B4, [track3B + 2x<4>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B5, [track3B + 2x<5>];
+--:-:-:-:1  @P6 LDG.E.CI.U16 load3B6, [track3B + 2x<6>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 load3B7, [track3B + 2x<7>];
+    };
++]
+
+LOOP:
+
+[+
+    our $vec;
+    our %insert =
+    (
+        j0c8   => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, RZ, PT;\n",
+        j0c10  => "--:-:-:-:1      ISETP.GE.AND P1, PT, k, 64, PT;\n" .
+                  "--:-:-:-:1      IADD k, k, -64;\n",
+
+        j0c23  => "--:-:-:-:1  \@P1 R2P PR, preds, 0x7c;\n",
+        j0c24  => "--:-:-:-:1 \@!P1 R2P PR, RZ,    0x7c;\n",
+
+        j2c32  => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 2x<64>;\n",
+        j2c37  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j3c32  => "--:-:-:-:1  \@P3 IADD   track0B0.CC, track0B0, 2x<64>;\n",
+        j3c37  => "--:-:-:-:1  \@P3 IADD.X track0B1,    track0B1, RZ;\n",
+        j4c32  => "--:-:-:-:1  \@P4 IADD   track1B0.CC, track1B0, 2x<64>;\n",
+        j4c37  => "--:-:-:-:1  \@P4 IADD.X track1B1,    track1B1, RZ;\n",
+        j5c32  => "--:-:-:-:1  \@P5 IADD   track2B0.CC, track2B0, 2x<64>;\n",
+        j5c37  => "--:-:-:-:1  \@P5 IADD.X track2B1,    track2B1, RZ;\n",
+        j6c32  => "--:-:-:-:1  \@P6 IADD   track3B0.CC, track3B0, 2x<64>;\n",
+        j6c37  => "--:-:-:-:1  \@P6 IADD.X track3B1,    track3B1, RZ;\n",
+
+        j6c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j1c63 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j2c36 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B7, load0B3.H1;\n",
+                j2c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B6, load0B3.H0;\n",
+                j2c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B5, load0B2.H1;\n",
+                j2c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B4, load0B2.H0;\n",
+                j2c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n",
+
+                j3c36 => "08:-:-:-:1  \@P0 F2F.F32.F16 load1B7, load1B3.H1;\n",
+                j3c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B6, load1B3.H0;\n",
+                j3c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B5, load1B2.H1;\n",
+                j3c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B4, load1B2.H0;\n",
+                j3c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j4c36 => "10:-:-:-:1  \@P0 F2F.F32.F16 load2B7, load2B3.H1;\n",
+                j4c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B6, load2B3.H0;\n",
+                j4c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B5, load2B2.H1;\n",
+                j4c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B4, load2B2.H0;\n",
+                j4c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B3, load2B1.H1;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B0.H1;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n",
+
+                j5c36 => "20:-:-:-:1  \@P0 F2F.F32.F16 load3B7, load3B3.H1;\n",
+                j5c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B6, load3B3.H0;\n",
+                j5c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B5, load3B2.H1;\n",
+                j5c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B4, load3B2.H0;\n",
+                j5c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n",
+
+                j2c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<7*16>], load0A7;\n",
+                j2c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*16>], load0A6;\n",
+                j2c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*16>], load0A5;\n",
+                j2c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*16>], load0A4;\n",
+                j2c24  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*16>], load0A3;\n",
+                j2c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*16>], load0A2;\n",
+                j2c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*16>], load0A1;\n",
+                j2c30  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<0*16>], load0A0;\n",
+
+                j3c16  => "04:-:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 0*16>], load0B7;\n",
+                j3c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 0*16>], load0B6;\n",
+                j3c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 0*16>], load0B5;\n",
+                j3c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 0*16>], load0B4;\n",
+                j3c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 0*16>], load0B3;\n",
+                j3c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 0*16>], load0B2;\n",
+                j3c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 0*16>], load0B1;\n",
+                j3c30  => "--:3:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 0*16>], load0B0;\n",
+
+                j4c16  => "08:-:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 1*16>], load1B7;\n",
+                j4c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 1*16>], load1B6;\n",
+                j4c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 1*16>], load1B5;\n",
+                j4c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 1*16>], load1B4;\n",
+                j4c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 1*16>], load1B3;\n",
+                j4c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 1*16>], load1B2;\n",
+                j4c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 1*16>], load1B1;\n",
+                j4c30  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 1*16>], load1B0;\n",
+
+                j5c16  => "10:-:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 2*16>], load2B7;\n",
+                j5c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 2*16>], load2B6;\n",
+                j5c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 2*16>], load2B5;\n",
+                j5c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 2*16>], load2B4;\n",
+                j5c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 2*16>], load2B3;\n",
+                j5c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 2*16>], load2B2;\n",
+                j5c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 2*16>], load2B1;\n",
+                j5c30  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 2*16>], load2B0;\n",
+
+                j6c16  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 3*16>], load3B7;\n",
+                j6c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 3*16>], load3B6;\n",
+                j6c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 3*16>], load3B5;\n",
+                j6c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 3*16>], load3B4;\n",
+                j6c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 3*16>], load3B3;\n",
+                j6c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 3*16>], load3B2;\n",
+                j6c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 3*16>], load3B1;\n",
+                j6c30  => "--:6:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 3*16>], load3B0;\n",
+
+                j2c61 => "02:-:2:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j3c61 => "04:-:3:-:1  \@P3 LDG.E.CI.128 load0B, [track0B];\n",
+                j4c61 => "08:-:4:-:1  \@P4 LDG.E.CI.128 load1B, [track1B];\n",
+                j5c61 => "10:-:5:-:1  \@P5 LDG.E.CI.128 load2B, [track2B];\n",
+                j6c61 => "20:-:6:-:1  \@P6 LDG.E.CI.128 load3B, [track3B];\n",
+            ) :
+            (
+                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A0, load0A0;\n",
+                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A1;\n",
+                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A2;\n",
+                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A3;\n",
+                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A4;\n",
+                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A5;\n",
+                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A6;\n",
+                j1c63 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A7, load0A7;\n",
+
+                j2c36 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B0, load0B0;\n",
+                j2c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B1;\n",
+                j2c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B2;\n",
+                j2c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B3;\n",
+                j2c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B4, load0B4;\n",
+                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B5, load0B5;\n",
+                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B6, load0B6;\n",
+                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load0B7, load0B7;\n",
+
+                j3c36 => "08:-:-:-:1  \@P0 F2F.F32.F16 load1B0, load1B0;\n",
+                j3c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B1;\n",
+                j3c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B2;\n",
+                j3c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B3, load1B3;\n",
+                j3c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B4, load1B4;\n",
+                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B5, load1B5;\n",
+                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B6, load1B6;\n",
+                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load1B7, load1B7;\n",
+
+                j4c36 => "10:-:-:-:1  \@P0 F2F.F32.F16 load2B0, load2B0;\n",
+                j4c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B1;\n",
+                j4c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B2;\n",
+                j4c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B3, load2B3;\n",
+                j4c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B4, load2B4;\n",
+                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B5, load2B5;\n",
+                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B6, load2B6;\n",
+                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load2B7, load2B7;\n",
+
+                j5c36 => "20:-:-:-:1  \@P0 F2F.F32.F16 load3B0, load3B0;\n",
+                j5c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B1;\n",
+                j5c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B2;\n",
+                j5c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B3, load3B3;\n",
+                j5c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B4, load3B4;\n",
+                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B5, load3B5;\n",
+                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B6, load3B6;\n",
+                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load3B7, load3B7;\n",
+
+                j2c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<0*16>], load0A0;\n",
+                j2c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*16>], load0A1;\n",
+                j2c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*16>], load0A2;\n",
+                j2c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*16>], load0A3;\n",
+                j2c24  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*16>], load0A4;\n",
+                j2c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*16>], load0A5;\n",
+                j2c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*16>], load0A6;\n",
+                j2c30  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<7*16>], load0A7;\n",
+
+                j3c16  => "04:-:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 0*16>], load0B0;\n",
+                j3c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 0*16>], load0B1;\n",
+                j3c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 0*16>], load0B2;\n",
+                j3c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 0*16>], load0B3;\n",
+                j3c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 0*16>], load0B4;\n",
+                j3c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 0*16>], load0B5;\n",
+                j3c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 0*16>], load0B6;\n",
+                j3c30  => "--:3:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 0*16>], load0B7;\n",
+
+                j4c16  => "08:-:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 1*16>], load1B0;\n",
+                j4c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 1*16>], load1B1;\n",
+                j4c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 1*16>], load1B2;\n",
+                j4c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 1*16>], load1B3;\n",
+                j4c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 1*16>], load1B4;\n",
+                j4c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 1*16>], load1B5;\n",
+                j4c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 1*16>], load1B6;\n",
+                j4c30  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 1*16>], load1B7;\n",
+
+                j5c16  => "10:-:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 2*16>], load2B0;\n",
+                j5c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 2*16>], load2B1;\n",
+                j5c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 2*16>], load2B2;\n",
+                j5c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 2*16>], load2B3;\n",
+                j5c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 2*16>], load2B4;\n",
+                j5c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 2*16>], load2B5;\n",
+                j5c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 2*16>], load2B6;\n",
+                j5c30  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 2*16>], load2B7;\n",
+
+                j6c16  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<0*64 + 3*16>], load3B0;\n",
+                j6c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*64 + 3*16>], load3B1;\n",
+                j6c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*64 + 3*16>], load3B2;\n",
+                j6c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*64 + 3*16>], load3B3;\n",
+                j6c24  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*64 + 3*16>], load3B4;\n",
+                j6c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*64 + 3*16>], load3B5;\n",
+                j6c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*64 + 3*16>], load3B6;\n",
+                j6c30  => "--:6:-:-:1  \@P0 STS [writeBs + 4x<7*64 + 3*16>], load3B7;\n",
+
+                j2c48 => "02:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j2c50 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j2c52 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j2c54 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+                j2c56 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n",
+                j2c58 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n",
+                j2c60 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n",
+                j2c62 => "--:-:2:-:1  \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n",
+
+                j3c48 => "04:-:-:-:1  \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
+                j3c50 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j3c52 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j3c54 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+                j3c56 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];\n",
+                j3c58 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];\n",
+                j3c60 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];\n",
+                j3c62 => "--:-:3:-:1  \@P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];\n",
+
+                j4c48 => "08:-:-:-:1  \@P4 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j4c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j4c52 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j4c54 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+                j4c56 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];\n",
+                j4c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];\n",
+                j4c60 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];\n",
+                j4c62 => "--:-:4:-:1  \@P4 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];\n",
+
+                j5c48 => "10:-:-:-:1  \@P5 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n",
+                j5c50 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n",
+                j5c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n",
+                j5c54 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n",
+                j5c56 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B4, [track2B + 2x<4>];\n",
+                j5c58 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B5, [track2B + 2x<5>];\n",
+                j5c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B6, [track2B + 2x<6>];\n",
+                j5c62 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load2B7, [track2B + 2x<7>];\n",
+
+                j6c48 => "20:-:-:-:1  \@P6 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n",
+                j6c50 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n",
+                j6c52 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n",
+                j6c54 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n",
+                j6c56 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B4, [track3B + 2x<4>];\n",
+                j6c58 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B5, [track3B + 2x<5>];\n",
+                j6c60 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 load3B6, [track3B + 2x<6>];\n",
+                j6c62 => "--:-:6:-:1  \@P6 LDG.E.CI.U16 load3B7, [track3B + 2x<7>];\n",
+            )
+        ),
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+    );
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    my $out = '';
+    foreach my $j (0 .. 7)
+    {
+        my $odd      = $j & 1;
+        my $nOdd     = !$odd + 0;
+        my $rsOffset = ($j + 1) % 8;
+        my $rsPred   = $j == 7 ? '@P0' : '   ';
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*16 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*16 + 08>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+
+            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-';
+
+            my $wait   = $c == 0 ? '01' : '--';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
++]
+
+// Epilogue setup: loads alpha/beta/flags, derives the shared-memory read
+// offset (readCs), the output coordinates (cx..cx3, cy) and the output
+// pointer (C0/C1), then builds the predicates consumed by STORE_C.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4
+--:-:-:-:1      LOP.AND tid15, tid, 15;
+--:-:-:-:1      SHR.U32 tid16, tid, 4;
+--:-:-:-:1      SHL     tid15, tid15, 2;
+--:-:-:-:1      ISCADD readCs, tid16, tid15, 6;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*64 + tid15;   (tid15 was shifted above, so it holds (tid & 15) * 4)
+// cx1..cx3 = cx + 1..3
+--:-:-:-:1      ISCADD cx, blkB, tid15, 6;
+--:-:-:-:1      IADD   cx1, cx, 1;
+--:-:-:-:1      IADD   cx2, cx, 2;
+--:-:-:-:1      IADD   cx3, cx, 3;
+
+// cy = blkA*16 + tid16
+--:-:-:-:1      ISCADD cy, blkA, tid16, 4;
+
+// C += (cy*ldc + cx) * 2;
+// ci also receives ldcz*blkZ (XMAD.LO2) before it is scaled into C0/C1.
+// ldc8 = ldc << 4; STORE_C adds it to C0 each time it advances cy by 8.
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      SHL  ldc8, ldc, 4;
+
+--:-:-:-:1      XMAD.LO  ci, cy, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C1,    ci, param_C[1], RZ, 1;
+
+// P0 = cx < n   (P1..P3 test cx1..cx3 the same way)
+// P2R then copies P0..P3 (mask 0x0f) into preds so they can be restored later.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cx,  param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cx1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cx2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cx3, param_n, PT;
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+
+// P4 = cy < m
+--:-:-:-:1      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+// P5 = beta != 0 && P4
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P4;
+
+// P6 = Apply relu   (set when bit value 2 of flags is non-zero)
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+
+// Init beta preds: with P5 set, P0..P3 are reloaded from preds;
+// otherwise all four are cleared (loaded from RZ).
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+</SCHEDULE_BLOCK>
+
+// Output stage: each accumulator cx<0-7>y<0-7> is multiplied by alpha into the
+// matching shuffle_* register and staged with STS.128 at writeCs.
+// Rows y0-y3 are staged first and STORE_C is called; rows y4-y7 follow with a
+// second STORE_C call, after which the kernel exits.
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0;
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:0      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:0      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3;
+// All eight STS.128 for rows y0-y3 are issued; synchronise before STORE_C reads them back.
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_C;
+
+// Second half: rows y4-y7, staged at the same writeCs offsets as the first half.
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha;
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:0      FMUL shuffle_x6y4, cx6y4, alpha;
+// Barrier ahead of the first STS of this half; presumably it keeps the staging
+// area intact until every thread has finished the STORE_C reads - confirm.
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:0      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_C;
+
+--:-:-:-:5      EXIT;
+
+// STORE_C subroutine (entered with CAL, left with RET).
+// Reads eight partial results per thread from shared memory at readCs, sums
+// them, optionally adds beta * (existing C), optionally applies relu,
+// converts the four results to F16 and stores them at C.
+// Before returning it advances cy by 8 and C0/C1 by ldc8 and refreshes the
+// P4/P5 row predicates and the P0..P3 "beta" predicates.
+STORE_C:
+
+[+
+    # Load the existing C values under the beta preds P0..P3.
+    # $vec selects one 64-bit load; otherwise four 16-bit loads are emitted,
+    # each paired with a MOV of RZ for the lane whose predicate is off.
+    our $vec;
+    return $vec ? q{
+--:-:1:-:1  @P0 LDG.E.64 loadC, [C];
+    } : q{
+--:-:-:-:0 @!P0 MOV loadC0, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>];
+--:-:-:-:0 @!P1 MOV loadC1, RZ;
+--:-:-:-:1  @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>];
+--:-:-:-:0 @!P2 MOV loadC2, RZ;
+--:-:-:-:1  @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>];
+--:-:-:-:0 @!P3 MOV loadC3, RZ;
+--:-:1:-:1  @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>];
+    };
++]
+
+// Restore output preds: P0..P3 come back from preds when P4 is set,
+// otherwise all four are cleared.
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,    0x0f;
+
+// Fetch the eight partial results; consecutive parts are 8*64 words apart.
+--:-:-:-:1      LDS.U.128 part0C, [readCs + 4x<0*8*64>];
+--:-:2:-:1      LDS.U.128 part1C, [readCs + 4x<1*8*64>];
+--:-:-:-:1      LDS.U.128 part2C, [readCs + 4x<2*8*64>];
+--:-:3:-:1      LDS.U.128 part3C, [readCs + 4x<3*8*64>];
+--:-:-:-:1      LDS.U.128 part4C, [readCs + 4x<4*8*64>];
+--:-:4:-:1      LDS.U.128 part5C, [readCs + 4x<5*8*64>];
+--:-:-:-:1      LDS.U.128 part6C, [readCs + 4x<6*8*64>];
+--:-:5:-:1      LDS.U.128 part7C, [readCs + 4x<7*8*64>];
+
+// Pairwise reduction of the eight parts, per output lane 0..3:
+//   (0+1) (2+3) (4+5) (6+7)  ->  (0+2) (4+6)  ->  c = 0 + 4
+<SCHEDULE_BLOCK>
+02:-:-:-:1  @P0 FADD part0C0, part0C0, part1C0;
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part1C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part1C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part1C3;
+
+04:-:-:-:1  @P0 FADD part2C0, part2C0, part3C0;
+--:-:-:-:1  @P1 FADD part2C1, part2C1, part3C1;
+--:-:-:-:1  @P2 FADD part2C2, part2C2, part3C2;
+--:-:-:-:1  @P3 FADD part2C3, part2C3, part3C3;
+
+08:-:-:-:1  @P0 FADD part4C0, part4C0, part5C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part5C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part5C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part5C3;
+
+10:-:-:-:1  @P0 FADD part6C0, part6C0, part7C0;
+--:-:-:-:1  @P1 FADD part6C1, part6C1, part7C1;
+--:-:-:-:1  @P2 FADD part6C2, part6C2, part7C2;
+--:-:-:-:1  @P3 FADD part6C3, part6C3, part7C3;
+
+--:-:-:-:1  @P0 FADD part0C0, part0C0, part2C0;
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part2C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part2C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part2C3;
+
+--:-:-:-:1  @P0 FADD part4C0, part4C0, part6C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part6C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part6C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part6C3;
+
+--:-:-:-:1  @P0 FADD c0, part0C0, part4C0;
+--:-:-:-:1  @P1 FADD c1, part0C1, part4C1;
+--:-:-:-:1  @P2 FADD c2, part0C2, part4C2;
+--:-:-:-:1  @P3 FADD c3, part0C3, part4C3;
+</SCHEDULE_BLOCK>
+
+// Advance the row index for the next call; P4/P5 are re-evaluated against it below.
+--:-:-:-:0      IADD cy, cy, 8;
+
+[+
+    # Widen the loaded C values to F32 (only when P5, i.e. beta is in use).
+    # With $vec the four halves are taken from the two registers of the
+    # 64-bit load (.H0/.H1); otherwise each value has its own register.
+    our $vec;
+    return $vec ? q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0.H0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC0.H1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC1.H0;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC1.H1;
+    } : q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC2;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC3;
+    };
++]
+
+// c += b * beta
+01:-:-:-:1  @P5 FFMA c0, b0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, b1, beta, c1;
+04:-:-:-:1  @P5 FFMA c2, b2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, b3, beta, c3;
+
+// relu (P6): FMNMX of each result against RZ
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+
+// P5 = (cy < m) && P5, using the already advanced cy
+--:-:-:-:0      ISETP.LT.AND P5, PT, cy, param_m, P5;
+
+// Narrow the results to F16 for the store.
+--:-:1:-:1  @P0 F2F.F16.F32 c0, c0;
+--:-:2:-:1  @P1 F2F.F16.F32 c1, c1;
+
+// P4 = cy < m, using the already advanced cy
+--:-:-:-:0      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+--:-:3:-:1  @P2 F2F.F16.F32 c2, c2;
+--:-:4:-:1  @P3 F2F.F16.F32 c3, c3;
+
+[+
+    # Store the results. With $vec, c1 is merged into c0 and c3 with c2 via
+    # BFI and a single 64-bit store is issued under P0; otherwise four 16-bit
+    # stores are issued, each under its own predicate P0..P3.
+    our $vec;
+    return $vec ? q{
+03:-:-:-:2  @P0 BFI c0, c1, 0x1010, c0;
+0c:-:-:-:2  @P0 BFI c1, c3, 0x1010, c2;
+
+--:1:-:-:1  @P0 STG.E.CG.64 [C], c;
+    } : q{
+01:-:-:-:1  @P0 STG.E.U16 [C + 2x<0>], c0;
+02:-:-:-:1  @P1 STG.E.U16 [C + 2x<1>], c1;
+04:-:-:-:1  @P2 STG.E.U16 [C + 2x<2>], c2;
+08:1:-:-:1  @P3 STG.E.U16 [C + 2x<3>], c3;
+    };
++]
+
+// Restore beta preds (P0..P3 from preds when the updated P5 is set, else cleared)
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+// C += ldc8 as a 64-bit add (carry from C0 into C1).
+// NOTE(review): the 01 wait mask presumably pairs with barrier 1 set by the
+// last store above, so C0 is not changed while that store still needs it - confirm.
+01:-:-:-:6      IADD   C0.CC, C0, ldc8;
+--:-:-:-:0      IADD.X C1,    C1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/hgemm_nt_32x128.sass b/Kernel/SGEMM/Pascal/hgemm_nt_32x128.sass
new file mode 100644
index 0000000..eef6e5e
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_nt_32x128.sass
@@ -0,0 +1,588 @@
+# Kernel: hgemm_nt_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Named constants for this kernel.
+#   szShareA / szShareB : (32*16 + 32) and (128*16 + 32); the code multiplies
+#                         them by 4 through the 4x<...> form wherever they are used.
+#   addr_zero           : 4 * (2*szShareB + 2*szShareA); this is the location the
+#                         prologue fills with RZ and reads zeros back from.
+#   gridDim* / param_*  : constant-bank slots c[0x0][...]; the kernel parameters
+#                         occupy consecutive 4-byte slots from 0x140 to 0x188.
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<(128*16 + 32)*2 + (32*16 + 32)*2>
+    szShareA : (32*16 + 32)
+    szShareB : (128*16 + 32)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+# Register names for this kernel. Several names share the same registers:
+#   - czero<00-31> and the cx<0-3>y<0-7> table both cover registers 0-31, so
+#     writing the czero names initialises the cx/y accumulators.
+#   - the setup names in the first 32-79 pool share that range with the
+#     j<0-3>Ay / j<0-3>Bx registers (32-79).
+#   - the last three lines (C00y.., c<0-3>, d<0-3>, ldc, ... flags) reuse
+#     registers from 32 upwards that earlier lines assign to j*, load*,
+#     track* and the addressing registers.
+<REGISTER_MAPPING>
+
+    32-79 ~ tidX, lda, ldb, ldaz, ldbz, ldb32, tid1, tid3, tid96, ta, tb00, tb32, tb64, tb96, xmad_ta, xmad_tb, shiftX, tidY<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadA<0-3>
+      84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3>
+
+    100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+
+    110-120 ~ writeAs, writeBs, k, tidY, txa, txb00, txb32, txb64, txb96
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+// Kernel prologue: read the thread/block ids, zero the accumulators and
+// derive the global-memory track pointers and shared-memory read/write offsets.
+// Each S2R sets its own barrier (1..4); the 01/02/04/08 wait masks further
+// down sit on the first instructions that consume tid, blkB, blkA and blkZ.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkB, SR_CTAID.Z;
+--:-:3:-:1      S2R blkA, SR_CTAID.Y;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb32, ldb, 5;
+
+// Write RZ to addr_zero, then load it back into czero00, czero04, ... czero28
+// with eight 128-bit loads; the czero registers are the cx/y accumulators.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidX   = tid >> 2
+// tidY   = (tid & 3) << 2
+// shiftX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidX, tid,  2;
+01:-:-:-:1      LOP.AND tid3, tid,  3;
+--:-:-:-:1      SHL     tidY, tid3, 2;
+--:-:-:-:1      SHL     shiftX, tid3, 3;
+
+// trackA += ((blkA*32 + tidX) * lda + tidY) * 2
+// (ta additionally receives ldaz*blkZ from the XMAD.LO2 before the scaling)
+04:-:-:-:1      ISCADD   txa, blkA, tidX, 5;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,  tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 1;
+
+// trackB += ((blkB*128 + tidX) * ldb + tidY) * 2
+// (tb00 additionally receives ldbz*blkZ; tb32/tb64/tb96 are each ldb32 further on)
+02:-:-:-:1      ISCADD txb00, blkB, tidX, 7;
+--:-:-:-:1      IADD   txb32, txb00, 32;
+--:-:-:-:1      IADD   txb64, txb00, 64;
+--:-:-:-:1      IADD   txb96, txb00, 96;
+
+--:-:-:-:1      XMAD.LO  tb00, ldb,  txb00, tidY, xmad_tb;
+08:-:-:-:1      XMAD.LO2 tb00, ldbz, blkZ, tb00;
+--:-:-:-:1      IADD     tb32, tb00, ldb32;
+--:-:-:-:1      IADD     tb64, tb32, ldb32;
+--:-:-:-:1      IADD     tb96, tb64, ldb32;
+
+--:-:-:-:1      LEA      track0B0.CC, tb00, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb00, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track1B0.CC, tb32, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb32, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track2B0.CC, tb64, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track2B1,    tb64, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track3B0.CC, tb96, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track3B1,    tb96, param_B[1], RZ, 1;
+
+// writeAs = (tidY*32 + tidX + shiftX) * 4 + 4x<szShareA + szShareB>
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidY*128 + tidX + shiftX) * 4 + 4x<szShareA*2 + szShareB>
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeBs, writeBs, shiftX;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    16;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4) + 4x<szShareA>
+01:-:-:-:1      LOP.AND tid96,  tid,    96;
+--:-:-:-:1      SHR.U32 tid96,  tid96,  2;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readBs, readBs, tid96;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+// swapBuf starts as minus the 4x<szShareA + szShareB> offset that was added to
+// writeAs/writeBs above; how it is applied is outside this block.
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb32, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb64, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb96, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa,   param_m, PT;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY, k, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY, k, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidY, k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P2 LDG.E.CI.64 load0B, [track0B];
+--:-:2:-:1  @P3 LDG.E.CI.64 load1B, [track1B];
+--:-:3:-:1  @P4 LDG.E.CI.64 load2B, [track2B];
+--:-:4:-:1  @P5 LDG.E.CI.64 load3B, [track3B];
+--:-:5:-:1  @P6 LDG.E.CI.64 loadA,  [trackA];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P2 LDS.U.64 load0B, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.64 load1B, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.64 load2B, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.64 load3B, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.64 loadA,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidY1, tidY, 1;
+--:-:-:-:1      IADD tidY2, tidY, 2;
+--:-:-:-:1      IADD tidY3, tidY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:1:-:1  @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:1:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:1:-:1  @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb32, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb64, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P4;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:3:-:1  @P1 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B0, RZ;
+--:-:-:-:1 @!P1 MOV load2B1, RZ;
+--:-:-:-:1 @!P2 MOV load2B2, RZ;
+--:-:-:-:1 @!P3 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb96, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:4:-:1  @P1 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:4:-:1  @P2 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B0, RZ;
+--:-:-:-:1 @!P1 MOV load3B1, RZ;
+--:-:-:-:1 @!P2 MOV load3B2, RZ;
+--:-:-:-:1 @!P3 MOV load3B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];
+--:-:5:-:1  @P1 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];
+--:-:5:-:1  @P2 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb32, param_n, PT;
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P0, RZ, k, 15;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P0;
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;  # true: widen the .H0/.H1 halves of two registers into four; false: convert four registers in place
+    return $vec ? q{
+21:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:1:-:1      F2F.F32.F16 load0B0, load0B0.H0;
+
+02:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+
+04:-:-:-:1      F2F.F32.F16 load2B3, load2B1.H1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B1.H0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B0.H1;
+--:-:3:-:1      F2F.F32.F16 load2B0, load2B0.H0;
+
+08:-:-:-:1      F2F.F32.F16 load3B3, load3B1.H1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B1.H0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B0.H1;
+--:-:4:-:1      F2F.F32.F16 load3B0, load3B0.H0;
+
+10:-:-:-:1      F2F.F32.F16 loadA3, loadA1.H1;
+--:-:-:-:1      F2F.F32.F16 loadA2, loadA1.H0;
+--:-:-:-:1      F2F.F32.F16 loadA1, loadA0.H1;
+--:-:5:-:1      F2F.F32.F16 loadA0, loadA0.H0;
+    } : q{
+21:-:-:-:1      F2F.F32.F16 load0B0, load0B0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:1:-:1      F2F.F32.F16 load0B3, load0B3;
+
+02:-:-:-:1      F2F.F32.F16 load1B0, load1B0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:2:-:1      F2F.F32.F16 load1B3, load1B3;
+
+04:-:-:-:1      F2F.F32.F16 load2B0, load2B0;
+--:-:-:-:1      F2F.F32.F16 load2B1, load2B1;
+--:-:-:-:1      F2F.F32.F16 load2B2, load2B2;
+--:-:3:-:1      F2F.F32.F16 load2B3, load2B3;
+
+08:-:-:-:1      F2F.F32.F16 load3B0, load3B0;
+--:-:-:-:1      F2F.F32.F16 load3B1, load3B1;
+--:-:-:-:1      F2F.F32.F16 load3B2, load3B2;
+--:-:4:-:1      F2F.F32.F16 load3B3, load3B3;
+
+10:-:-:-:1      F2F.F32.F16 loadA0, loadA0;
+--:-:-:-:1      F2F.F32.F16 loadA1, loadA1;
+--:-:-:-:1      F2F.F32.F16 loadA2, loadA2;
+--:-:5:-:1      F2F.F32.F16 loadA3, loadA3;
+    };
+</CODE>
+
+01:-:-:-:1      STS [writeBs + 4x<0*128 + 0*32>], load0B0;
+--:-:-:-:0      IADD   track0B0.CC, track0B0, 2x<16>; // low word of the pointer, carry kept in CC
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 0*32>], load0B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 0*32>], load0B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 0*32>], load0B3;
+
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ; // high word takes the carry - same pair repeats per pointer below
+
+02:-:-:-:1      STS [writeBs + 4x<0*128 + 1*32>], load1B0;
+--:-:-:-:0      IADD   track1B0.CC, track1B0, 2x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 1*32>], load1B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 1*32>], load1B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 1*32>], load1B3;
+
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;
+
+04:-:-:-:1      STS [writeBs + 4x<0*128 + 2*32>], load2B0;
+--:-:-:-:0      IADD   track2B0.CC, track2B0, 2x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 2*32>], load2B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 2*32>], load2B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 2*32>], load2B3;
+
+--:-:-:-:0      IADD.X track2B1,    track2B1, RZ;
+
+08:-:-:-:1      STS [writeBs + 4x<0*128 + 3*32>], load3B0;
+--:-:-:-:0      IADD   track3B0.CC, track3B0, 2x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 3*32>], load3B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 3*32>], load3B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 3*32>], load3B3;
+
+--:-:-:-:0      IADD.X track3B1,    track3B1, RZ;
+
+10:-:-:-:1      STS [writeAs + 4x<0*32>], loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 2x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*32>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*32>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*32>], loadA3;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0; // sync after the shared stores above, before the write pointers move
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf; // flip the sign so the next swap reverses direction
+
+--:-:-:-:0      IADD.X trackA1,    trackA1, RZ; // consumes the carry from the trackA0.CC add above
+
+<CODE>
+    our $vec;  # true: one 64-bit LDG per track pointer; false: four U16 LDGs per track pointer
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.CI.64 load0B, [track0B];
+--:-:4:-:1  @P3 LDG.E.CI.64 load1B, [track1B];
+--:-:5:-:1  @P4 LDG.E.CI.64 load2B, [track2B];
+--:-:5:-:1  @P5 LDG.E.CI.64 load3B, [track3B];
+--:-:6:-:1  @P6 LDG.E.CI.64 loadA,  [trackA];
+    } : q{
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
+--:-:5:-:1  @P4 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec;  # selects which LDG/F2F entry list is merged into %insert below
+    our $shiftAX = 1;  # NOTE(review): not read in this block; presumably consumed by the included common file - confirm
+    our $shiftBX = 1;  # NOTE(review): same as $shiftAX - confirm against the included file
+    our %insert =  # keys have the form jNcM; presumably splice positions in the included loop template - confirm there
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        j3c6   => "04:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 0*32>], load0B0;\n",
+        j3c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 0*32>], load0B1;\n",
+        j3c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 0*32>], load0B2;\n",
+        j3c12  => "--:3:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 0*32>], load0B3;\n",
+
+        j5c6   => "08:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 1*32>], load1B0;\n",
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 1*32>], load1B1;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 1*32>], load1B2;\n",
+        j5c12  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 1*32>], load1B3;\n",
+
+        j7c6   => "10:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 2*32>], load2B0;\n",
+        j7c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 2*32>], load2B1;\n",
+        j7c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 2*32>], load2B2;\n",
+        j7c12  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 2*32>], load2B3;\n",
+
+        j9c6   => "10:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 3*32>], load3B0;\n",
+        j9c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 3*32>], load3B1;\n",
+        j9c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 3*32>], load3B2;\n",
+        j9c12  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 3*32>], load3B3;\n",
+
+        j11c6  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<0*32>], loadA0;\n",
+        j11c8  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], loadA1;\n",
+        j11c10 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], loadA2;\n",
+        j11c12 => "--:6:-:-:1  \@P0 STS [writeAs + 4x<3*32>], loadA3;\n",
+
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0B0.CC, track0B0, 2x<16>;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0B1,    track0B1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, 2x<16>;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P4 IADD   track2B0.CC, track2B0, 2x<16>;\n",
+        j7c13  => "--:-:-:-:1  \@P4 IADD.X track2B1,    track2B1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3B0.CC, track3B0, 2x<16>;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3B1,    track3B1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackA0.CC, trackA0, 2x<16>;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j7c14  => "--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?  # exactly one of the two key/value lists below is flattened into the hash
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.64 load0B, [track0B];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.64 load1B, [track1B];\n",
+                j9c29  => "10:-:5:-:1  \@P4 LDG.E.CI.64 load2B, [track2B];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.64 load3B, [track3B];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.64 loadA,  [trackA];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0B0, load0B0.H0;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j6c13  => "10:-:-:-:1  \@P4 F2F.F32.F16 load2B3, load2B1.H1;\n",
+                j6c17  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2B2, load2B1.H0;\n",
+                j6c21  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2B1, load2B0.H1;\n",
+                j6c25  => "--:-:5:-:1  \@P4 F2F.F32.F16 load2B0, load2B0.H0;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B3, load3B1.H1;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B2, load3B1.H0;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B1, load3B0.H1;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3B0, load3B0.H0;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadA3, loadA1.H1;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA2, loadA1.H0;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA1, loadA0.H1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadA0, loadA0.H0;\n",
+            ) :
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
+                j3c31  => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j4c1   => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j5c31  => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j6c1   => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+
+                j9c29  => "10:-:5:-:1  \@P4 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n",
+                j9c31  => "--:-:5:-:1  \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n",
+                j10c1  => "--:-:5:-:1  \@P4 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n",
+                j10c3  => "--:-:5:-:1  \@P4 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n",
+
+                j10c8  => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n",
+                j10c10 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n",
+                j10c12 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n",
+
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.U16 loadA0, [trackA + 2x<0>];\n",
+                j11c31 => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadA1, [trackA + 2x<1>];\n",
+                j12c1  => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadA2, [trackA + 2x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadA3, [trackA + 2x<3>];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0B0, load0B0;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B1, load0B1;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0B2, load0B2;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0B3, load0B3;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1B0, load1B0;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B1, load1B1;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1B2, load1B2;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1B3, load1B3;\n",
+
+                j6c13  => "10:-:-:-:1  \@P4 F2F.F32.F16 load2B0, load2B0;\n",
+                j6c17  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2B1, load2B1;\n",
+                j6c21  => "--:-:-:-:1  \@P4 F2F.F32.F16 load2B2, load2B2;\n",
+                j6c25  => "--:-:5:-:1  \@P4 F2F.F32.F16 load2B3, load2B3;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B0, load3B0;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B1, load3B1;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3B2, load3B2;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3B3, load3B3;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadA0, loadA0;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA1, loadA1;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadA2, loadA2;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadA3, loadA3;\n",
+            )
+        ),
+
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';  # contributes no text at this point; the block only sets package variables
+</CODE>
+
+<INCLUDE file="hgemm_common_32x128.sass"/>
diff --git a/Kernel/SGEMM/Pascal/hgemm_nt_32x32.sass b/Kernel/SGEMM/Pascal/hgemm_nt_32x32.sass
new file mode 100644
index 0000000..1225d7d
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_nt_32x32.sass
@@ -0,0 +1,1067 @@
+# Kernel: hgemm_nt_32x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 16x<32*65>
+    szShareA   : (32*65)
+    szShareB   : (32*65)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+       0-63 : czero<00-63>
+      64-79 : j0Ay<0-7>, j0Bx<0-7>
+      80-95 : j1Ay<0-7>, j1Bx<0-7>
+
+      64-95 ~ tidX, tidY, tidY<1-3>, lda, ldb, ldaz, ldbz, lda16, ldb16, tid1, tid16, tid16_8, ta<00|16>, txa<00|16>, tb<00|16>, txb<00|16>, xmad_ta, xmad_tb, shiftX, predsY0, predsY4, partialK
+
+     96-127 :  load0A<0-7>,  load1A<0-7>,  load0B<0-7>,  load1B<0-7>
+    128-135 : track0A<0-1>, track1A<0-1>, track0B<0-1>, track1B<0-1>
+
+    136-142 ~ swapBuf, readAs, readBs, writeAs, writeBs, k
+    143-149 ~ tid, blkA, blkB, blkZ, writeCs, preds
+
+       0-31 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>, part4C<0-3>, part5C<0-3>, part6C<0-3>, part7C<0-3>
+      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
+      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
+      96-99 : loadC<0-3>
+    100-103 : b<0-3>
+    104-107 : c<0-3>
+    108-109 : C<0-1>
+    110-142 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc16, readCs, alpha, beta, flags, tid7, tid8
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X; // write barrier 1 - waited on via the 01 code at the first tid use below
+--:-:2:-:1      S2R blkA, SR_CTAID.Y; // write barrier 2 - waited on via the 02 code at the first blkA use below
+--:-:3:-:1      S2R blkB, SR_CTAID.Z; // write barrier 3 - waited on via the 04 code at the first blkB use below
+--:-:4:-:1      S2R blkZ, SR_CTAID.X; // write barrier 4 - waited on via the 08 code at the first blkZ use below
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL lda16, lda, 4;
+--:-:-:-:1      SHL ldb16, ldb, 4;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ; // zero source for the LDS reads from addr_zero below
+[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]
+
+// tidX   = tid >> 3
+// tidY   = (tid & 7) << 3
+// shiftX = (tid & 7) << 2
+01:-:-:-:1      SHR.U32 tidX, tid,  3;
+--:-:-:-:1      LOP.AND tidY, tid,  7;
+--:-:-:-:1      SHL     shiftX, tidY, 2;
+--:-:-:-:1      SHL     tidY,   tidY, 3;
+
+// trackA = param_A + ((blkA*32 + tidX) * lda + tidY + blkZ*ldaz) * 2   (track1A adds lda16 rows)
+02:-:-:-:1      ISCADD   txa00, blkA, tidX, 5;
+--:-:-:-:1      IADD     txa16, txa00, 16;
+--:-:-:-:1      XMAD.LO  ta00, lda,  txa00, tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta00, ldaz, blkZ, ta00;
+--:-:-:-:1      IADD     ta16, ta00, lda16;
+--:-:-:-:1      LEA      track0A0.CC, ta00, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta00, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track1A0.CC, ta16, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track1A1,    ta16, param_A[1], RZ, 1;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa16, param_m, PT;
+
+// trackB = param_B + ((blkB*32 + tidX) * ldb + tidY + blkZ*ldbz) * 2   (track1B adds ldb16 rows)
+04:-:-:-:1      ISCADD   txb00, blkB, tidX, 5;
+--:-:-:-:1      IADD     txb16, txb00, 16;
+--:-:-:-:1      XMAD.LO  tb00, ldb,  txb00, tidY, xmad_tb;
+--:-:-:-:1      XMAD.LO2 tb00, ldbz, blkZ, tb00;
+--:-:-:-:1      IADD     tb16, tb00, ldb16;
+--:-:-:-:1      LEA      track0B0.CC, tb00, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb00, param_B[1], RZ, 1;
+--:-:-:-:1      LEA      track1B0.CC, tb16, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X track1B1,    tb16, param_B[1], RZ, 1;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb16, param_n, PT;
+
+--:-:-:-:1      P2R preds, PR, RZ, 0x3c; // stash P2-P5 (mask 0x3c) for the R2P restores below
+
+// writeAs = (tidY*32 + tidX + shiftX) * 4
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftX;
+--:-:-:-:1      SHL    writeAs, writeAs, 2;
+
+// writeBs = (tidY*32 + tidX + shiftX) * 4 + 4x<szShareA>
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 5;
+--:-:-:-:1      IADD   writeBs, writeBs, shiftX;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA>, 2;
+
+
+// readAs = (((tid & 8) >> 2) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    8;
+--:-:-:-:1      SHR.U32 readAs, readAs, 2;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((tid >> 1) & 3) << 4
+--:-:-:-:1      BFE.U32 readBs, tid,    0x201; // 2 bits at position 1
+--:-:-:-:1      SHL     readBs, readBs, 4;
+
+// tid16 = tid & -16
+// tid16_8 = tid16 / 2 * 4
+--:-:-:-:1      LOP.AND tid16, tid, -16;
+--:-:-:-:1      SHL     tid16_8, tid16, 1;
+
+// writeCs = (readAs + tid16*4) * 32 + readBs;
+--:-:-:-:1      ISCADD writeCs, tid16,   readAs, 2;
+--:-:-:-:1      ISCADD writeCs, writeCs, readBs, 5;
+
+// Each block of 16 threads works on 8 lines, shifted over by 4
+// readAs += tid16_8 * 32 + tid16
+// readBs += tid16_8 * 32 + tid16 + 4x<szShareA>
+--:-:-:-:1      ISCADD readAs, tid16_8, readAs, 5;
+--:-:-:-:1      ISCADD readBs, tid16_8, readBs, 5;
+--:-:-:-:1      IADD   readAs, tid16, readAs;
+--:-:-:-:1      IADD3  readBs, tid16, 4x<szShareA>, readBs;
+
+--:-:-:-:1      MOV32I swapBuf, 4x<szShareA + szShareB>;
+
+// If k is not a multiple of 64 we want to grab the partial amount on the first fetch.
+// If it is a multiple of 64 then make a full 64 line fetch.
+--:-:-:-:1      LOP.AND.Z P0, partialK, k, 63;
+--:-:-:-:1  @P0 MOV partialK, 64;
+--:-:-:-:1      IADD k, k, -partialK;
+[+
+    our $vec;  # true: 128-bit loads with LDS zero fill; false: per-element U16 loads with MOV zero fill
+    return $vec ? q{
+
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY, partialK, PT;
+--:-:-:-:1  @P1 R2P PR, preds, 0x3c;
+--:-:-:-:1 @!P1 R2P PR, RZ, 0x3c;
+
+<ORDERED>
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.128 load1A, [track1A];
+--:-:4:-:1  @P4 LDG.E.CI.128 load0B, [track0B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load1B, [track1B];
+</ORDERED>
+
+<ORDERED>
+--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:-:-:1 @!P3 LDS.U.128 load1A, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128 load0B, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.128 load1B, [addr_zero];
+</ORDERED>
+
+    } : q{
+--:-:-:-:1      IADD tidY1, tidY, 1;
+--:-:-:-:1      IADD tidY2, tidY, 2;
+--:-:-:-:1      IADD tidY3, tidY, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, partialK, PT;
+--:-:-:-:1      P2R predsY0, PR, RZ, 0x0f;
+
+--:-:-:-:1      IADD tidY,  tidY,  4;
+--:-:-:-:1      IADD tidY1, tidY1, 4;
+--:-:-:-:1      IADD tidY2, tidY2, 4;
+--:-:-:-:1      IADD tidY3, tidY3, 4;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, partialK, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, partialK, PT;
+--:-:-:-:1      P2R predsY4, PR, RZ, 0x0f;
+
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa00, param_m, PT;
+--:-:-:-:1  @P4 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1  @P4 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A4, RZ;
+--:-:-:-:1 @!P1 MOV load0A5, RZ;
+--:-:-:-:1 @!P2 MOV load0A6, RZ;
+--:-:-:-:1 @!P3 MOV load0A7, RZ;
+
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa16, param_m, PT;
+--:-:-:-:1  @P5 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1  @P5 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1A4, [track1A + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1A5, [track1A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1A6, [track1A + 2x<6>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A4, RZ;
+--:-:-:-:1 @!P1 MOV load1A5, RZ;
+--:-:-:-:1 @!P2 MOV load1A6, RZ;
+--:-:-:-:1 @!P3 MOV load1A7, RZ;
+
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb00, param_n, PT;
+--:-:-:-:1  @P6 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1  @P6 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P6 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B4, RZ;
+--:-:-:-:1 @!P1 MOV load0B5, RZ;
+--:-:-:-:1 @!P2 MOV load0B6, RZ;
+--:-:-:-:1 @!P3 MOV load0B7, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb16, param_n, PT;
+--:-:-:-:1  @P4 R2P PR, predsY0, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1  @P4 R2P PR, predsY4, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ, 0x0f;
+<ORDERED>
+--:-:-:-:1  @P0 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];
+--:-:-:-:1  @P1 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B4, RZ;
+--:-:-:-:1 @!P1 MOV load1B5, RZ;
+--:-:-:-:1 @!P2 MOV load1B6, RZ;
+--:-:-:-:1 @!P3 MOV load1B7, RZ;
+    };
++]
+--:-:-:-:1      SHL partialK, partialK, 1; // element count to bytes (2 per f16) for the track pointer adds
+
+--:-:-:-:1      ISETP.GE.AND P0, PT, k, 64, PT;
+--:-:-:-:1      IADD k, k, -64;
+--:-:-:-:1  @P0 R2P PR, preds, 0x3c; // a full 64-wide step remains: restore the saved P2-P5
+--:-:-:-:1 @!P0 R2P PR, RZ, 0x3c; // otherwise clear P2-P5
+</SCHEDULE_BLOCK>
+
+[+
+    our $vec;  # true: widen the packed .H0/.H1 halves into eight registers; false: convert each register in place
+    return $vec ? q{
+22:-:-:-:1      F2F.F32.F16 load0A7, load0A3.H1;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A3.H0;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A2.H1;
+--:-:6:-:1      F2F.F32.F16 load0A4, load0A2.H0;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0.H0;
+    } : q{
+02:-:-:-:1      F2F.F32.F16 load0A7, load0A7;
+--:-:-:-:1      F2F.F32.F16 load0A6, load0A6;
+--:-:-:-:1      F2F.F32.F16 load0A5, load0A5;
+--:-:6:-:1      F2F.F32.F16 load0A4, load0A4;
+--:-:-:-:1      F2F.F32.F16 load0A3, load0A3;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:2:-:1      F2F.F32.F16 load0A0, load0A0;
+    };
++]
+--:-:-:-:0      IADD   track0A0.CC, track0A0, partialK; // advance the pointer low word by the first-fetch byte count
+20:-:-:-:1      STS [writeAs + 4x<7*32 + 0*16>], load0A7;
+--:-:-:-:1      STS [writeAs + 4x<6*32 + 0*16>], load0A6;
+--:-:-:-:1      STS [writeAs + 4x<5*32 + 0*16>], load0A5;
+--:-:-:-:1      STS [writeAs + 4x<4*32 + 0*16>], load0A4;
+02:-:-:-:1      STS [writeAs + 4x<3*32 + 0*16>], load0A3;
+--:-:-:-:1      STS [writeAs + 4x<2*32 + 0*16>], load0A2;
+--:-:-:-:1      STS [writeAs + 4x<1*32 + 0*16>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<0*32 + 0*16>], load0A0;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ; // high word takes the carry from the CC add above
+
+[+
+    our $vec;  # true: widen the packed .H0/.H1 halves into eight registers; false: convert each register in place
+    return $vec ? q{
+04:-:-:-:1      F2F.F32.F16 load1A7, load1A3.H1;
+--:-:-:-:1      F2F.F32.F16 load1A6, load1A3.H0;
+--:-:-:-:1      F2F.F32.F16 load1A5, load1A2.H1;
+--:-:6:-:1      F2F.F32.F16 load1A4, load1A2.H0;
+--:-:-:-:1      F2F.F32.F16 load1A3, load1A1.H1;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A1.H0;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A0.H1;
+--:-:2:-:1      F2F.F32.F16 load1A0, load1A0.H0;
+    } : q{
+04:-:-:-:1      F2F.F32.F16 load1A7, load1A7;
+--:-:-:-:1      F2F.F32.F16 load1A6, load1A6;
+--:-:-:-:1      F2F.F32.F16 load1A5, load1A5;
+--:-:6:-:1      F2F.F32.F16 load1A4, load1A4;
+--:-:-:-:1      F2F.F32.F16 load1A3, load1A3;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A2;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A1;
+--:-:2:-:1      F2F.F32.F16 load1A0, load1A0;
+    };
++]
+--:-:-:-:0      IADD   track1A0.CC, track1A0, partialK; // advance the pointer low word by the first-fetch byte count
+20:-:-:-:1      STS [writeAs + 4x<7*32 + 1*16>], load1A7;
+--:-:-:-:1      STS [writeAs + 4x<6*32 + 1*16>], load1A6;
+--:-:-:-:1      STS [writeAs + 4x<5*32 + 1*16>], load1A5;
+--:-:-:-:1      STS [writeAs + 4x<4*32 + 1*16>], load1A4;
+02:-:-:-:1      STS [writeAs + 4x<3*32 + 1*16>], load1A3;
+--:-:-:-:1      STS [writeAs + 4x<2*32 + 1*16>], load1A2;
+--:-:-:-:1      STS [writeAs + 4x<1*32 + 1*16>], load1A1;
+--:-:-:-:1      STS [writeAs + 4x<0*32 + 1*16>], load1A0;
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ; // high word takes the carry from the CC add above
+
+[+
+    our $vec; # $vec set: widen the .H0/.H1 halves of load0B0-3 into load0B0-7; otherwise widen load0B0-7 in place
+    return $vec ? q{
+08:-:-:-:1      F2F.F32.F16 load0B7, load0B3.H1;
+--:-:-:-:1      F2F.F32.F16 load0B6, load0B3.H0;
+--:-:-:-:1      F2F.F32.F16 load0B5, load0B2.H1;
+--:-:6:-:1      F2F.F32.F16 load0B4, load0B2.H0;
+--:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
+--:-:2:-:1      F2F.F32.F16 load0B0, load0B0.H0;
+    } : q{
+08:-:-:-:1      F2F.F32.F16 load0B7, load0B7;
+--:-:-:-:1      F2F.F32.F16 load0B6, load0B6;
+--:-:-:-:1      F2F.F32.F16 load0B5, load0B5;
+--:-:6:-:1      F2F.F32.F16 load0B4, load0B4;
+--:-:-:-:1      F2F.F32.F16 load0B3, load0B3;
+--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
+--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
+--:-:2:-:1      F2F.F32.F16 load0B0, load0B0;
+    }; # both variants convert from index 7 down to 0 and carry identical control fields, so the STS sequence below serves either one
++]
+--:-:-:-:0      IADD   track0B0.CC, track0B0, partialK; // track0B0/track0B1 += partialK; NOTE(review): assumes .CC/.X form a carry chain across the pair
+20:-:-:-:1      STS [writeBs + 4x<7*32 + 0*16>], load0B7; // wait mask 20: presumably the barrier set by the F2F that writes load0B4
+--:-:-:-:1      STS [writeBs + 4x<6*32 + 0*16>], load0B6;
+--:-:-:-:1      STS [writeBs + 4x<5*32 + 0*16>], load0B5;
+--:-:-:-:1      STS [writeBs + 4x<4*32 + 0*16>], load0B4;
+02:-:-:-:1      STS [writeBs + 4x<3*32 + 0*16>], load0B3; // wait mask 02: presumably the barrier set by the F2F that writes load0B0
+--:-:-:-:1      STS [writeBs + 4x<2*32 + 0*16>], load0B2;
+--:-:-:-:1      STS [writeBs + 4x<1*32 + 0*16>], load0B1;
+--:-:-:-:1      STS [writeBs + 4x<0*32 + 0*16>], load0B0;
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ; // second half of the track0B update started by the IADD above
+
+[+
+    our $vec; # $vec set: widen the .H0/.H1 halves of load1B0-3 into load1B0-7; otherwise widen load1B0-7 in place
+    return $vec ? q{
+10:-:-:-:1      F2F.F32.F16 load1B7, load1B3.H1;
+--:-:-:-:1      F2F.F32.F16 load1B6, load1B3.H0;
+--:-:-:-:1      F2F.F32.F16 load1B5, load1B2.H1;
+--:-:6:-:1      F2F.F32.F16 load1B4, load1B2.H0;
+--:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0.H0;
+    } : q{
+10:-:-:-:1      F2F.F32.F16 load1B7, load1B7;
+--:-:-:-:1      F2F.F32.F16 load1B6, load1B6;
+--:-:-:-:1      F2F.F32.F16 load1B5, load1B5;
+--:-:6:-:1      F2F.F32.F16 load1B4, load1B4;
+--:-:-:-:1      F2F.F32.F16 load1B3, load1B3;
+--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
+--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
+--:-:2:-:1      F2F.F32.F16 load1B0, load1B0;
+    }; # both variants convert from index 7 down to 0 and carry identical control fields, so the STS sequence below serves either one
++]
+--:-:-:-:0      IADD   track1B0.CC, track1B0, partialK; // track1B0/track1B1 += partialK; NOTE(review): assumes .CC/.X form a carry chain across the pair
+20:-:-:-:1      STS [writeBs + 4x<7*32 + 1*16>], load1B7; // wait mask 20: presumably the barrier set by the F2F that writes load1B4
+--:-:-:-:1      STS [writeBs + 4x<6*32 + 1*16>], load1B6;
+--:-:-:-:1      STS [writeBs + 4x<5*32 + 1*16>], load1B5;
+--:-:-:-:1      STS [writeBs + 4x<4*32 + 1*16>], load1B4;
+02:-:-:-:1      STS [writeBs + 4x<3*32 + 1*16>], load1B3; // wait mask 02: presumably the barrier set by the F2F that writes load1B0
+--:-:-:-:1      STS [writeBs + 4x<2*32 + 1*16>], load1B2;
+--:-:-:-:1      STS [writeBs + 4x<1*32 + 1*16>], load1B1;
+--:-:-:-:1      STS [writeBs + 4x<0*32 + 1*16>], load1B0;
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ; // second half of the track1B update started by the IADD above
+
+--:-:-:-:5      BAR.SYNC 0; // barrier after the STS stores above, before anything is read back through readAs/readBs
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf; // move both write pointers by swapBuf...
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf; // ...then negate swapBuf so the next swap moves them back
+
+--:-:-:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00>]; // initial fill of the j0 operand registers, which the first FFMA group in LOOP reads
+--:-:-:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*32 + 00>];
+--:-:-:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*32 + 16>]; // third control field 1: presumably the barrier the first FFMA (wait mask 01) waits on
+
+[+
+    our $vec; # $vec set: one 128-bit LDG per operand group; otherwise eight U16 LDGs per group; load0A/load1A/load0B/load1B are guarded by P2/P3/P4/P5
+    return $vec ? q{
+--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P3 LDG.E.CI.128 load1A, [track1A];
+--:-:4:-:1  @P4 LDG.E.CI.128 load0B, [track0B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load1B, [track1B];
+    } : q{
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
+--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
+
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A4, [track1A + 2x<4>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A5, [track1A + 2x<5>];
+--:-:-:-:1  @P3 LDG.E.CI.U16 load1A6, [track1A + 2x<6>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>];
+
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];
+--:-:-:-:1  @P4 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];
+--:-:4:-:1  @P4 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];
+
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];
+--:-:-:-:1  @P5 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];
+    }; # in both variants the last LDG of each group sets the third control field to 2, 3, 4, 5; NOTE(review): presumably the barriers behind the in-loop F2F wait masks 02/04/08/10
++]
+
+LOOP:
+
+[+
+    our $vec; # selects the vectorized (packed halves, 128-bit LDG) or the scalar (U16 LDG) entries of %insert below
+    our %insert = # "j<J>c<C>" => text emitted directly after FFMA number C of unroll step J (consumed by the loops at the end of this block)
+    (
+        j0c8   => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, RZ, PT;\n", # P0 = (k >= 0): guards the F2F/STS entries, the buffer swap and the closing BRA
+        j0c10  => "--:-:-:-:1      ISETP.GE.AND P1, PT, k, 64, PT;\n" . # P1 = (k >= 64), tested before k is decremented by 64
+                  "--:-:-:-:1      IADD k, k, -64;\n",
+
+        j0c23  => "--:-:-:-:1  \@P1 R2P PR, preds, 0x3c;\n", # mask 0x3c = bits 2-5; NOTE(review): presumably reloads P2-P5 (the LDG guards) from preds
+        j0c24  => "--:-:-:-:1 \@!P1 R2P PR, RZ,    0x3c;\n", # same mask sourced from RZ when P1 is false
+
+        j3c32  => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 2x<64>;\n", # advance each track pointer under the same predicate as its LDG
+        j3c37  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j4c32  => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, 2x<64>;\n",
+        j4c37  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j5c32  => "--:-:-:-:1  \@P4 IADD   track0B0.CC, track0B0, 2x<64>;\n",
+        j5c37  => "--:-:-:-:1  \@P4 IADD.X track0B1,    track0B1, RZ;\n",
+        j6c32  => "--:-:-:-:1  \@P5 IADD   track1B0.CC, track1B0, 2x<64>;\n",
+        j6c37  => "--:-:-:-:1  \@P5 IADD.X track1B1,    track1B1, RZ;\n",
+
+        j6c63  => "--:-:-:-:5      BAR.SYNC 0;\n" . # last slot of step 6: barrier, then (under P0) swap the read and write buffers
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ? # vectorized entries: F2F from packed halves, STS in descending index order, one 128-bit LDG per group
+            (
+                j2c45 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n",
+                j2c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n",
+                j2c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n",
+                j2c57 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n",
+                j2c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j3c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j3c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j3c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j3c45 => "04:-:-:-:1  \@P0 F2F.F32.F16 load1A7, load1A3.H1;\n",
+                j3c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A6, load1A3.H0;\n",
+                j3c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A5, load1A2.H1;\n",
+                j3c57 => "--:-:3:-:1  \@P0 F2F.F32.F16 load1A4, load1A2.H0;\n",
+                j3c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A3, load1A1.H1;\n",
+                j4c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A2, load1A1.H0;\n",
+                j4c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A1, load1A0.H1;\n",
+                j4c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load1A0, load1A0.H0;\n",
+
+                j4c45 => "08:-:-:-:1  \@P0 F2F.F32.F16 load0B7, load0B3.H1;\n",
+                j4c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B6, load0B3.H0;\n",
+                j4c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B5, load0B2.H1;\n",
+                j4c57 => "--:-:4:-:1  \@P0 F2F.F32.F16 load0B4, load0B2.H0;\n",
+                j4c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n",
+                j5c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n",
+                j5c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n",
+                j5c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n",
+
+                j5c45 => "10:-:-:-:1  \@P0 F2F.F32.F16 load1B7, load1B3.H1;\n",
+                j5c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B6, load1B3.H0;\n",
+                j5c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B5, load1B2.H1;\n",
+                j5c57 => "--:-:5:-:1  \@P0 F2F.F32.F16 load1B4, load1B2.H0;\n",
+                j5c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n",
+                j6c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n",
+                j6c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n",
+                j6c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n",
+
+                j3c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<7*32 + 0*16>], load0A7;\n",
+                j3c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*32 + 0*16>], load0A6;\n",
+                j3c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*32 + 0*16>], load0A5;\n",
+                j3c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*32 + 0*16>], load0A4;\n",
+                j3c24  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<3*32 + 0*16>], load0A3;\n",
+                j3c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32 + 0*16>], load0A2;\n",
+                j3c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32 + 0*16>], load0A1;\n",
+                j3c30  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<0*32 + 0*16>], load0A0;\n",
+
+                j4c16  => "04:-:-:-:1  \@P0 STS [writeAs + 4x<7*32 + 1*16>], load1A7;\n",
+                j4c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*32 + 1*16>], load1A6;\n",
+                j4c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*32 + 1*16>], load1A5;\n",
+                j4c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*32 + 1*16>], load1A4;\n",
+                j4c24  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<3*32 + 1*16>], load1A3;\n",
+                j4c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32 + 1*16>], load1A2;\n",
+                j4c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32 + 1*16>], load1A1;\n",
+                j4c30  => "--:3:-:-:1  \@P0 STS [writeAs + 4x<0*32 + 1*16>], load1A0;\n",
+
+                j5c16  => "08:-:-:-:1  \@P0 STS [writeBs + 4x<7*32 + 0*16>], load0B7;\n",
+                j5c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*32 + 0*16>], load0B6;\n",
+                j5c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*32 + 0*16>], load0B5;\n",
+                j5c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*32 + 0*16>], load0B4;\n",
+                j5c24  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<3*32 + 0*16>], load0B3;\n",
+                j5c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*32 + 0*16>], load0B2;\n",
+                j5c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*32 + 0*16>], load0B1;\n",
+                j5c30  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<0*32 + 0*16>], load0B0;\n",
+
+                j6c16  => "10:-:-:-:1  \@P0 STS [writeBs + 4x<7*32 + 1*16>], load1B7;\n",
+                j6c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*32 + 1*16>], load1B6;\n",
+                j6c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*32 + 1*16>], load1B5;\n",
+                j6c22  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<4*32 + 1*16>], load1B4;\n",
+                j6c24  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<3*32 + 1*16>], load1B3;\n",
+                j6c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*32 + 1*16>], load1B2;\n",
+                j6c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*32 + 1*16>], load1B1;\n",
+                j6c30  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<0*32 + 1*16>], load1B0;\n",
+
+                j3c62 => "02:-:2:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j4c62 => "04:-:3:-:1  \@P3 LDG.E.CI.128 load1A, [track1A];\n",
+                j5c62 => "08:-:4:-:1  \@P4 LDG.E.CI.128 load0B, [track0B];\n",
+                j6c62 => "10:-:5:-:1  \@P5 LDG.E.CI.128 load1B, [track1B];\n",
+            ) : # scalar entries: F2F in place, STS in ascending index order, eight U16 LDGs per group; same slot keys as above
+            (
+                j2c45 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A0, load0A0;\n",
+                j2c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A1;\n",
+                j2c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A2;\n",
+                j2c57 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A3, load0A3;\n",
+                j2c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A4;\n",
+                j3c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A5;\n",
+                j3c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A6;\n",
+                j3c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load0A7, load0A7;\n",
+
+                j3c45 => "04:-:-:-:1  \@P0 F2F.F32.F16 load1A0, load1A0;\n",
+                j3c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A1, load1A1;\n",
+                j3c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A2, load1A2;\n",
+                j3c57 => "--:-:3:-:1  \@P0 F2F.F32.F16 load1A3, load1A3;\n",
+                j3c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A4, load1A4;\n",
+                j4c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A5, load1A5;\n",
+                j4c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1A6, load1A6;\n",
+                j4c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load1A7, load1A7;\n",
+
+                j4c45 => "08:-:-:-:1  \@P0 F2F.F32.F16 load0B0, load0B0;\n",
+                j4c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B1;\n",
+                j4c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B2;\n",
+                j4c57 => "--:-:4:-:1  \@P0 F2F.F32.F16 load0B3, load0B3;\n",
+                j4c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B4, load0B4;\n",
+                j5c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B5, load0B5;\n",
+                j5c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B6, load0B6;\n",
+                j5c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load0B7, load0B7;\n",
+
+                j5c45 => "10:-:-:-:1  \@P0 F2F.F32.F16 load1B0, load1B0;\n",
+                j5c49 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B1;\n",
+                j5c53 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B2;\n",
+                j5c57 => "--:-:5:-:1  \@P0 F2F.F32.F16 load1B3, load1B3;\n",
+                j5c61 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B4, load1B4;\n",
+                j6c1  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B5, load1B5;\n",
+                j6c5  => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B6, load1B6;\n",
+                j6c9  => "--:-:6:-:1  \@P0 F2F.F32.F16 load1B7, load1B7;\n",
+
+                j3c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<0*32 + 0*16>], load0A0;\n",
+                j3c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32 + 0*16>], load0A1;\n",
+                j3c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32 + 0*16>], load0A2;\n",
+                j3c22  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<3*32 + 0*16>], load0A3;\n",
+                j3c24  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<4*32 + 0*16>], load0A4;\n",
+                j3c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*32 + 0*16>], load0A5;\n",
+                j3c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*32 + 0*16>], load0A6;\n",
+                j3c30  => "--:6:-:-:1  \@P0 STS [writeAs + 4x<7*32 + 0*16>], load0A7;\n",
+
+                j4c16  => "04:-:-:-:1  \@P0 STS [writeAs + 4x<0*32 + 1*16>], load1A0;\n",
+                j4c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32 + 1*16>], load1A1;\n",
+                j4c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32 + 1*16>], load1A2;\n",
+                j4c22  => "--:3:-:-:1  \@P0 STS [writeAs + 4x<3*32 + 1*16>], load1A3;\n",
+                j4c24  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<4*32 + 1*16>], load1A4;\n",
+                j4c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*32 + 1*16>], load1A5;\n",
+                j4c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*32 + 1*16>], load1A6;\n",
+                j4c30  => "--:6:-:-:1  \@P0 STS [writeAs + 4x<7*32 + 1*16>], load1A7;\n",
+
+                j5c16  => "08:-:-:-:1  \@P0 STS [writeBs + 4x<0*32 + 0*16>], load0B0;\n",
+                j5c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*32 + 0*16>], load0B1;\n",
+                j5c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*32 + 0*16>], load0B2;\n",
+                j5c22  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<3*32 + 0*16>], load0B3;\n",
+                j5c24  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<4*32 + 0*16>], load0B4;\n",
+                j5c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*32 + 0*16>], load0B5;\n",
+                j5c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*32 + 0*16>], load0B6;\n",
+                j5c30  => "--:6:-:-:1  \@P0 STS [writeBs + 4x<7*32 + 0*16>], load0B7;\n",
+
+                j6c16  => "10:-:-:-:1  \@P0 STS [writeBs + 4x<0*32 + 1*16>], load1B0;\n",
+                j6c18  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*32 + 1*16>], load1B1;\n",
+                j6c20  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*32 + 1*16>], load1B2;\n",
+                j6c22  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<3*32 + 1*16>], load1B3;\n",
+                j6c24  => "20:-:-:-:1  \@P0 STS [writeBs + 4x<4*32 + 1*16>], load1B4;\n",
+                j6c26  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<5*32 + 1*16>], load1B5;\n",
+                j6c28  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<6*32 + 1*16>], load1B6;\n",
+                j6c30  => "--:6:-:-:1  \@P0 STS [writeBs + 4x<7*32 + 1*16>], load1B7;\n",
+
+                j3c48 => "02:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j3c50 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j3c52 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j3c54 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+                j3c56 => "20:-:-:-:1  \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n",
+                j3c58 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n",
+                j3c60 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n",
+                j3c62 => "--:-:2:-:1  \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n",
+
+                j4c48 => "04:-:-:-:1  \@P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];\n",
+                j4c50 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n",
+                j4c52 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n",
+                j4c54 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n",
+                j4c56 => "20:-:-:-:1  \@P3 LDG.E.CI.U16 load1A4, [track1A + 2x<4>];\n",
+                j4c58 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A5, [track1A + 2x<5>];\n",
+                j4c60 => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A6, [track1A + 2x<6>];\n",
+                j4c62 => "--:-:3:-:1  \@P3 LDG.E.CI.U16 load1A7, [track1A + 2x<7>];\n",
+
+                j5c48 => "08:-:-:-:1  \@P4 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
+                j5c50 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
+                j5c52 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
+                j5c54 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",
+                j5c56 => "20:-:-:-:1  \@P4 LDG.E.CI.U16 load0B4, [track0B + 2x<4>];\n",
+                j5c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B5, [track0B + 2x<5>];\n",
+                j5c60 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B6, [track0B + 2x<6>];\n",
+                j5c62 => "--:-:4:-:1  \@P4 LDG.E.CI.U16 load0B7, [track0B + 2x<7>];\n",
+
+                j6c48 => "10:-:-:-:1  \@P5 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
+                j6c50 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
+                j6c52 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
+                j6c54 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",
+                j6c56 => "20:-:-:-:1  \@P5 LDG.E.CI.U16 load1B4, [track1B + 2x<4>];\n",
+                j6c58 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B5, [track1B + 2x<5>];\n",
+                j6c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B6, [track1B + 2x<6>];\n",
+                j6c62 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load1B7, [track1B + 2x<7>];\n",
+            )
+        ),
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n", # very last slot: branch back to LOOP while P0 holds
+    );
+    my @cOrder; # FFMA issue order: 64 [x, y] pairs that cover every cx0-7 / y0-7 accumulator exactly once
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]); # [dx, dy] offsets applied to each base [x, y]
+    my @y = (0,1,4,5);
+    foreach my $x (0,2,4,6)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y; # sweep the y bases in the opposite direction for the next x
+    }
+    my $out = '';
+    foreach my $j (0 .. 7) # eight unroll steps of 64 FFMAs each
+    {
+        my $odd      = $j & 1; # register set (j0 or j1) read by this step's FFMAs
+        my $nOdd     = !$odd + 0; # the other set, which the LDS entries below load into
+        my $rsOffset = ($j + 1) % 8; # offset index the LDS entries read for the following step (wraps to 0 after step 7)
+        my $rsPred   = $j == 7 ? '@P0' : '   '; # on the last step the LDS entries are predicated on P0
+
+        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset; # the four LDS entries take slots c0, c2, c4 and c6 of every step
+        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
+
+        foreach my $c (0 .. 63)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || ''; # text to emit after this FFMA, or '' when the slot is empty
+
+            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1; # stall field 0 when one of these opcodes follows the FFMA, otherwise 1
+
+            my $yield  = $c == 32 && $stall ? 'Y' : '-'; # 'Y' only at c == 32, and only when the stall field is 1
+
+            my $wait   = $c == 0 ? '01' : '--'; # first FFMA of a step carries wait mask 01; NOTE(review): presumably the barrier set by the LDS in slot c6
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins; # cx = jBx * jAy + cx, followed by the inserted text
+        }
+    }
+    return $out;
++]
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// readCs = ((tid & 7) * 4 + (tid / 8) * 32) * 4; tid7 is left holding (tid & 7) * 4 for the cx computation below
+--:-:-:-:1      LOP.AND tid7, tid, 7;
+--:-:-:-:1      SHR.U32 tid8, tid, 3;
+--:-:-:-:1      SHL     tid7, tid7, 2;
+--:-:-:-:1      ISCADD readCs, tid8, tid7, 5;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*32 + tid7 (tid7 = (tid & 7) * 4 at this point); cx1..cx3 = cx + 1..3
+--:-:-:-:1      ISCADD cx, blkB, tid7, 5;
+--:-:-:-:1      IADD   cx1, cx, 1;
+--:-:-:-:1      IADD   cx2, cx, 2;
+--:-:-:-:1      IADD   cx3, cx, 3;
+
+// cy = blkA*32 + tid8
+--:-:-:-:1      ISCADD cy, blkA, tid8, 5;
+
+// C = param_C + ci * 2 with ci = cy*ldc + cx (NOTE(review): the XMAD.LO2 presumably adds ldcz*blkZ); ldc16 = ldc * 32 is added to C at the end of STORE_C
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      SHL  ldc16, ldc, 5;
+
+--:-:-:-:1      XMAD.LO  ci, cy, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C0.CC, ci, param_C[0],     1;
+--:-:-:-:1      LEA.HI.X C1,    ci, param_C[1], RZ, 1;
+
+// P0..P3 = cx, cx1, cx2, cx3 < n; the P2R then saves them in preds (mask 0x0f)
+--:-:-:-:1      ISETP.LT.AND P0, PT, cx,  param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cx1, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cx2, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cx3, param_n, PT;
+--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
+
+// P4 = cy < m
+--:-:-:-:1      ISETP.LT.AND P4, PT, cy, param_m, PT;
+
+// P5 = beta != 0 && P4
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P4;
+
+// P6 = Apply relu: (flags & 2) != 0
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+
+// Init beta preds: P0-P3 = preds when P5, taken from RZ otherwise
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha; // first half: accumulators with y0-y3 are scaled by alpha into the shuffle_ registers
+--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
+--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
+--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
+--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
+--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
+--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
+--:-:-:-:0      FMUL shuffle_x7y0, cx7y0, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32 + 00>], shuffle_x0y0; // the scaled values are staged at writeCs, interleaved with the remaining FMULs
+--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
+--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
+--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
+--:-:-:-:0      FMUL shuffle_x3y1, cx3y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32 + 16>], shuffle_x4y0;
+--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
+--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
+--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
+--:-:-:-:0      FMUL shuffle_x7y1, cx7y1, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32 + 00>], shuffle_x0y1;
+--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
+--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
+--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
+--:-:-:-:0      FMUL shuffle_x3y2, cx3y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32 + 16>], shuffle_x4y1;
+--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
+--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
+--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
+--:-:-:-:0      FMUL shuffle_x7y2, cx7y2, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*32 + 00>], shuffle_x0y2;
+--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
+--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
+--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
+--:-:-:-:0      FMUL shuffle_x3y3, cx3y3, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*32 + 16>], shuffle_x4y2;
+--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
+--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
+--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
+--:-:-:-:0      FMUL shuffle_x7y3, cx7y3, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*32 + 00>], shuffle_x0y3;
+--:-:-:-:1      STS.128 [writeCs+4x<3*32 + 16>], shuffle_x4y3;
+--:-:-:-:5      BAR.SYNC 0; // barrier between the staging stores above and the reads through readCs in STORE_C
+
+--:-:-:-:5      CAL STORE_C; // first of two calls
+
+--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha; // second half: accumulators with y4-y7, same store offsets as the first half
+--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
+--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
+--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
+--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
+--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
+--:-:-:-:0      FMUL shuffle_x6y4, cx6y4, alpha;
+--:-:-:-:5      BAR.SYNC 0; // NOTE(review): presumably keeps the stores below from overwriting data the first STORE_C call is still reading
+--:-:-:-:0      FMUL shuffle_x7y4, cx7y4, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32 + 00>], shuffle_x0y4;
+--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
+--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
+--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
+--:-:-:-:0      FMUL shuffle_x3y5, cx3y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<0*32 + 16>], shuffle_x4y4;
+--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
+--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
+--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
+--:-:-:-:0      FMUL shuffle_x7y5, cx7y5, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32 + 00>], shuffle_x0y5;
+--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
+--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
+--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
+--:-:-:-:0      FMUL shuffle_x3y6, cx3y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<1*32 + 16>], shuffle_x4y5;
+--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
+--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
+--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
+--:-:-:-:0      FMUL shuffle_x7y6, cx7y6, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*32 + 00>], shuffle_x0y6;
+--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
+--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
+--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
+--:-:-:-:0      FMUL shuffle_x3y7, cx3y7, alpha;
+--:-:-:-:1      STS.128 [writeCs+4x<2*32 + 16>], shuffle_x4y6;
+--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
+--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
+--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
+--:-:-:-:0      FMUL shuffle_x7y7, cx7y7, alpha;
+--:-:-:-:4      STS.128 [writeCs+4x<3*32 + 00>], shuffle_x0y7;
+--:-:-:-:1      STS.128 [writeCs+4x<3*32 + 16>], shuffle_x4y7;
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:5      CAL STORE_C; // second call
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+
+[+
+    our $vec; # load the existing C values for the beta term: one 64-bit LDG under P0, or four U16 LDGs under P0..P3 with each loadC register zeroed when its predicate is false
+    return $vec ? q{
+--:-:1:-:1  @P0 LDG.E.64 loadC, [C];
+    } : q{
+--:-:-:-:0 @!P0 MOV loadC0, RZ;
+--:-:-:-:1  @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>];
+--:-:-:-:0 @!P1 MOV loadC1, RZ;
+--:-:-:-:1  @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>];
+--:-:-:-:0 @!P2 MOV loadC2, RZ;
+--:-:-:-:1  @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>];
+--:-:-:-:0 @!P3 MOV loadC3, RZ;
+--:-:1:-:1  @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>];
+    };
++]
+
+// Restore output preds: P0-P3 = preds when P4, taken from RZ otherwise
+--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P4 R2P PR, RZ,    0x0f;
+
+--:-:-:-:1      LDS.U.128 part0C, [readCs + 4x<0*16*32>]; // read part0C..part7C from readCs; the scheduled block below adds them pairwise into c0..c3
+--:-:2:-:1      LDS.U.128 part1C, [readCs + 4x<1*16*32>];
+--:-:-:-:1      LDS.U.128 part2C, [readCs + 4x<2*16*32>];
+--:-:3:-:1      LDS.U.128 part3C, [readCs + 4x<3*16*32>];
+--:-:-:-:1      LDS.U.128 part4C, [readCs + 4x<4*16*32>];
+--:-:4:-:1      LDS.U.128 part5C, [readCs + 4x<5*16*32>];
+--:-:-:-:1      LDS.U.128 part6C, [readCs + 4x<6*16*32>];
+--:-:5:-:1      LDS.U.128 part7C, [readCs + 4x<7*16*32>];
+
+<SCHEDULE_BLOCK>
+02:-:-:-:1  @P0 FADD part0C0, part0C0, part1C0;
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part1C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part1C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part1C3;
+
+04:-:-:-:1  @P0 FADD part2C0, part2C0, part3C0;
+--:-:-:-:1  @P1 FADD part2C1, part2C1, part3C1;
+--:-:-:-:1  @P2 FADD part2C2, part2C2, part3C2;
+--:-:-:-:1  @P3 FADD part2C3, part2C3, part3C3;
+
+08:-:-:-:1  @P0 FADD part4C0, part4C0, part5C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part5C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part5C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part5C3;
+
+10:-:-:-:1  @P0 FADD part6C0, part6C0, part7C0;
+--:-:-:-:1  @P1 FADD part6C1, part6C1, part7C1;
+--:-:-:-:1  @P2 FADD part6C2, part6C2, part7C2;
+--:-:-:-:1  @P3 FADD part6C3, part6C3, part7C3;
+
+--:-:-:-:1  @P0 FADD part0C0, part0C0, part2C0;
+--:-:-:-:1  @P1 FADD part0C1, part0C1, part2C1;
+--:-:-:-:1  @P2 FADD part0C2, part0C2, part2C2;
+--:-:-:-:1  @P3 FADD part0C3, part0C3, part2C3;
+
+--:-:-:-:1  @P0 FADD part4C0, part4C0, part6C0;
+--:-:-:-:1  @P1 FADD part4C1, part4C1, part6C1;
+--:-:-:-:1  @P2 FADD part4C2, part4C2, part6C2;
+--:-:-:-:1  @P3 FADD part4C3, part4C3, part6C3;
+
+--:-:-:-:1  @P0 FADD c0, part0C0, part4C0;
+--:-:-:-:1  @P1 FADD c1, part0C1, part4C1;
+--:-:-:-:1  @P2 FADD c2, part0C2, part4C2;
+--:-:-:-:1  @P3 FADD c3, part0C3, part4C3;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:0      IADD cy, cy, 16; // step cy by 16; P5 and P4 are re-tested against param_m further down
+
+[+
+    our $vec; # under P5, widen the loaded C values to F32 in b0..b3: from the halves of loadC0/loadC1 when $vec is set, from loadC0..loadC3 otherwise
+    return $vec ? q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0.H0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC0.H1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC1.H0;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC1.H1;
+    } : q{
+01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0;
+--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC1;
+--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC2;
+--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC3;
+    };
++]
+
+01:-:-:-:1  @P5 FFMA c0, b0, beta, c0; // under P5: c = b * beta + c
+02:-:-:-:1  @P5 FFMA c1, b1, beta, c1;
+04:-:-:-:1  @P5 FFMA c2, b2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, b3, beta, c3;
+
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT; // under P6 (the relu flag); NOTE(review): FMNMX with !PT presumably selects max(c, 0)
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:0      ISETP.LT.AND P5, PT, cy, param_m, P5; // P5 = (cy < m) && P5, using the cy already stepped above
+
+--:-:1:-:1  @P0 F2F.F16.F32 c0, c0; // narrow the results back to F16 in place
+--:-:2:-:1  @P1 F2F.F16.F32 c1, c1;
+
+--:-:-:-:0      ISETP.LT.AND P4, PT, cy, param_m, PT; // P4 = (cy < m)
+
+--:-:3:-:1  @P2 F2F.F16.F32 c2, c2;
+--:-:4:-:1  @P3 F2F.F16.F32 c3, c3;
+
+[+
+    our $vec; # $vec set: two BFIs combine c1 with c0 and c3 with c2, then one 64-bit store under P0; otherwise four U16 stores under P0..P3
+    return $vec ? q{
+03:-:-:-:2  @P0 BFI c0, c1, 0x1010, c0;
+0c:-:-:-:2  @P0 BFI c1, c3, 0x1010, c2;
+
+--:1:-:-:1  @P0 STG.E.CG.64 [C], c;
+    } : q{
+01:-:-:-:1  @P0 STG.E.U16 [C + 2x<0>], c0;
+02:-:-:-:1  @P1 STG.E.U16 [C + 2x<1>], c1;
+04:-:-:-:1  @P2 STG.E.U16 [C + 2x<2>], c2;
+08:1:-:-:1  @P3 STG.E.U16 [C + 2x<3>], c3;
+    };
++]
+
+// Restore beta preds: P0-P3 = preds when P5, taken from RZ otherwise
+--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
+--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
+
+01:-:-:-:6      IADD   C0.CC, C0, ldc16; // C0/C1 += ldc16, ready for the next call
+--:-:-:-:0      IADD.X C1,    C1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/hgemm_tn_128x128.sass b/Kernel/SGEMM/Pascal/hgemm_tn_128x128.sass
new file mode 100644
index 0000000..c2beee1
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_tn_128x128.sass
@@ -0,0 +1,360 @@
+# Kernel: hgemm_tn_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+[-
+# Template-time Perl: choose the input conversion opcode and the optional int16 kernel parameters.
+our $int16;
+# Opcode used to convert loaded A/B values: I2F.F32.S16 when $int16 is true, F2F.F32.F16 otherwise.
+our $convert = $int16 ? 'I2F.F32.S16' : 'F2F.F32.F16';
+# Accessor for the opcode selected above.
+sub convert_in {return $convert;}
+
+# Returns three extra constant-mapping lines (param_Stats[0], param_Stats[1], param_scale) when $int16 is true, else "".
+sub int16_params {
+    return $int16 ? q{
+param_Stats[0]  : c[0x0][0x190]
+param_Stats[1]  : c[0x0][0x194]
+param_scale     : c[0x0][0x198]
+    } : "";
+}
+-]
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+    // addr_zero is the shared-memory address the setup code writes RZ to and reads the zeroed registers back from.
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+    // Kernel parameters: consecutive 4-byte slots of constant bank 0, from 0x140 through 0x188.
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+    [+ int16_params() +]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // Register names; the numeric ranges overlap (64-95 appears several times), so several names refer to the same register numbers.
+    64-95   ~ lda, ldb, ldaz, ldbz, tid1, tid7, tid31, tid128, tid15, tidX, blk, x<1-3>, y<1-3>
+
+    0-63    : czero<00-63>
+    // The cx<x>y<y> names below cover registers 0-63, i.e. the same registers as czero<00-63>.
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-103  : loadA<0-3>, loadB<0-3>
+
+    104-107 : trackA<0-1>, trackB<0-1>
+
+    108-118 ~ writeS, k, txa, txb, tidY, ta, tb, loop
+    119-127 ~ readAs, readBs, tid, blkA, blkB, blkZ
+    // The names below reuse registers 64-118; they are not referenced in this file (presumably used by the included common code -- confirm).
+    64-75   ~ ldc, ldcz, ci, xmad_c, tid_31, tid_96, tid_128
+
+    64-79   : c<0-7>, d3, d2, d1, d0, cs<0-3>
+    64-65   : Stats<0-1>
+    80-89   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    90-118  ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags, warp_max, maxabs
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+// Setup: zero the czero registers and derive this thread's global pointers, bounds predicates and shared-memory offsets.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV loop, RZ;
+// Write RZ to shared memory at addr_zero, then read it back into czero00, czero04, ... czero60 (16 x 128-bit loads).
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+[+
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);
++]
+
+--:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      LOP.AND tid128, tid,  128;
+
+// tidX = (tid & 31) << 2
+// tidY = (tid >> 5) & 7
+01:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      SHL     tidX,   tid31, 2;
+--:-:-:-:1      BFE.U32 tidY,   tid,  0x305; // 3 bits at position 5
+
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 4;
+--:-:-:-:1      SHR.U32 ldb, ldb, 4;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// trackA = param_A + (blkA*128 + lda*tidY + tidX + ldaz*blkZ) * 2
+02:-:-:-:1      ISCADD   txa, blkA, tidX, 7;
+--:-:-:-:1      XMAD.LO2 ta,  lda,  tidY, txa;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x1;
+// P5 = txa < param_m
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// trackB = param_B + (blkB*128 + ldb*tidY + tidX + ldbz*blkZ) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 7;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x1;
+// P6 = txb < param_n
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeS = (128*tidY + tidX) * 4 + 4x<128*8*2>
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      ISCADD  writeS, writeS, 4x<128*8*2>, 2;
+
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = (((tid128 >> 4) | ((tid >> 1) & 7)) << 4) + 4x<128*8>;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+// Load this thread's A and B values from global memory; lanes whose predicate fails get zeros instead.
+<SCHEDULE_BLOCK>
+// The Perl below emits one 64-bit load per matrix when $vec is true, otherwise four predicated 16-bit loads per matrix.
+[+
+    our $vec;
+    return $vec ? q{
+
+// doLoad = tidY < k && txa|txb < n|m
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY, k, P6;
+
+--:-:2:-:1  @P2 LDG.E.CI.64 loadA, [trackA];
+--:-:3:-:1  @P3 LDG.E.CI.64 loadB, [trackB];
+
+--:-:5:-:1 @!P2 LDS.U.64 loadA, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.64 loadB, [addr_zero];
+
+    // Vec 4 and scalar loads
+    } : q{
+
+// doLoadA = tidY < k && txa < m
+// doLoadB = tidY < k && txb < n
+--:-:-:-:1      IADD x1, txa, 1;
+--:-:-:-:1      IADD x2, txa, 2;
+--:-:-:-:1      IADD x3, txa, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_m, P0;
+
+--:-:2:-:1  @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<00 + 0>];
+--:-:2:-:1  @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<00 + 1>];
+--:-:2:-:1  @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<00 + 2>];
+--:-:2:-:1  @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<00 + 3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      IADD y1, txb, 1;
+--:-:-:-:1      IADD y2, txb, 2;
+--:-:-:-:1      IADD y3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, y1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y3, param_n, P0;
+
+--:-:3:-:1  @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>];
+--:-:3:-:1  @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>];
+--:-:3:-:1  @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>];
+--:-:3:-:1  @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+    };
++]
+
+</SCHEDULE_BLOCK>
+
+[+
+    our $vec;     # selects the vector (packed .H0/.H1 sources) or scalar conversion sequence
+    our $convert; # conversion opcode chosen at the top of the file; interpolated into the text below
+    return $vec ? qq{
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 7;
+
+12:-:-:-:4      $convert loadA3, loadA1.H1;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, param_lda8;
+--:-:-:-:4      $convert loadA2, loadA1.H0;
+--:-:-:-:4      $convert loadA1, loadA0.H1;
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+--:-:2:-:2      $convert loadA0, loadA0.H0;
+
+02:-:-:-:1      STS.128 [writeS + 4x<0*128>], loadA;
+
+24:-:-:-:4      $convert loadB3, loadB1.H1;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      $convert loadB2, loadB1.H0;
+--:-:-:-:4      $convert loadB1, loadB0.H1;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+--:-:3:-:2      $convert loadB0, loadB0.H0;
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, P1;
+
+04:-:-:-:1      STS.128 [writeS + 4x<8*128>], loadB;
+
+    // scalar loads
+    } : qq{
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+02:-:-:-:4      $convert loadA0, loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, param_lda8;
+--:-:-:-:4      $convert loadA1, loadA1;
+--:-:-:-:4      $convert loadA2, loadA2;
+--:-:2:-:2      $convert loadA3, loadA3;
+
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+02:-:-:-:1      STS.128 [writeS + 4x<0*128>], loadA0;
+
+04:-:-:-:4      $convert loadB0, loadB0;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      $convert loadB1, loadB1;
+--:-:-:-:4      $convert loadB2, loadB2;
+--:-:3:-:2      $convert loadB3, loadB3;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+04:-:-:-:1      STS.128 [writeS + 4x<8*128>], loadB0;
+
+    };
++]
+// XOR the shared-memory read and write offsets with 4x<128*8*2> around the barrier (presumably switching buffers -- confirm).
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+
+[+
+    our $vec;
+    our $convert;
+    my $k_end = $vec ? 16 : 24;
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P5;\n");
+    # %insert maps "j<N>c<M>" slot names to instruction text; nothing in this file reads it (presumably the include below does -- confirm).
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P6;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        # Global loads and conversions differ between the vector ($vec true) and scalar variants.
+        ($vec ?
+            (
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI.64 loadA0, [trackA];\n",
+        j0c13 => "--:-:3:-:1  \@P3 LDG.E.CI.64 loadB0, [trackB];\n",
+
+        j5c1  => "02:-:-:-:1  \@P2 $convert loadA3, loadA1.H1;\n",
+        j5c5  => "--:-:-:-:1  \@P2 $convert loadA2, loadA1;\n",
+        j5c9  => "--:-:-:-:1  \@P2 $convert loadA1, loadA0.H1;\n",
+        j5c13 => "--:-:2:-:1  \@P2 $convert loadA0, loadA0;\n",
+
+        j6c1  => "04:-:-:-:1  \@P3 $convert loadB3, loadB1.H1;\n",
+        j6c5  => "--:-:-:-:1  \@P3 $convert loadB2, loadB1;\n",
+        j6c9  => "--:-:-:-:1  \@P3 $convert loadB1, loadB0.H1;\n",
+        j6c13 => "--:-:3:-:1  \@P3 $convert loadB0, loadB0;\n",
+            ) :
+            (
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P2 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n",
+
+        j0c29 => "--:-:3:-:1  \@P3 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n",
+        j0c31 => "--:-:3:-:1  \@P3 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n",
+        j0c33 => "--:-:3:-:1  \@P3 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n",
+        j0c35 => "--:-:3:-:1  \@P3 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n",
+
+        j5c1  => "02:-:-:-:1  \@P2 $convert loadA0, loadA0;\n",
+        j5c5  => "--:-:-:-:1  \@P2 $convert loadA1, loadA1;\n",
+        j5c9  => "--:-:-:-:1  \@P2 $convert loadA2, loadA2;\n",
+        j5c13 => "--:-:2:-:1  \@P2 $convert loadA3, loadA3;\n",
+
+        j6c1  => "04:-:-:-:1  \@P3 $convert loadB0, loadB0;\n",
+        j6c5  => "--:-:-:-:1  \@P3 $convert loadB1, loadB1;\n",
+        j6c9  => "--:-:-:-:1  \@P3 $convert loadB2, loadB2;\n",
+        j6c13 => "--:-:3:-:1  \@P3 $convert loadB3, loadB3;\n",
+            )
+        ),
+        # Entries shared by both variants: shared-memory stores, pointer advance, barrier with offset toggles, k -= 8 and the loop branches.
+        j5c31 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*128>], loadA;\n",
+
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, param_lda8;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c31 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<8*128>], loadB;\n",
+
+        j6c46 => "--:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j6c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1      IADD32I k, k, -8;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+    );
+    return;
++]
+
+<INCLUDE file="hgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Pascal/hgemm_tn_128x16.sass b/Kernel/SGEMM/Pascal/hgemm_tn_128x16.sass
new file mode 100644
index 0000000..5cd8cce
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_tn_128x16.sass
@@ -0,0 +1,554 @@
+# Kernel: hgemm_tn_128x16
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*2 + 16*8*2 + 0>
+    // addr_zero is the shared-memory address the setup code writes RZ to and reads the zeroed registers back from.
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+    // NOTE(review): param_Rand[0] and param_Rand[1] are referenced by this kernel's code but have no entry in this mapping -- confirm the intended offsets.
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+    // Register names; the numeric ranges overlap, so several names refer to the same register numbers.
+    16-17 : Rand<0-1>
+
+    18-47 ~ lda, ldb, ldaz, ldbz, lda8, ldb8, ta, tb, tid1, tid96, tidAX, tidBX, tidY, txa, txb, dimA, flag
+
+    0-15  : czero<00-15>
+    // The cx<x>y<y> names below cover registers 0-15, i.e. the same registers as czero<00-15>.
+    3, 2,11,10 : cx<0-3>y0
+    7, 6,15,14 : cx<0-3>y1
+    1, 0, 9, 8 : cx<0-3>y2
+    5, 4,13,12 : cx<0-3>y3
+
+    16-23   : j0Ay<0-3>, j0Bx<0-3>
+    24-31   : j1Ay<0-3>, j1Bx<0-3>
+    32-39   : j2Ay<0-3>, j2Bx<0-3>
+    40-47   : j3Ay<0-3>, j3Bx<0-3>
+
+    48-55   : load0A<0-7>
+    56-63   : load1A<0-7>
+    64-71   : load2A<0-7>
+    72-79   : load3A<0-7>
+
+    80-83   : load<0-3>B
+
+    84-87   : track0A<0-1>, track0B<0-1>
+    88-91   : track1A<0-1>, track1B<0-1>
+    92-95   : track2A<0-1>, track2B<0-1>
+    96-99   : track3A<0-1>, track3B<0-1>
+
+    100-104 ~ writeAs, writeBs, k, lda32, ldb32
+    105-112 ~ readAs, readBs, tid, blkA, blkB, blkZ, tbid, seed
+    // Names used by the code after the main loop; they reuse registers 16-104.
+    16-25   : c<0-3>, b<0-1>, d3, d2, d1, d0
+    26-27   : Cy<0-1>
+    28-104  ~ ldc, ldcz, ldc1, writeCs, readCs, tidCX, tidCY, cx, cy, ci, xmad_c, alpha, beta, flags, tid31, lfsr<0-2>, exp<0-3>, rand<0-3>, lfsr<0-2>_1, lfsr<0-2>_2, clk_shf1, clk_shf2
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+// Setup: zero the czero registers, fetch the optional seed, compute pointers and shared offsets, and issue the first four global loads.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k, param_k;
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+
+--:-:-:-:1      LDS.U.128 czero00, [addr_zero];
+--:-:-:-:1      LDS.U.128 czero04, [addr_zero];
+--:-:-:-:1      LDS.U.128 czero08, [addr_zero];
+--:-:-:-:1      LDS.U.128 czero12, [addr_zero];
+
+// Grab a seed for this thread
+// (blkB*gridDimA*256 + blkA*256 + tid) & (2048*32 - 1)
+--:-:-:-:1      MOV flag, param_flags;
+--:-:-:-:1      LOP.AND.NZ P4, RZ, flag, 0x1;
+--:-:-:-:1      MOV dimA, gridDimA;
+03:-:-:-:1      ISCADD tbid, blkA, tid, 8;
+04:-:-:-:1      XMAD.U16.U16 dimA, blkB, dimA, RZ;
+--:-:-:-:1      ISCADD tbid, dimA, tbid, 8;
+--:-:-:-:1      LOP.AND seed, tbid, 1x<2048*32 - 1>;
+--:-:-:-:1      LEA      Rand0.CC, seed, param_Rand[0],     0x2;
+--:-:-:-:1      LEA.HI.X Rand1,    seed, param_Rand[1], RZ, 0x2;
+--:-:-:-:1  @P4 LDG.E.CS seed, [Rand];
+// NOTE(review): param_Rand is used above but is not declared in this file's constant mapping -- confirm.
+// tidBX =  tid & 15
+// tidAX = (tid & 15) << 3
+// tidY = (tid >> 4) & 7
+01:-:-:-:1      LOP.AND tidBX, tid,   15;
+--:-:-:-:1      SHL     tidAX, tidBX, 3;
+--:-:-:-:1      BFE.U32 tidY,  tid,   0x304; // 3 bits at position 4
+
+--:-:-:-:1      MOV lda8,   param_lda8;
+--:-:-:-:1      MOV ldb8,   param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda8, 4;
+--:-:-:-:1      SHR.U32 ldb, ldb8, 4;
+--:-:-:-:1      SHL lda32, lda8, 2;
+--:-:-:-:1      SHL ldb32, ldb8, 2;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+
+// track0A = param_A + (blkA*128 + lda*tidY + tidAX + ldaz*blkZ) * 2
+02:-:-:-:1      ISCADD   txa, blkA, tidAX,  7;
+--:-:-:-:1      XMAD.LO2 ta,  lda,  tidY, txa;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     0x1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 0x1;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// track0B = param_B + (blkB*16 + ldb*tidY + tidBX + ldbz*blkZ) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 4;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      track0B0.CC, tb, param_B[0],     0x1;
+--:-:-:-:1      LEA.HI.X track0B1,    tb, param_B[1], RZ, 0x1;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeAs = (128*tidY + tidAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidY, tidAX, 7;
+--:-:-:-:1      SHL    writeAs, writeAs, 2;
+
+// writeBs = (16*tidY + tidBX) * 4 + 4x<128*8>
+--:-:-:-:1      ISCADD writeBs, tidY, tidBX, 4;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<128*8>, 2;
+
+// Start the read buffers low
+// readAs = (((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4
+--:-:-:-:1      LOP.AND tid96,  tid,    96;
+--:-:-:-:1      SHR.U32 tid96,  tid96,  2;
+--:-:-:-:1      BFE.U32 readAs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readAs, readAs, tid96;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs  = ((((tid & 0x10) >> 3) | (tid & 1)) << 4) + 4x<128*8>;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readBs, tid,    0x10;
+--:-:-:-:1      SHR.U32 readBs, readBs, 3;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid1;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+// track1..track3 pointers: each is the previous one advanced by lda8 (A) or ldb8 (B).
+--:-:-:-:1      IADD   track1A0.CC, track0A0, lda8;
+--:-:-:-:1      IADD.X track1A1,    track0A1, RZ;
+--:-:-:-:1      IADD   track1B0.CC, track0B0, ldb8;
+--:-:-:-:1      IADD.X track1B1,    track0B1, RZ;
+
+--:-:-:-:1      IADD   track2A0.CC, track1A0, lda8;
+--:-:-:-:1      IADD.X track2A1,    track1A1, RZ;
+--:-:-:-:1      IADD   track2B0.CC, track1B0, ldb8;
+--:-:-:-:1      IADD.X track2B1,    track1B1, RZ;
+
+--:-:-:-:1      IADD   track3A0.CC, track2A0, lda8;
+--:-:-:-:1      IADD.X track3A1,    track2A1, RZ;
+--:-:-:-:1      IADD   track3B0.CC, track2B0, ldb8;
+--:-:-:-:1      IADD.X track3B1,    track2B1, RZ;
+
+<ORDERED>
+--:-:3:-:1  @P5 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P6 LDG.E.CI.S16 load0B, [track0B];
+
+--:-:4:-:1  @P5 LDG.E.CI.128 load1A, [track1A];
+--:-:4:-:1  @P6 LDG.E.CI.S16 load1B, [track1B];
+
+--:-:5:-:1  @P5 LDG.E.CI.128 load2A, [track2A];
+--:-:5:-:1  @P6 LDG.E.CI.S16 load2B, [track2B];
+
+--:-:6:-:1  @P5 LDG.E.CI.128 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.CI.S16 load3B, [track3B];
+</ORDERED>
+// P0 = k >= 32; P3 = k > 32 && P5; P4 = k > 32 && P6; then k -= 32.
+--:-:-:-:1      ISETP.GE.AND P0, PT, k, 32, PT;
+--:-:-:-:1      ISETP.GT.AND P3, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GT.AND P4, PT, k, 32, P6;
+--:-:-:-:1      IADD k, k, -32;
+</SCHEDULE_BLOCK>
+
+04:-:-:-:4      F2F.F32.F16 load0A7, load0A3.H1;
+--:-:-:-:4      F2F.F32.F16 load0A6, load0A3.H0;
+--:-:-:-:0      IADD   track0A0.CC, track0A0, lda32;
+--:-:-:-:4      F2F.F32.F16 load0A5, load0A2.H1;
+--:-:1:-:4      F2F.F32.F16 load0A4, load0A2.H0;
+--:-:-:-:0      IADD.X track0A1, track0A1, RZ;
+--:-:-:-:4      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:4      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:0      IADD   track0B0.CC, track0B0, ldb32;
+--:-:-:-:4      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:2:-:4      F2F.F32.F16 load0A0, load0A0.H0;
+--:-:-:-:0      IADD.X track0B1, track0B1, RZ;
+--:-:3:-:1      F2F.F32.F16 load0B, load0B;
+// Store the converted set-0 values to shared memory: A as two 128-bit halves, B as one word.
+01:-:-:-:1      STS.128 [writeAs + 4x<0*(128*8 + 16*8) + 4>], load0A4;
+02:-:-:-:1      STS.128 [writeAs + 4x<0*(128*8 + 16*8) + 0>], load0A0;
+04:-:-:-:1      STS     [writeBs + 4x<0*(128*8 + 16*8) + 0>], load0B;
+
+--:-:-:-:5      BAR.SYNC 0;
+// Read shared lines 0 and 1 into the j0/j1 registers and, when P3/P4 are set, start the next global load into set 0.
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 0*(128*8 + 16*8)>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*16  + 0*(128*8 + 16*8)>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 0*(128*8 + 16*8)>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*16  + 0*(128*8 + 16*8)>];
+--:-:3:-:1  @P3 LDG.E.CI.128 load0A, [track0A];
+--:-:3:-:1  @P4 LDG.E.CI.S16 load0B, [track0B];
+
+LOOP:
+
+<CODE>
+    # Generates the unrolled main loop: 4 (k) x 8 (j) x 16 (c) FFMA lines, with extra instructions from %insert interleaved.
+    our @top;
+    our %insert;
+    # @cOrder: the 16 [x, y] index pairs, in the order the FFMAs are emitted within each j step.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    # The output starts with whatever text is held in @top.
+    my $out = join '', @top;
+
+
+    foreach my $k (0 .. 3)
+    {
+        my $shareBuf = ($k + 1) & 1;
+        my $store   = ($k + 1) & 3;
+        my $loadBar = $store + 3;
+        my $storBar = sprintf '%02x', 1 << ($store + 2);
+        # Instructions to interleave during this k step, keyed by "j<j>c<c>"; each is emitted right after that FFMA.
+        %insert =
+        (
+            j0c11 => "$storBar:-:-:-:1  \@P0 F2F.F32.F16 load${store}A7, load${store}A3.H1;\n",
+            j0c15 => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A6, load${store}A3.H0;\n",
+            j1c3  => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A5, load${store}A2.H1;\n",
+            j1c7  => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A4, load${store}A2.H0;\n",
+            j1c11 => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A3, load${store}A1.H1;\n",
+            j1c15 => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A2, load${store}A1.H0;\n",
+            j2c3  => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A1, load${store}A0.H1;\n",
+            j2c7  => "--:-:-:-:1  \@P0 F2F.F32.F16 load${store}A0, load${store}A0.H0;\n",
+            j2c11 => "--:-:$loadBar:-:1  \@P0 F2F.F32.F16 load${store}B, load${store}B;\n",
+
+            j2c12 => "--:-:-:-:1  \@P0 IADD   track${store}A0.CC, track${store}A0, lda32;\n",
+            j3c1  => "--:-:-:-:1  \@P0 IADD.X track${store}A1,    track${store}A1, RZ;\n",
+            j3c3  => "--:-:-:-:1  \@P0 IADD   track${store}B0.CC, track${store}B0, ldb32;\n",
+            j3c8  => "--:-:-:-:1  \@P0 IADD.X track${store}B1,    track${store}B1, RZ;\n",
+
+            j3c9  => "$storBar:-:-:-:1  \@P0 STS.128 [writeAs + 4x<$shareBuf*(128*8 + 16*8) + 0>], load${store}A0;\n",
+            j4c4 => "--:-:-:-:1  \@P0 STS.128 [writeAs + 4x<$shareBuf*(128*8 + 16*8) + 4>], load${store}A4;\n",
+            j4c6 => "--:-:-:-:1  \@P0 STS     [writeBs + 4x<$shareBuf*(128*8 + 16*8) + 0>], load${store}B;\n",
+
+            j5c15 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n",
+
+            j6c1  => "--:-:$loadBar:-:1  \@P3 LDG.E.CI.128 load${store}A, [track${store}A];\n",
+            j6c3  => "--:-:$loadBar:-:1  \@P4 LDG.E.CI.S16 load${store}B, [track${store}B];\n",
+            # Loop control (predicate updates, k -= 32, branch back to LOOP) is added only on the last k step.
+            ($k == 3 ?
+                (
+                j0c4  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 32, PT;\n",
+                j0c6  => "--:-:-:-:1      ISETP.GT.AND P3, PT, k, 32, P5;\n",
+                j0c8  => "--:-:-:-:1      ISETP.GT.AND P4, PT, k, 32, P6;\n",
+                j0c10 => "--:-:-:-:1      IADD k, k, -32;\n",
+
+                j7c15 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+                ) : ()
+            ),
+        );
+
+        foreach my $j (0 .. 7)
+        {
+            my $rsPred    = $j >= 6 && $k == 3 ? '@P0' : '   ';
+            my $barrier   = $j & 1 ? 2 : 1;
+            my $loadReg   = ($j + 2) & 3;
+            my $compute   = $j & 3;
+            my $shareLine = ($j + 2) & 7;
+            $shareBuf     = $j >= 6 ? ($k + 1) & 1 : $k & 1;
+            # Shared-memory loads into the j<loadReg> registers, i.e. the operands computed on two j steps later.
+            $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + %d*(128*8 + 16*8)>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shareBuf;
+            $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*16  + %d*(128*8 + 16*8)>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shareBuf;
+
+            foreach my $c (0 .. 15)
+            {
+                my ($x,$y) = @{$cOrder[$c]};
+
+                my $ins    = $insert{"j${j}c$c"} || '';
+                # The first FFMA of each j step waits on the barrier set by the loads for this step.
+                my $wait   = $c == 0 ? "0$barrier" : '--';
+                # Stall count is 0 when the first line of the inserted text matches one of these opcodes, otherwise 1.
+                my $stall  = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+
+                my $yield  = $c == 8 && $stall ? 'Y' : '-';
+
+                my $ctrl   = "$wait:-:-:$yield:$stall";
+
+                $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+            }
+        }
+        $out .= "\n";
+    }
+    return $out;
+
+</CODE>
+
+//<INCLUDE file="hgemm_common_128x16.sass"/>
+
+<SCHEDULE_BLOCK>
+// Epilogue setup: load alpha/beta/flags, compute shared staging offsets and the output pointer, set predicates, seed the RNG state.
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 16 + readBs;
+--:-:-:-:1      LOP.AND readAs, readAs, 0x1ff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0x1ff;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 2;
+
+// tidCX = (tid & 3) << 2
+// tidCY = tid >> 2
+--:-:-:-:1      LOP.AND tid31, tid,   31;
+--:-:-:-:1      LOP.AND tidCX, tid,   3;
+--:-:-:-:1      SHL     tidCX, tidCX, 2;
+--:-:-:-:1      SHR.U32 tidCY, tid,   2;
+
+// readCs = (tidCY*16 + tidCX)   << 2;
+--:-:-:-:1      ISCADD readCs, tidCY, tidCX, 4;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*16 + tidCX;
+--:-:-:-:1      ISCADD cx, blkB, tidCX, 4;
+
+// cy = blkA*128 + tidCY*4
+--:-:-:-:1      SHL     cy, tidCY, 2;
+--:-:-:-:1      ISCADD  cy, blkA,  cy, 7;
+
+// Cy = param_C + (cy*ldc + cx + ldcz*blkZ) * 2;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, cy, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ,  ci;
+--:-:-:-:1      LEA      Cy0.CC, ci, param_C[0],     1;
+--:-:-:-:0      LEA.HI.X Cy1,    ci, param_C[1], RZ, 1;
+
+// cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+
+// beta != 0
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+// Random Round flag
+--:-:-:-:2      LOP.AND.NZ P4, RZ, flags, 1;
+
+// Apply relu
+--:-:-:-:1      LOP.AND.NZ P3, RZ, flags, 2;
+
+--:-:-:-:1      SHL ldc1, ldc, 1;
+
+// Seed the Tausworthe
+--:-:-:-:1      LOP.XOR lfsr0, seed, tbid;
+--:-:-:-:1      CS2R lfsr1, SR_CLOCKLO;
+--:-:-:-:1      CS2R lfsr2, SR_GLOBALTIMERLO;
+--:-:-:-:1      LOP.AND clk_shf1, lfsr1, 31;
+--:-:-:-:1      LOP.AND clk_shf2, lfsr2, 31;
+--:-:-:-:1      LOP.XOR clk_shf1, clk_shf1, tid31;
+--:-:-:-:1      LOP.XOR clk_shf2, clk_shf2, tid31;
+--:-:-:-:1      SHF.R.U64 lfsr1, lfsr1, clk_shf1, lfsr1;
+--:-:-:-:1      SHF.R.U64 lfsr2, lfsr2, clk_shf2, lfsr2;
+--:-:-:-:1      LOP.AND tbid, tbid, 1x<2048*32 - 1>;
+
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+    # For each y in 0..3: scale cx0y<y>..cx3y<y> by alpha into c0..c3, then call STORE_C.
+    my $out;
+    foreach my $y (0..3)
+    {
+        $out .= sprintf(
+            "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n" .
+            "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n" .
+            "--:-:-:-:0      FMUL c3, cx3y%d, alpha;\n",
+            ($y) x 4);
+
+        $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $out;
+
+</CODE>
+// When P4 is set, write the LOP3.LUT 0x96 combination of lfsr0, lfsr1, lfsr2 back to Rand (indexed by tbid), then exit.
+--:-:-:-:6      LEA      Rand0.CC, tbid, param_Rand[0],     0x2;
+--:-:-:-:1      LEA.HI.X Rand1,    tbid, param_Rand[1], RZ, 0x2;
+--:-:-:-:2      LOP3.LUT seed, lfsr0, lfsr1, lfsr2, 0x96;
+--:-:-:-:1  @P4 STG.E.CS [Rand], seed;
+
+--:-:-:-:5      EXIT;
+
+
+STORE_C:
+// Subroutine: optional relu (P3), exchange c0-c3 through shared memory, add beta*C (P1), convert to fp16 (random rounding when P4), store (P0), advance Cy by ldc1.
+--:-:-:-:2      ISETP.LT.AND P1, PT, cy, param_m, P5;
+--:-:-:Y:b      ISETP.LT.AND P0, PT, cy, param_m, P6;
+--:-:-:-:0      IADD cy, cy, 1;
+
+--:-:1:-:1  @P1 LDG.E.64 b0, [Cy];
+
+// Apply relu
+--:-:-:-:1  @P3 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P3 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P3 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:4  @P3 FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:5:-:1      LDS.U.128 c0, [readCs];
+
+01:-:1:-:4  @P1 F2F.F32.F16 d3, b1.H1;
+--:-:2:-:4  @P1 F2F.F32.F16 d2, b1.H0;
+--:-:3:-:4  @P1 F2F.F32.F16 d1, b0.H1;
+--:-:4:-:1  @P1 F2F.F32.F16 d0, b0.H0;
+
+11:-:-:-:1  @P1 FFMA c3, d3, beta, c3;
+02:-:-:-:1  @P1 FFMA c2, d2, beta, c2;
+04:-:-:-:1  @P1 FFMA c1, d1, beta, c1;
+08:-:-:-:0  @P1 FFMA c0, d0, beta, c0;
+
+--:-:-:-:5  @P4 BRA.U DO_RANDOM1;
+
+--:-:1:-:4      F2F.F16.F32 c0, c0;
+--:-:2:-:4      F2F.F16.F32 c1, c1;
+--:-:3:-:4      F2F.F16.F32 c2, c2;
+--:-:4:-:1      F2F.F16.F32 c3, c3;
+
+--:-:-:-:5      BRA.U END_ROUND1;
+
+DO_RANDOM1:
+
+--:-:-:-:5      CAL RANDOM_ROUND;
+
+END_ROUND1:
+
+// Pack 2 16 bit values into 32 bit words
+03:-:-:-:2      BFI c0, c1, 0x1010, c0;
+0c:-:-:-:2      BFI c1, c3, 0x1010, c2;
+
+--:1:-:-:2  @P0 STG.E.64 [Cy], c0;
+
+01:-:-:-:6      IADD   Cy0.CC, Cy0, ldc1;
+--:-:-:-:0      IADD.X Cy1,    Cy1, RZ;
+
+--:-:-:-:5      RET;
+
+RANDOM_ROUND:
+// Subroutine: advance the three lfsr states, add a scaled random value to each of c0-c3, then convert to fp16 with the .RZ rounding modifier.
+<SCHEDULE_BLOCK>
+
+// Strip mantissa and leave sign+exponent
+--:-:-:-:1      LOP32I.AND exp0, c0, 0xff800000;
+--:-:-:-:1      LOP32I.AND exp1, c1, 0xff800000;
+--:-:-:-:1      LOP32I.AND exp2, c2, 0xff800000;
+--:-:-:-:1      LOP32I.AND exp3, c3, 0xff800000;
+
+// Find the exponent that will shift 32 bits of integer data
+// out past the lsb of this number as an fp16
+// exp *= 2^-10 * 2^-32  (2^-42)
+--:-:-:-:1      FMUL32I exp0, exp0, 0x2a800000;
+--:-:-:-:1      FMUL32I exp1, exp1, 0x2a800000;
+--:-:-:-:1      FMUL32I exp2, exp2, 0x2a800000;
+--:-:-:-:1      FMUL32I exp3, exp3, 0x2a800000;
+
+// lfsr0 = ((lfsr0 & 0xfffffffe) << 12) ^ (((lfsr0 << 13) ^ lfsr0) >> 19);
+--:-:-:-:1      LOP32I.AND lfsr0_1, lfsr0, 0xfffffffe;
+--:-:-:-:1      SHL lfsr0_1, lfsr0_1, 12;
+--:-:-:-:1      SHL lfsr0_2, lfsr0, 13;
+--:-:-:-:1      LOP.XOR lfsr0_2, lfsr0_2, lfsr0;
+--:-:-:-:1      SHR.U32 lfsr0_2, lfsr0_2, 19;
+--:-:-:-:1      LOP.XOR lfsr0, lfsr0_1, lfsr0_2;
+
+// lfsr1 = ((lfsr1 & 0xfffffff8) <<  4) ^ (((lfsr1 << 2)  ^ lfsr1) >> 25);
+--:-:-:-:1      LOP32I.AND lfsr1_1, lfsr1, 0xfffffff8;
+--:-:-:-:1      SHL lfsr1_1, lfsr1_1, 4;
+--:-:-:-:1      SHL lfsr1_2, lfsr1, 2;
+--:-:-:-:1      LOP.XOR lfsr1_2, lfsr1_2, lfsr1;
+--:-:-:-:1      SHR.U32 lfsr1_2, lfsr1_2, 25;
+--:-:-:-:1      LOP.XOR lfsr1, lfsr1_1, lfsr1_2;
+
+// lfsr2 = ((lfsr2 & 0xfffffff0) << 11) ^ (((lfsr2 << 3)  ^ lfsr2) >> 11);
+--:-:-:-:1      LOP32I.AND lfsr2_1, lfsr2, 0xfffffff0;
+--:-:-:-:1      SHL lfsr2_1, lfsr2_1, 11;
+--:-:-:-:1      SHL lfsr2_2, lfsr2, 3;
+--:-:-:-:1      LOP.XOR lfsr2_2, lfsr2_2, lfsr2;
+--:-:-:-:1      SHR.U32 lfsr2_2, lfsr2_2, 11;
+--:-:-:-:1      LOP.XOR lfsr2, lfsr2_1, lfsr2_2;
+
+// rand = lfsr0 ^ lfsr1 ^ lfsr2;
+// generate 3 other rotations of this rand
+--:-:-:-:1      LOP3.LUT  rand0, lfsr0, lfsr1, lfsr2, 0x96;
+--:-:-:-:1      SHF.R.U64 rand1, rand0,  8, rand0;
+--:-:-:-:1      SHF.R.U64 rand2, rand0, 16, rand0;
+--:-:-:-:0      SHF.R.U64 rand3, rand0, 24, rand0;
+//--:-:-:-:1      MOV32I rand0, 0x80000000;
+//--:-:-:-:1      MOV32I rand1, 0x80000000;
+//--:-:-:-:1      MOV32I rand2, 0x80000000;
+//--:-:-:-:1      MOV32I rand3, 0x80000000;
+</SCHEDULE_BLOCK>
+
+// Convert rand to float
+--:-:1:-:4      I2F.F32.U32.RZ rand0, rand0;
+--:-:2:-:4      I2F.F32.U32.RZ rand1, rand1;
+--:-:3:-:4      I2F.F32.U32.RZ rand2, rand2;
+--:-:4:-:1      I2F.F32.U32.RZ rand3, rand3;
+
+// Scale the random number so msb is one below lsb of fp16
+// Add scaled random to number to round
+01:-:-:-:1      FFMA.RZ c0, rand0, exp0, c0;
+02:-:-:-:1      FFMA.RZ c1, rand1, exp1, c1;
+04:-:-:-:1      FFMA.RZ c2, rand2, exp2, c2;
+08:-:-:-:0      FFMA.RZ c3, rand3, exp3, c3;
+
+// Truncate number to fp16
+--:-:1:-:4      F2F.F16.F32.RZ c0, c0;
+--:-:2:-:4      F2F.F16.F32.RZ c1, c1;
+--:-:3:-:4      F2F.F16.F32.RZ c2, c2;
+--:-:4:-:1      F2F.F16.F32.RZ c3, c3;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/hgemm_tn_128x32.sass b/Kernel/SGEMM/Pascal/hgemm_tn_128x32.sass
new file mode 100644
index 0000000..239d5d3
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_tn_128x32.sass
@@ -0,0 +1,553 @@
+# Kernel: hgemm_tn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<128*16*2 + 32*16*2>
+    szShareA  : 128*16
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k; // k is decremented by 16 in the main loop (insert j0c6)
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 4; // lda = param_lda8 >> 4
+--:-:-:-:1      SHR.U32 ldb, ldb, 4; // ldb = param_ldb8 >> 4
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL lda16, lda, 5; // lda * 32: byte step added to the A track pointers
+--:-:-:-:1      SHL ldb16, ldb, 5; // ldb * 32: byte step added to the B track pointer
+--:-:-:-:1      SHL lda4,  lda, 2; // lda * 4: element offset between the four A row offsets
+
+--:-:-:-:1      STS.128 [addr_zero], RZ; // store RZ (128 bits) at addr_zero, read back below to clear the accumulators
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidAX = (tid & 31) << 2
+// tidAY = (tid >> 5)
+01:-:-:-:1      LOP.AND tidAX, tid,   31;
+--:-:-:-:1      SHL     tidAX, tidAX, 2;
+--:-:-:-:1      SHR.U32 tidAY, tid,   5;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// trackA += (blkA*128 + tidAX + lda*tidAY + ldaz*blkZ) * 2
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO2 ta0, lda,  tidAY, txa;
+08:-:-:-:1      XMAD.LO2 ta0, ldaz, blkZ,  ta0;
+--:-:-:-:1      IADD     ta1, ta0, lda4;
+--:-:-:-:1      IADD     ta2, ta1, lda4;
+--:-:-:-:1      IADD     ta3, ta2, lda4;
+
+--:-:-:-:1      LEA      track0A0.CC, ta0, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track0A1,    ta0, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track1A0.CC, ta1, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track1A1,    ta1, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track2A0.CC, ta2, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track2A1,    ta2, param_A[1], RZ, 1;
+--:-:-:-:1      LEA      track3A0.CC, ta3, param_A[0],     1;
+--:-:-:-:1      LEA.HI.X track3A1,    ta3, param_A[1], RZ, 1;
+
+// trackB += (blkB*32 + ldb*tidBY + tidBX + ldbz*blkZ) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidBX,  5;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 1;
+
+// writeAs = (tidAY*128 + tidAX) * 4 + (szShareA + szShareB) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4 + (szShareA*2 + szShareB) * 4
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + szShareA * 4;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      IADD tidAY1, tidAY, 4;
+--:-:-:-:1      IADD tidAY2, tidAY, 8;
+--:-:-:-:1      IADD tidAY3, tidAY, 12;
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY,  k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.64 load0A, [track0A];
+--:-:2:-:1  @P1 LDG.E.CI.64 load1A, [track1A];
+--:-:3:-:1  @P2 LDG.E.CI.64 load2A, [track2A];
+--:-:4:-:1  @P3 LDG.E.CI.64 load3A, [track3A];
+--:-:5:-:1  @P4 LDG.E.CI.64 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P0 LDS.U.64 load0A, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.64 load1A, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.64 load2A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.64 load3A, [addr_zero];
+--:-:6:-:2 @!P4 LDS.U.64 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD txa1,  txa,  1;
+--:-:-:-:1      IADD txa2,  txa,  2;
+--:-:-:-:1      IADD txa3,  txa,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:1:-:1  @P1 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:1:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:1:-:1  @P3 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY1, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:2:-:1  @P1 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:2:-:1  @P2 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:2:-:1  @P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY2, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P6;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];
+--:-:3:-:1  @P1 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];
+--:-:3:-:1  @P2 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];
+--:-:3:-:1  @P3 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY3, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];
+--:-:4:-:1  @P1 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];
+--:-:4:-:1  @P2 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];
+--:-:4:-:1  @P3 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];
+--:-:5:-:1  @P1 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];
+--:-:5:-:1  @P2 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];
+--:-:5:-:1  @P3 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 15;
+
+</SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+21:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
+--:-:1:-:1      F2F.F32.F16 load0A0, load0A0.H0;
+
+02:-:-:-:1      F2F.F32.F16 load1A3, load1A1.H1;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A1.H0;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A0.H1;
+--:-:2:-:1      F2F.F32.F16 load1A0, load1A0.H0;
+
+04:-:-:-:1      F2F.F32.F16 load2A3, load2A1.H1;
+--:-:-:-:1      F2F.F32.F16 load2A2, load2A1.H0;
+--:-:-:-:1      F2F.F32.F16 load2A1, load2A0.H1;
+--:-:3:-:1      F2F.F32.F16 load2A0, load2A0.H0;
+
+08:-:-:-:1      F2F.F32.F16 load3A3, load3A1.H1;
+--:-:-:-:1      F2F.F32.F16 load3A2, load3A1.H0;
+--:-:-:-:1      F2F.F32.F16 load3A1, load3A0.H1;
+--:-:4:-:1      F2F.F32.F16 load3A0, load3A0.H0;
+
+10:-:-:-:1      F2F.F32.F16 loadB3, loadB1.H1;
+--:-:-:-:1      F2F.F32.F16 loadB2, loadB1.H0;
+--:-:-:-:1      F2F.F32.F16 loadB1, loadB0.H1;
+--:-:5:-:1      F2F.F32.F16 loadB0, loadB0.H0;
+    } : q{
+21:-:-:-:1      F2F.F32.F16 load0A0, load0A0;
+--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
+--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
+--:-:1:-:1      F2F.F32.F16 load0A3, load0A3;
+
+02:-:-:-:1      F2F.F32.F16 load1A0, load1A0;
+--:-:-:-:1      F2F.F32.F16 load1A1, load1A1;
+--:-:-:-:1      F2F.F32.F16 load1A2, load1A2;
+--:-:2:-:1      F2F.F32.F16 load1A3, load1A3;
+
+04:-:-:-:1      F2F.F32.F16 load2A0, load2A0;
+--:-:-:-:1      F2F.F32.F16 load2A1, load2A1;
+--:-:-:-:1      F2F.F32.F16 load2A2, load2A2;
+--:-:3:-:1      F2F.F32.F16 load2A3, load2A3;
+
+08:-:-:-:1      F2F.F32.F16 load3A0, load3A0;
+--:-:-:-:1      F2F.F32.F16 load3A1, load3A1;
+--:-:-:-:1      F2F.F32.F16 load3A2, load3A2;
+--:-:4:-:1      F2F.F32.F16 load3A3, load3A3;
+
+10:-:-:-:1      F2F.F32.F16 loadB0, loadB0;
+--:-:-:-:1      F2F.F32.F16 loadB1, loadB1;
+--:-:-:-:1      F2F.F32.F16 loadB2, loadB2;
+--:-:5:-:1      F2F.F32.F16 loadB3, loadB3;
+    };
+</CODE>
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 16, P1;
+
+01:-:-:-:1      STS.128 [writeAs + 4x<0*128>], load0A;
+--:-:-:-:6      IADD   track0A0.CC, track0A0, lda16;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+02:-:-:-:1      STS.128 [writeAs + 4x<4*128>], load1A;
+--:-:-:-:6      IADD   track1A0.CC, track1A0, lda16;
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS.128 [writeAs + 4x<8*128>], load2A;
+--:-:-:-:6      IADD   track2A0.CC, track2A0, lda16;
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS.128 [writeAs + 4x<12*128>], load3A;
+--:-:-:-:6      IADD   track3A0.CC, track3A0, lda16;
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+10:-:-:-:1      STS.128 [writeBs], loadB;
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD.X trackB1,    trackB1, RZ;
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:3:-:1  @P5 LDG.E.CI.64 load0A, [track0A];
+--:-:4:-:1  @P5 LDG.E.CI.64 load1A, [track1A];
+--:-:5:-:1  @P5 LDG.E.CI.64 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.CI.64 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.CI.64 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P5 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
+--:-:3:-:1  @P5 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
+--:-:3:-:1  @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
+--:-:3:-:1  @P5 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
+
+--:-:4:-:1  @P5 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];
+--:-:4:-:1  @P5 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];
+--:-:4:-:1  @P5 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];
+--:-:4:-:1  @P5 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];
+--:-:5:-:1  @P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];
+--:-:6:-:1  @P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec;
+    our $shiftAX = 0;
+    our $shiftBX = 0;
+    our %insert =
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        j3c6   => "04:3:-:-:1  \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n",
+        j5c6   => "08:4:-:-:1  \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n",
+        j7c6   => "10:-:-:-:1  \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n",
+        j9c6   => "10:5:-:-:1  \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n",
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",
+
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, lda16;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, lda16;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P5 IADD   track2A0.CC, track2A0, lda16;\n",
+        j7c13  => "--:-:-:-:1  \@P5 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, lda16;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.64 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.64 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.64 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.64 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.64 loadB,  [trackB];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0A3, load0A1.H1;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A2, load0A1.H0;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A1, load0A0.H1;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0A0, load0A0.H0;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1A3, load1A1.H1;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A2, load1A1.H0;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A1, load1A0.H1;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1A0, load1A0.H0;\n",
+
+                j6c13  => "10:-:-:-:1  \@P5 F2F.F32.F16 load2A3, load2A1.H1;\n",
+                j6c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2A2, load2A1.H0;\n",
+                j6c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2A1, load2A0.H1;\n",
+                j6c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load2A0, load2A0.H0;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A3, load3A1.H1;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A2, load3A1.H0;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A1, load3A0.H1;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3A0, load3A0.H0;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n",
+                j10c17 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n",
+            ) :
+            (
+
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI.U16 load1A0, [track1A + 2x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A1, [track1A + 2x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI.U16 load1A2, [track1A + 2x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI.U16 load1A3, [track1A + 2x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.U16 load2A0, [track2A + 2x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2A1, [track2A + 2x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2A2, [track2A + 2x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2A3, [track2A + 2x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A0, [track3A + 2x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A1, [track3A + 2x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3A2, [track3A + 2x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI.U16 load3A3, [track3A + 2x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI.U16 loadB0, [trackB + 2x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadB1, [trackB + 2x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI.U16 loadB2, [trackB + 2x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI.U16 loadB3, [trackB + 2x<3>];\n",
+
+                j2c13  => "04:-:-:-:1  \@P2 F2F.F32.F16 load0A0, load0A0;\n",
+                j2c17  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A1, load0A1;\n",
+                j2c21  => "--:-:-:-:1  \@P2 F2F.F32.F16 load0A2, load0A2;\n",
+                j2c25  => "--:-:3:-:1  \@P2 F2F.F32.F16 load0A3, load0A3;\n",
+
+                j4c13  => "08:-:-:-:1  \@P3 F2F.F32.F16 load1A0, load1A0;\n",
+                j4c17  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A1, load1A1;\n",
+                j4c21  => "--:-:-:-:1  \@P3 F2F.F32.F16 load1A2, load1A2;\n",
+                j4c25  => "--:-:4:-:1  \@P3 F2F.F32.F16 load1A3, load1A3;\n",
+
+                j6c13  => "10:-:-:-:1  \@P5 F2F.F32.F16 load2A0, load2A0;\n",
+                j6c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2A1, load2A1;\n",
+                j6c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load2A2, load2A2;\n",
+                j6c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load2A3, load2A3;\n",
+
+                j8c13  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A0, load3A0;\n",
+                j8c17  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A1, load3A1;\n",
+                j8c21  => "--:-:-:-:1  \@P5 F2F.F32.F16 load3A2, load3A2;\n",
+                j8c25  => "--:-:5:-:1  \@P5 F2F.F32.F16 load3A3, load3A3;\n",
+
+                j10c13 => "20:-:-:-:1  \@P6 F2F.F32.F16 loadB3, loadB3;\n",
+                j10c17 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadB2, loadB2;\n",
+                j10c21 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB1;\n",
+                j10c25 => "--:-:6:-:1  \@P6 F2F.F32.F16 loadB0, loadB0;\n",
+            )
+        ),
+
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="hgemm_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Pascal/hgemm_tn_128x64.sass b/Kernel/SGEMM/Pascal/hgemm_tn_128x64.sass
new file mode 100644
index 0000000..0404ab5
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/hgemm_tn_128x64.sass
@@ -0,0 +1,389 @@
+# Kernel: hgemm_tn_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*2 + 64*8*2 + 0>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-95   ~ tid, blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, ta, tb, tid1, tid15, tidX, x<1-3|65-67>, y<1-3>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+     96-107 : loadA<0-7>,  loadB<0-3>
+    108-111 : trackA<0-1>, trackB<0-1>
+
+    112-122 ~ writeAs, writeBs, k, txa00, txa64, txb, tidY, swapBuf
+    123-127 : readAs, readBs
+
+    64-83   ~ ldc, ldcz, ci, xmad_c, threadId, tid31, tid96, blockA, blockB, blockZ
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C04y<0-1>, C08y<0-1>, C12y<0-1>, C00y<0-1>
+    86-107  ~ ldc1, ldc4, ldc60, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;
+</CODE>
+
+// tidX = (tid & 15) << 2
+// tidY = (tid >> 4) & 7
+01:-:-:-:1      LOP.AND tid15, tid,  15;
+--:-:-:-:1      SHL     tidX, tid15, 2;
+--:-:-:-:1      BFE.U32 tidY,  tid,  0x304; // 3 bits at position 4
+
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 4;
+--:-:-:-:1      SHR.U32 ldb, ldb, 4;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// trackA += (blkA*128 + lda*tidY + tidX) * 2
+02:-:-:-:1      ISCADD   txa00, blkA, tidX, 7;
+--:-:-:-:1      XMAD.LO2 ta,  lda,  tidY, txa00;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x1;
+--:-:-:-:1      IADD txa64, txa00, 64;
+
+// trackB += (blkB*64 + ldb*tidY + tidX) * 2
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 6;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x1;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x1;
+
+// Start the write buffers high
+// writeAs = (128*tidY + tidX) * 4
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2;
+// writeBs = (64*tidY + tidX) * 4
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2;
+
+// Start the read buffers low
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<64*8 + 128*8>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+// doLoad = tidY < k && txa00|txa64|txb < m|m|n
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidY, k, P6;
+
+<ORDERED>
+--:-:2:-:1  @P4 LDG.E.CI.64 loadA0, [trackA + 2x<00>];
+--:-:3:-:1  @P5 LDG.E.CI.64 loadA4, [trackA + 2x<64>];
+--:-:4:-:1  @P6 LDG.E.CI.64 loadB0, [trackB];
+
+--:-:5:-:1 @!P4 LDS.U.64 loadA0, [addr_zero];
+--:-:5:-:1 @!P5 LDS.U.64 loadA4, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.64 loadB0, [addr_zero];
+</ORDERED>
+
+    } : q{
+// doLoadA = tidY < k && txa00 < m
+// doLoadB = tidY < k && txb < n
+--:-:-:-:1      IADD x1,  txa00, 1;
+--:-:-:-:1      IADD x2,  txa00, 2;
+--:-:-:-:1      IADD x3,  txa00, 3;
+--:-:-:-:1      IADD x65, txa64, 1;
+--:-:-:-:1      IADD x66, txa64, 2;
+--:-:-:-:1      IADD x67, txa64, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_m, P0;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI.S16 loadA0, [trackA + 2x<00 + 0>];
+--:-:2:-:1  @P1 LDG.E.CI.S16 loadA1, [trackA + 2x<00 + 1>];
+--:-:2:-:1  @P2 LDG.E.CI.S16 loadA2, [trackA + 2x<00 + 2>];
+--:-:2:-:1  @P3 LDG.E.CI.S16 loadA3, [trackA + 2x<00 + 3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x65, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x66, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x67, param_m, P0;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI.S16 loadA4, [trackA + 2x<00 + 64>];
+--:-:3:-:1  @P1 LDG.E.CI.S16 loadA5, [trackA + 2x<00 + 65>];
+--:-:3:-:1  @P2 LDG.E.CI.S16 loadA6, [trackA + 2x<00 + 66>];
+--:-:3:-:1  @P3 LDG.E.CI.S16 loadA7, [trackA + 2x<00 + 67>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA4, RZ;
+--:-:-:-:1 @!P1 MOV loadA5, RZ;
+--:-:-:-:1 @!P2 MOV loadA6, RZ;
+--:-:-:-:1 @!P3 MOV loadA7, RZ;
+
+--:-:-:-:1      IADD y1, txb, 1;
+--:-:-:-:1      IADD y2, txb, 2;
+--:-:-:-:1      IADD y3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, y1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y3, param_n, P0;
+
+--:-:4:-:1  @P0 LDG.E.CI.S16 loadB0, [trackB + 2x<00 + 0>];
+--:-:4:-:1  @P1 LDG.E.CI.S16 loadB1, [trackB + 2x<00 + 1>];
+--:-:4:-:1  @P2 LDG.E.CI.S16 loadB2, [trackB + 2x<00 + 2>];
+--:-:4:-:1  @P3 LDG.E.CI.S16 loadB3, [trackB + 2x<00 + 3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+    };
+</CODE>
+
+</SCHEDULE_BLOCK>
+// Convert the previously loaded fp16 A/B registers to fp32, store them via STS.128 to writeAs/writeBs and advance trackA/trackB; P1 receives the bDoRemainder flag.
+<CODE>
+    our $vec;  # true: emit the first variant (packed .H0/.H1 halves); false: the second (one value per register)
+    return $vec ? q{
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 7;
+
+12:-:-:-:4      F2F.F32.F16 loadA3, loadA1.H1;
+--:-:-:-:4      F2F.F32.F16 loadA2, loadA1.H0;
+--:-:-:-:4      F2F.F32.F16 loadA1, loadA0.H1;
+--:-:2:-:4      F2F.F32.F16 loadA0, loadA0.H0;
+
+04:-:-:-:4      F2F.F32.F16 loadA7, loadA5.H1;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, param_lda8;
+--:-:-:-:4      F2F.F32.F16 loadA6, loadA5.H0;
+--:-:-:-:4      F2F.F32.F16 loadA5, loadA4.H1;
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+--:-:3:-:1      F2F.F32.F16 loadA4, loadA4.H0;
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, P1;
+
+02:-:-:-:1      STS.128 [writeAs + 4x<00>], loadA0;
+04:-:-:-:1      STS.128 [writeAs + 4x<64>], loadA4;
+
+28:-:-:-:4      F2F.F32.F16 loadB3, loadB1.H1;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      F2F.F32.F16 loadB2, loadB1.H0;
+--:-:-:-:4      F2F.F32.F16 loadB1, loadB0.H1;
+--:-:2:-:2      F2F.F32.F16 loadB0, loadB0.H0;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+02:-:-:-:1      STS.128 [writeBs], loadB0;
+
+    // scalar loads
+    } : q{
+// bDoRemainder = k > 8
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+02:-:-:-:4      F2F.F32.F16 loadA0, loadA0;
+--:-:-:-:4      F2F.F32.F16 loadA1, loadA1;
+--:-:-:-:4      F2F.F32.F16 loadA2, loadA2;
+--:-:2:-:4      F2F.F32.F16 loadA3, loadA3;
+
+04:-:-:-:4      F2F.F32.F16 loadA4, loadA4;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, param_lda8;
+--:-:-:-:4      F2F.F32.F16 loadA5, loadA5;
+--:-:-:-:4      F2F.F32.F16 loadA6, loadA6;
+--:-:3:-:1      F2F.F32.F16 loadA7, loadA7;
+
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+02:-:-:-:1      STS.128 [writeAs + 4x<00>], loadA0;
+04:-:-:-:1      STS.128 [writeAs + 4x<64>], loadA4;
+
+08:-:-:-:4      F2F.F32.F16 loadB0, loadB0;
+--:-:-:-:0      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:4      F2F.F32.F16 loadB1, loadB1;
+--:-:-:-:4      F2F.F32.F16 loadB2, loadB2;
+--:-:2:-:2      F2F.F32.F16 loadB3, loadB3;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+02:-:-:-:1      STS.128 [writeBs], loadB0;
+
+    };
+</CODE>
+// Move the read offsets by -swapBuf and, after the barrier, the write offsets by +swapBuf, then negate swapBuf for the next pass.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+// Perl setup: fill @top and %insert with the loop-maintenance instructions for this kernel (presumably consumed by the common file included below).
+<CODE>
+    our $vec;
+    my $k_end = $vec ? 16 : 24;
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P4, PT, k, $k_end, P4;\n");
+    # %insert keys have the form j<J>c<C>; the values are instruction text, most of it predicated on P0/P4/P5/P6.
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, $k_end, P5;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, $k_end, P6;\n",
+        j0c5  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        j0c7  => "--:-:-:-:1      IADD32I k, k, -8;\n",
+        # Loads and fp16->fp32 conversions: LDG.E.CI.64 with .H0/.H1 unpacking when $vec is set, LDG.E.CI.S16 per value otherwise.
+        ($vec ?
+            (
+        j0c8  => "--:-:2:-:1  \@P4 LDG.E.CI.64 loadA0, [trackA + 2x<00>];\n",
+        j0c11 => "--:-:3:-:1  \@P5 LDG.E.CI.64 loadA4, [trackA + 2x<64>];\n",
+        j0c14 => "--:-:4:-:1  \@P6 LDG.E.CI.64 loadB0, [trackB];\n",
+
+        j4c3  => "02:-:-:-:1  \@P4 F2F.F32.F16 loadA3, loadA1.H1;\n",
+        j4c7  => "--:-:-:-:1  \@P4 F2F.F32.F16 loadA2, loadA1.H0;\n",
+        j4c11 => "--:-:-:-:1  \@P4 F2F.F32.F16 loadA1, loadA0.H1;\n",
+        j4c15 => "--:-:2:-:1  \@P4 F2F.F32.F16 loadA0, loadA0.H0;\n",
+
+        j5c3  => "04:-:-:-:1  \@P5 F2F.F32.F16 loadA7, loadA5.H1;\n",
+        j5c7  => "--:-:-:-:1  \@P5 F2F.F32.F16 loadA6, loadA5.H0;\n",
+        j5c11 => "--:-:-:-:1  \@P5 F2F.F32.F16 loadA5, loadA4.H1;\n",
+        j5c15 => "--:-:3:-:1  \@P5 F2F.F32.F16 loadA4, loadA4.H0;\n",
+
+        j6c3  => "08:-:-:-:1  \@P6 F2F.F32.F16 loadB3, loadB1.H1;\n",
+        j6c7  => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB2, loadB1.H0;\n",
+        j6c11 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB0.H1;\n",
+        j6c15 => "--:-:4:-:1  \@P6 F2F.F32.F16 loadB0, loadB0.H0;\n",
+            ) :
+            (
+        j0c10 => "--:-:2:-:1  \@P4 LDG.E.CI.S16 loadA0, [trackA + 2x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P4 LDG.E.CI.S16 loadA1, [trackA + 2x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P4 LDG.E.CI.S16 loadA2, [trackA + 2x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P4 LDG.E.CI.S16 loadA3, [trackA + 2x<3>];\n",
+
+        j0c33 => "--:-:3:-:1  \@P5 LDG.E.CI.S16 loadA4, [trackA + 2x<64>];\n",
+        j0c35 => "--:-:3:-:1  \@P5 LDG.E.CI.S16 loadA5, [trackA + 2x<65>];\n",
+        j0c37 => "--:-:3:-:1  \@P5 LDG.E.CI.S16 loadA6, [trackA + 2x<66>];\n",
+        j0c39 => "--:-:3:-:1  \@P5 LDG.E.CI.S16 loadA7, [trackA + 2x<67>];\n",
+
+        j1c10 => "--:-:4:-:1  \@P6 LDG.E.CI.S16 loadB0, [trackB + 2x<0>];\n",
+        j1c12 => "--:-:4:-:1  \@P6 LDG.E.CI.S16 loadB1, [trackB + 2x<1>];\n",
+        j1c14 => "--:-:4:-:1  \@P6 LDG.E.CI.S16 loadB2, [trackB + 2x<2>];\n",
+        j1c16 => "--:-:4:-:1  \@P6 LDG.E.CI.S16 loadB3, [trackB + 2x<3>];\n",
+
+        j4c3  => "02:-:-:-:1  \@P4 F2F.F32.F16 loadA0, loadA0;\n",
+        j4c7  => "--:-:-:-:1  \@P4 F2F.F32.F16 loadA1, loadA1;\n",
+        j4c11 => "--:-:-:-:1  \@P4 F2F.F32.F16 loadA2, loadA2;\n",
+        j4c15 => "--:-:2:-:1  \@P4 F2F.F32.F16 loadA3, loadA3;\n",
+
+        j5c3  => "04:-:-:-:1  \@P5 F2F.F32.F16 loadA4, loadA4;\n",
+        j5c7  => "--:-:-:-:1  \@P5 F2F.F32.F16 loadA5, loadA5;\n",
+        j5c11 => "--:-:-:-:1  \@P5 F2F.F32.F16 loadA6, loadA6;\n",
+        j5c15 => "--:-:3:-:1  \@P5 F2F.F32.F16 loadA7, loadA7;\n",
+
+        j6c3  => "08:-:-:-:1  \@P6 F2F.F32.F16 loadB0, loadB0;\n",
+        j6c7  => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB1, loadB1;\n",
+        j6c11 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadB2, loadB2;\n",
+        j6c15 => "--:-:4:-:1  \@P6 F2F.F32.F16 loadB3, loadB3;\n",
+            )
+        ),
+        # Entries shared by both variants: stores, track pointer advance, barrier plus buffer swap, and the loop branches.
+        j4c31 => "02:-:-:-:1  \@P0 STS.128 [writeAs + 4x<00>], loadA0;\n",
+        j5c31 => "04:-:-:-:1  \@P0 STS.128 [writeAs + 4x<64>], loadA4;\n",
+
+        j5c46 => "--:-:-:-:1  \@P0 IADD   trackA0.CC, trackA0, param_lda8;\n",
+        j5c54 => "--:-:-:-:1  \@P0 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c31 => "08:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j6c46 => "--:-:-:-:1  \@P0 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j6c54 => "--:-:-:-:1  \@P0 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j6c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+    );
+    return;
+</CODE>
+
+<INCLUDE file="hgemm_common_128x64.sass"/>
diff --git a/Kernel/SGEMM/Pascal/sgemm_common_128x128.sass b/Kernel/SGEMM/Pascal/sgemm_common_128x128.sass
new file mode 100644
index 0000000..703af8f
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_common_128x128.sass
@@ -0,0 +1,309 @@
+# sgemm_common_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+// Initial loads of the j0 A/B operands; all four carry barrier 1 in the third control field, which the first FFMA of each unroll step waits on.
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*128 + 64>];
+
+LOOP:
+
+<CODE>
+
+    # Unrolled main loop: 8 steps (j) of 64 FFMAs (c); the text stored in
+    # %insert{"j<J>c<C>"} is emitted right after FFMA c of step j, @top first.
+    our @top;
+    our %insert;
+
+    # FFMA visiting order over cx0-7 / y0-7 in four-entry swirl groups; the y
+    # sequence is reversed after each x pair.
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @ySeq  = (0,1,4,5);
+    my @visit;
+    for my $xBase (0,2,4,6)
+    {
+        push @visit, map { my $yBase = $_; map { [$xBase + $_->[0], $yBase + $_->[1]] } @swirl } @ySeq;
+        @ySeq = reverse @ySeq;
+    }
+
+    # An FFMA whose insert starts with one of these gets stall count 0.
+    my $zeroStall = qr/LDS|F2F|I2I|LDG|STS|BAR|BRA/;
+    my $asm       = join '', @top;
+
+    for my $j (0 .. 7)
+    {
+        my $src  = $j & 1;                   # register set read by this step's FFMAs
+        my $dest = $src ? 0 : 1;             # register set loaded during this step
+        my $line = ($j + 1) % 8;             # line index used in the LDS addresses
+        my $pred = $j == 7 ? '@P0' : '   ';  # only the last step's loads are predicated
+
+        # Loads for the other register set go after FFMAs 0, 2, 4 and 6.
+        my @slots = ("j${j}c0", "j${j}c2", "j${j}c4", "j${j}c6");
+        @insert{@slots} = map { sprintf($_, $pred, $dest, $line) } (
+            "--:-:1:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n",
+            "--:-:1:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*128 + 00>];\n",
+            "--:-:1:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n",
+            "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*128 + 64>];\n",
+        );
+
+        for my $c (0 .. 63)
+        {
+            my ($x, $y) = @{ $visit[$c] };
+            my $extra   = $insert{"j${j}c$c"} || '';
+            my $stall   = (split "\n", $extra)[0] =~ $zeroStall ? 0 : 1;
+            my $wait    = $c == 0 ? '01' : '--';
+            my $yield   = $c == 32 && $stall ? 'Y' : '-';
+            $asm .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", "$wait:-:-:$yield:$stall",  $x,$y,  $src,$x,  $src,$y,  $x,$y,  $extra;
+        }
+    }
+    return $asm;
+
+</CODE>
+// Batch step: bump loop, advance ta/tb by param_ldaz/param_ldbz, reload k, recompute trackA/trackB from ta/tb and param_A/param_B, then branch to REMAINDER while loop < param_loops.
+--:-:-:-:1      IADD loop, loop, 1;
+--:-:-:-:1      IADD ta, ta, param_ldaz;
+--:-:-:-:1      IADD tb, tb, param_ldbz;
+--:-:-:-:3      MOV  k, param_k;
+--:-:-:-:1      ISETP.LT.AND P1, PT, loop, param_loops, PT;
+--:-:-:-:6      LEA      trackA0.CC, ta, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 2;
+--:-:-:-:6      LEA      trackB0.CC, tb, param_B[0],     2;
+--:-:-:-:0      LEA.HI.X trackB1,    tb, param_B[1], RZ, 2;
+--:-:-:Y:5  @P1 BRA.U REMAINDER;
+// Epilogue setup: read the block ids, derive writeCs/readCs, this thread's cx/cy coordinates and the C00y..C12y output pointers, and load alpha/beta/flags.
+--:-:1:-:1      S2R blockA, SR_CTAID.Y;
+--:-:2:-:1      S2R blockB, SR_CTAID.Z;
+--:-:3:-:1      S2R blockZ, SR_CTAID.X;
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      LOP.AND tid_31,  tid, 31;
+--:-:-:-:1      LOP.AND tid_96,  tid, 96;
+--:-:-:-:1      LOP.AND tid_128, tid, 128;
+
+// writeCs = (readAs / 4) * 128 + readBs;
+--:-:-:-:1      LOP.AND readAs, readAs, 0xfff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0xfff;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 5;
+
+// cx = tid_31 | (tid_128 >> 2);
+--:-:-:-:1      SHR.U32  cx00, tid_128, 2;
+--:-:-:-:1      LOP.OR   cx00, tid_31,  cx00;
+
+// readCs = ((tid_96 << 4) | cx) << 2;
+--:-:-:-:1      SHL      readCs, tid_96,  4;
+--:-:-:-:1      LOP.OR   readCs, readCs, cx00;
+--:-:-:-:1      SHL      readCs, readCs, 2;
+
+// cx += blockB*128;
+02:-:-:-:1      ISCADD  cx00, blockB, cx00, 7;
+--:-:-:-:1      IADD    cx64, cx00, 64;
+
+// cy = blockA*128 + (tid_96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid_96, 1;
+01:-:-:-:1      ISCADD  cy00, blockA, cy00, 7;
+
+// C += (ldcz*blockZ + ldc*cy + cx00) * 4;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, ldc,  cy00,   cx00, xmad_c;
+04:-:-:-:1      XMAD.LO2 ci, ldcz, blockZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 2;
+// ldc1 = ldc*4; ldc4 = ldc*16; ldc60 = ldc*256 - ldc4 = ldc*240;
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// P6 = (beta != 0); STORE_C uses it to gate the reads of the existing C values
+--:-:-:-:1      ISETP.NE.AND P6, PT, beta, RZ, PT;
+
+</SCHEDULE_BLOCK>
+// C04y/C08y/C12y = C00y + 1/2/3 * ldc4 (low word with .CC, high word with .X); cy04/cy08/cy12 = cy00 + 4/8/12.
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Per accumulator row y: c0-c7 = cx0-7y<y> * alpha, then CAL STORE_C; before
+    # row 4 the C00y-C12y pointers advance by ldc60 and cy00-cy12 by 60.
+    my @skipAhead = (
+        "--:-:-:-:5      IADD   C00y0.CC, C00y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy00,     cy00,  60;\n",
+        "--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;\n",
+        "--:-:-:-:5      IADD   C04y0.CC, C04y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy04,     cy04,  60;\n",
+        "--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;\n",
+        "--:-:-:-:5      IADD   C08y0.CC, C08y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy08,     cy08,  60;\n",
+        "--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;\n",
+        "--:-:-:-:5      IADD   C12y0.CC, C12y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy12,     cy12,  60;\n",
+        "--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;\n\n",
+    );
+    my $scaleRow = join '',
+        "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c3, cx3y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c4, cx4y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c5, cx5y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c6, cx6y%d, alpha;\n",
+        "--:-:-:-:0      FMUL c7, cx7y%d, alpha;\n";
+    my $asm;
+    for my $row (0 .. 7)
+    {
+        $asm .= join '', @skipAhead if $row == 4;
+        $asm .= sprintf($scaleRow, ($row) x 8) . "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $asm;
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Called once per output row with c0-c7 already scaled by alpha: optionally applies relu and beta*C, shuffles c0-c7 through writeCs/readCs and issues eight predicated STGs.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, P6;
+// P0-P3: bounds checks ANDed with P6, which holds beta != 0 on entry; they gate the loads of the existing C values.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E d0, [C00y + 4x<00>];
+--:-:2:-:1  @P1 LDG.E d1, [C00y + 4x<64>];
+--:-:3:-:1  @P2 LDG.E d2, [C04y + 4x<00>];
+--:-:4:-:1  @P3 LDG.E d3, [C04y + 4x<64>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Rebuild P4/P5 and P0-P3 without the beta term; these gate the STGs further down.
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+
+// Apply relu
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c4, c4, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c5, c5, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c6, c6, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c7, c7, RZ, !PT;
+
+// beta != 0
+--:-:-:-:7      ISETP.NE.AND P6, PT, beta, RZ, PT;
+
+<ORDERED>
+--:-:-:-:1      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<64>], c4;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*128 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<0*128 + 64>];
+
+--:-:-:-:1      LDS c2, [readCs + 4x<1*128 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<1*128 + 64>];
+</ORDERED>
+</SCHEDULE_BLOCK>
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:1  @P6 FFMA c3, d3, beta, c3;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, P6;
+
+--:-:-:-:1  @P0 STG.E.CG [C00y0 + 4x<00>], c0;
+--:5:-:-:1  @P1 STG.E.CG [C00y0 + 4x<64>], c1;
+--:-:-:-:1  @P2 STG.E.CG [C04y0 + 4x<00>], c2;
+--:6:-:-:1  @P3 STG.E.CG [C04y0 + 4x<64>], c3;
+// Second half: the same beta-gated loads and store predicates for cy08/cy12.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E d0, [C08y0 + 4x<00>];
+--:-:2:-:1  @P1 LDG.E d1, [C08y0 + 4x<64>];
+--:-:3:-:1  @P2 LDG.E d2, [C12y0 + 4x<00>];
+--:-:4:-:1  @P3 LDG.E d3, [C12y0 + 4x<64>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx64, param_n, PT;
+
+--:-:-:-:2      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:2      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+</SCHEDULE_BLOCK>
+// Step C00y/C04y by ldc1 and cy08/cy12 by 1; the wait masks 10/20 refer to barriers 5/6 tagged on the STGs above.
+10:-:-:-:4      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD   cy08, cy08, 1;
+--:-:-:-:1      IADD   cy12, cy12, 1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+20:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:0      IADD.X C04y1,    C04y1, RZ;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<2*128 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<2*128 + 64>];
+--:-:-:-:1      LDS c2, [readCs + 4x<3*128 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*128 + 64>];
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P6 FFMA c3, d3, beta, c3;
+
+01:-:-:-:1  @P0 STG.E.CG [C08y0 + 4x<00>], c0;
+02:5:-:-:1  @P1 STG.E.CG [C08y0 + 4x<64>], c1;
+04:-:-:-:1  @P2 STG.E.CG [C12y0 + 4x<00>], c2;
+08:6:-:-:1  @P3 STG.E.CG [C12y0 + 4x<64>], c3;
+// Step C08y/C12y by ldc1 before returning.
+10:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+20:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/sgemm_common_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_common_128x32.sass
new file mode 100644
index 0000000..928ad6b
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_common_128x32.sass
@@ -0,0 +1,240 @@
+# sgemm_common_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#    http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+// Initial loads: line 0 into register set j0 (barrier 1) and line 1 into register set j1 (barrier 2).
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>];
+
+LOOP:
+
+<CODE>
+
+    # Unrolled main loop: 16 steps (j) of 32 FFMAs (c); the text stored in
+    # %insert{"j<J>c<C>"} is emitted right after FFMA c of step j, @top first.
+    our @top;
+    our %insert;
+    our $shiftAX;
+    our $shiftBX;
+
+    # FFMA visiting order over cx0-3 / y0-7 in four-entry swirl groups; the y
+    # sequence is reversed after each x pair.
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @ySeq  = (0,1,4,5);
+    my @visit;
+    for my $xBase (0,2)
+    {
+        for my $yBase (@ySeq)
+        {
+            push @visit, map { [$xBase + $_->[0], $yBase + $_->[1]] } @swirl;
+        }
+        @ySeq = reverse @ySeq;
+    }
+
+    # An FFMA whose insert starts with one of these gets stall count 0.
+    my $zeroStall = qr/LDS|F2F|I2I|LDG|STS|BAR|BRA/;
+    my $asm       = join '', @top;
+
+    for my $j (0 .. 15)
+    {
+        my $tag   = $j & 1 ? 2 : 1;               # barrier number used by this step's loads and by its first FFMA's wait
+        my $pred  = $j >= 14 ? '@P0' : '   ';     # the last two steps' loads are predicated
+        my $dest  = ($j + 2) & 3;                 # register set being loaded
+        my $line  = ($j + 2) & 15;                # line index used in the LDS addresses
+        my $skewA = $shiftAX ? $line >> 2 : 0;
+        my $skewB = $shiftBX ? $line >> 2 : 0;
+        my $src   = $j & 3;                       # register set read by this step's FFMAs
+
+        # Loads go after FFMAs 0, 2 and 4.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $tag, $pred, $dest, $line, $skewA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32  + 00 + %d*8>];\n", $tag, $pred, $dest, $line, $skewB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $tag, $pred, $dest, $line, $skewA;
+
+        for my $c (0 .. 31)
+        {
+            my ($x, $y) = @{ $visit[$c] };
+            my $extra   = $insert{"j${j}c$c"} || '';
+            my $stall   = (split "\n", $extra)[0] =~ $zeroStall ? 0 : 1;
+            my $wait    = $c == 0 ? "0$tag" : '--';
+            my $yield   = $c == 16 && $stall ? 'Y' : '-';
+            $asm .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s",
+                "$wait:-:-:$yield:$stall",  $x,$y,  $src,$x,  $src,$y,  $x,$y,  $extra;
+        }
+    }
+    return $asm;
+
+</CODE>
+// Epilogue setup: load alpha/beta/flags, derive writeCs/readCs, the cx/cy00 coordinates and the C00y..C12y pointers, and set predicates P4 (relu), P6 (cx < n) and P5 (beta).
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+// Rebase readAs/readBs first: subtract swapBuf when it is positive and remove the 4x<szShareA> offset from readBs.
+// writeCs = (readAs / 4) * 32 + readBs;
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 3;
+
+// readCs = (((tid & 96) << 2) | (tid & 31)) << 2;
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 2;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*32 + tid31;
+--:-:-:-:1      ISCADD cx, blkB, tid31, 5;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+--:-:-:-:1      ISCADD  cy00, blkA, cy00, 7;
+
+// ci = ldcz*blkZ + ldc*cy00 + cx;
+// C00y = param_C + ci * 4;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, ldc,  cy00, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 2;
+
+// P4 = (flags & 2) != 0: apply relu
+--:-:-:-:0      LOP.AND.NZ   P4, RZ, flags, 2;
+// P6 = cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+// P5 = beta != 0 && P6
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6; 
+
+// ldc1 = ldc*4; ldc4 = ldc*16; ldc60 = ldc*256 - ldc4 = ldc*240;
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
+</SCHEDULE_BLOCK>
+// C04y/C08y/C12y = C00y + 1/2/3 * ldc4; cy04/cy08/cy12 = cy00 + 4/8/12; d0-d3 start out as zero.
+--:-:-:-:4      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      MOV d0, RZ;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:4      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      MOV d1, RZ;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:3      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      MOV d2, RZ;
+--:-:-:-:1      MOV d3, RZ;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Per accumulator row y: c0-c3 = cx0-3y<y> * alpha, then CAL STORE_C; before
+    # row 4 the C00y-C12y pointers advance by ldc60 and cy00-cy12 by 60.
+    my @skipAhead = (
+        "--:-:-:-:5      IADD   C00y0.CC, C00y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy00,     cy00,  60;\n",
+        "--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;\n",
+        "--:-:-:-:5      IADD   C04y0.CC, C04y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy04,     cy04,  60;\n",
+        "--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;\n",
+        "--:-:-:-:5      IADD   C08y0.CC, C08y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy08,     cy08,  60;\n",
+        "--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;\n",
+        "--:-:-:-:5      IADD   C12y0.CC, C12y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy12,     cy12,  60;\n",
+        "--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;\n\n",
+    );
+    my $scaleRow = join '',
+        "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n",
+        "--:-:-:-:0      FMUL c3, cx3y%d, alpha;\n";
+    my $asm;
+    for my $row (0 .. 7)
+    {
+        $asm .= join '', @skipAhead if $row == 4;
+        $asm .= sprintf($scaleRow, ($row) x 4) . "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $asm;
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Called once per output row with c0-c3 already scaled by alpha: optionally applies relu (P4) and beta*C (P5), shuffles c0-c3 through writeCs/readCs and issues four predicated STGs.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E d0, [C00y];
+--:-:2:-:1  @P1 LDG.E d1, [C04y];
+--:-:3:-:1  @P2 LDG.E d2, [C08y];
+--:-:4:-:1  @P3 LDG.E d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Rebuild P0-P3 with P6 in place of P5 (bounds only, no beta term); these gate the STGs.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+// Advance the cy counters for the next call.
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:3      IADD cy12, cy12, 1;
+
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*32>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*32>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*32>];
+</SCHEDULE_BLOCK>
+
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P5 FFMA c3, d3, beta, c3;
+
+--:1:-:-:1  @P0 STG.E.CG [C00y], c0;
+--:2:-:-:1  @P1 STG.E.CG [C04y], c1;
+--:3:-:-:1  @P2 STG.E.CG [C08y], c2;
+--:4:-:-:1  @P3 STG.E.CG [C12y], c3;
+// Step each C pointer by ldc1; the wait masks 01/02/04/08 refer to barriers 1-4 tagged in the second control field of the STGs above.
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/sgemm_common_128x64.sass b/Kernel/SGEMM/Pascal/sgemm_common_128x64.sass
new file mode 100644
index 0000000..ee1705e
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_common_128x64.sass
@@ -0,0 +1,290 @@
+# sgemm_common_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+// Initial loads of the j0 A/B operands; all four carry barrier 1 in the third control field, which the first FFMA of each unroll step waits on.
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*64 +  00>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>];
+--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*64 +  32>];
+
+LOOP:
+
+<CODE>
+
+    # Unrolled main loop: 8 steps (j) of 64 FFMAs (c); the text stored in
+    # %insert{"j<J>c<C>"} is emitted right after FFMA c of step j, @top first.
+    our @top;
+    our %insert;
+
+    # FFMA visiting order over cx0-7 / y0-7 in four-entry swirl groups; the y
+    # sequence is reversed after each x pair.
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @ySeq  = (0,1,4,5);
+    my @visit;
+    for my $xBase (0,2,4,6)
+    {
+        push @visit, map { my $yBase = $_; map { [$xBase + $_->[0], $yBase + $_->[1]] } @swirl } @ySeq;
+        @ySeq = reverse @ySeq;
+    }
+
+    # An FFMA whose insert starts with one of these gets stall count 0.
+    my $zeroStall = qr/LDS|F2F|I2I|LDG|STS|BAR|BRA/;
+    my $asm       = join '', @top;
+
+    for my $j (0 .. 7)
+    {
+        my $src  = $j & 1;                   # register set read by this step's FFMAs
+        my $dest = $src ? 0 : 1;             # register set loaded during this step
+        my $line = ($j + 1) % 8;             # line index used in the LDS addresses
+        my $pred = $j == 7 ? '@P0' : '   ';  # only the last step's loads are predicated
+
+        # Loads for the other register set go after FFMAs 0, 2, 4 and 6.
+        my @slots = ("j${j}c0", "j${j}c2", "j${j}c4", "j${j}c6");
+        @insert{@slots} = map { sprintf($_, $pred, $dest, $line) } (
+            "--:-:1:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00>];\n",
+            "--:-:1:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 +  00>];\n",
+            "--:-:1:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64>];\n",
+            "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 +  32>];\n",
+        );
+
+        for my $c (0 .. 63)
+        {
+            my ($x, $y) = @{ $visit[$c] };
+            my $extra   = $insert{"j${j}c$c"} || '';
+            my $stall   = (split "\n", $extra)[0] =~ $zeroStall ? 0 : 1;
+            my $wait    = $c == 0 ? '01' : '--';
+            my $yield   = $c == 32 && $stall ? 'Y' : '-';
+            $asm .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", "$wait:-:-:$yield:$stall",  $x,$y,  $src,$x,  $src,$y,  $x,$y,  $extra;
+        }
+    }
+    return $asm;
+
+</CODE>
+// Epilogue setup: read the thread/block ids, load alpha/beta/flags, derive writeCs/readCs, the cx/cy coordinates and the C00y..C12y output pointers.
+--:-:1:-:1      S2R tid_2,  SR_TID.X;
+--:-:2:-:1      S2R blockA, SR_CTAID.Y;
+--:-:3:-:1      S2R blockB, SR_CTAID.Z;
+--:-:4:-:1      S2R blockZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 64 + readBs;
+--:-:-:-:1      LOP.AND readAs, readAs, 0xff;
+--:-:-:-:1      LOP.AND readBs, readBs, 0xff;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 4;
+
+// readCs = (((tid_2 & 96) << 3) | (tid_2 & 31)) << 2;
+01:-:-:-:1      LOP.AND tid31, tid_2, 31;
+01:-:-:-:1      LOP.AND tid96, tid_2, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 3;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx00 = blockB*64 + tid31;
+04:-:-:-:1      ISCADD cx00, blockB, tid31, 6;
+--:-:-:-:1      IADD   cx32, cx00, 32;
+
+// cy = blockA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+02:-:-:-:1      ISCADD  cy00, blockA, cy00, 7;
+
+// C += (ldcz*blockZ + ldc*cy + cx00) * 4;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, ldc,  cy00,   cx00, xmad_c;
+08:-:-:-:1      XMAD.LO2 ci, ldcz, blockZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 2;
+
+
+--:-:-:-:1      ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0
+// ldc1 = ldc*4; ldc4 = ldc*16; ldc60 = ldc*256 - ldc4 = ldc*240;
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
+</SCHEDULE_BLOCK>
+// C04y/C08y/C12y = C00y + 1/2/3 * ldc4 (low word with .CC, high word with .X); cy04/cy08/cy12 = cy00 + 4/8/12.
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Per accumulator row y: c0-c7 = cx0-7y<y> * alpha, then CAL STORE_C; before
+    # row 4 the C00y-C12y pointers advance by ldc60 and cy00-cy12 by 60.
+    my @skipAhead = (
+        "--:-:-:-:5      IADD   C00y0.CC, C00y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy00,     cy00,  60;\n",
+        "--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;\n",
+        "--:-:-:-:5      IADD   C04y0.CC, C04y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy04,     cy04,  60;\n",
+        "--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;\n",
+        "--:-:-:-:5      IADD   C08y0.CC, C08y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy08,     cy08,  60;\n",
+        "--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;\n",
+        "--:-:-:-:5      IADD   C12y0.CC, C12y0, ldc60;\n",
+        "--:-:-:-:1      IADD   cy12,     cy12,  60;\n",
+        "--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;\n\n",
+    );
+    my $scaleRow = join '',
+        "--:-:-:-:1      FMUL c0, cx0y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c1, cx1y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c2, cx2y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c3, cx3y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c4, cx4y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c5, cx5y%d, alpha;\n",
+        "--:-:-:-:1      FMUL c6, cx6y%d, alpha;\n",
+        "--:-:-:-:0      FMUL c7, cx7y%d, alpha;\n";
+    my $asm;
+    for my $row (0 .. 7)
+    {
+        $asm .= join '', @skipAhead if $row == 4;
+        $asm .= sprintf($scaleRow, ($row) x 8) . "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $asm;
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+
+STORE_C:
+// Stores c0-c7 (scaled by alpha by the caller) to C rows cy00/cy04/cy08/cy12 at column offsets 0 and 32, then steps every row pointer and row index by one.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, P6;
+// P0-P3: column < n, row < m and P6 (beta != 0 on entry); they predicate the loads of the existing C values.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E d0, [C00y0 + 4x<00>];
+--:-:2:-:1  @P1 LDG.E d1, [C00y0 + 4x<32>];
+--:-:3:-:1  @P2 LDG.E d2, [C04y0 + 4x<00>];
+--:-:4:-:1  @P3 LDG.E d3, [C04y0 + 4x<32>];
+--:-:-:-:1 @!P0 MOV d0, RZ;  // no load issued: d0 = 0
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Recompute P0-P3 without P6: pure bounds checks that predicate the stores further down.
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, param_m, P5;
+
+// Apply relu
+--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;  // P6 = (flags & 2) != 0
+--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c4, c4, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c5, c5, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c6, c6, RZ, !PT;
+--:-:-:-:1  @P6 FMNMX c7, c7, RZ, !PT;
+
+--:-:-:-:7      ISETP.NE.AND P6, PT, beta, RZ, PT;  // P6 is reused: back to beta != 0
+</SCHEDULE_BLOCK>
+// Exchange through shared memory: two 128-bit writes at writeCs, scalar reads back at readCs.
+--:-:-:-:1      STS.128 [writeCs+4x<00>], c0;
+--:-:-:-:1      STS.128 [writeCs+4x<32>], c4;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<0*64 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<0*64 + 32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<1*64 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<1*64 + 32>];
+// c += beta * d when P6 (beta != 0); the wait masks cover the LDG and LDS barriers set above.
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:1  @P6 FFMA c3, d3, beta, c3;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1  @P0 STG.E.CS [C00y0 + 4x<00>], c0;
+--:5:-:-:1  @P1 STG.E.CS [C00y0 + 4x<32>], c1;
+--:-:-:-:1  @P2 STG.E.CS [C04y0 + 4x<00>], c2;
+--:6:-:-:1  @P3 STG.E.CS [C04y0 + 4x<32>], c3;
+// Same predicate / load / re-predicate sequence as above, now for rows cy08 and cy12.
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, P6;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+
+--:-:1:-:1  @P0 LDG.E d0, [C08y0 + 4x<00>];
+--:-:2:-:1  @P1 LDG.E d1, [C08y0 + 4x<32>];
+--:-:3:-:1  @P2 LDG.E d2, [C12y0 + 4x<00>];
+--:-:4:-:1  @P3 LDG.E d3, [C12y0 + 4x<32>];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, cx00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, cx32, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+</SCHEDULE_BLOCK>
+// Advance the C00/C04 pointers by ldc1 and all four cy row indices by 1.
+10:-:-:-:2      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD   cy00, cy00, 1;
+--:-:-:-:1      IADD   cy04, cy04, 1;
+--:-:-:-:1      IADD   cy08, cy08, 1;
+--:-:-:-:1      IADD   cy12, cy12, 1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+20:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:0      IADD.X C04y1,    C04y1, RZ;
+
+--:-:-:-:1      LDS c0, [readCs + 4x<2*64 + 00>];
+--:-:5:-:1      LDS c1, [readCs + 4x<2*64 + 32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<3*64 + 00>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*64 + 32>];
+
+11:-:-:-:1  @P6 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P6 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P6 FFMA c2, d2, beta, c2;
+08:-:-:-:1  @P6 FFMA c3, d3, beta, c3;
+
+--:-:-:-:1  @P0 STG.E.CS [C08y0 + 4x<00>], c0;
+--:5:-:-:1  @P1 STG.E.CS [C08y0 + 4x<32>], c1;
+--:-:-:-:1  @P2 STG.E.CS [C12y0 + 4x<00>], c2;
+--:6:-:-:1  @P3 STG.E.CS [C12y0 + 4x<32>], c3;
+// Advance the C08/C12 pointers by ldc1.
+10:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+20:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/sgemm_common_32x128.sass b/Kernel/SGEMM/Pascal/sgemm_common_32x128.sass
new file mode 100644
index 0000000..da4d83d
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_common_32x128.sass
@@ -0,0 +1,234 @@
+# Kernel: sgemm_common_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#    http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+// Preload shared-memory lines 0 and 1 into the j0 and j1 register sets (write barriers 1 and 2) before entering LOOP.
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*32  + 16 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*32  + 16 + 0*8>];
+
+LOOP:
+
+<CODE>
+
+    # Main-loop generator. Returns @top followed by 16 unrolled steps (j) of 32
+    # FFMAs each; it also (re)writes the %insert entries for slots c0, c2 and c4
+    # of every step with the LDS.U.128 loads for the step two ahead.
+    our (@top, %insert, $shiftAX, $shiftBX);
+
+    # FFMA issue order as [x, y] pairs: a four-entry pattern applied to each row
+    # base, with the row-base list reversed for the second column base.
+    my @pattern = ([0,2], [1,2], [1,0], [0,0]);
+    my @rowBase = (0, 1, 4, 5);
+    my @order;
+    for my $colBase (0, 2)
+    {
+        for my $row (@rowBase)
+        {
+            push @order, map { [$colBase + $_->[0], $row + $_->[1]] } @pattern;
+        }
+        @rowBase = reverse @rowBase;
+    }
+
+    my $asm = join '', @top;
+
+    for my $j (0 .. 15)
+    {
+        my $ahead  = $j + 2;                    # loads target the step two ahead
+        my $setBar = ($j & 1) ? 2 : 1;
+        my $pred   = $j < 14 ? '   ' : '@P0';   # steps 14 and 15 load under P0 only
+        my $dstSet = $ahead & 3;
+        my $line   = $ahead & 15;
+        my $padA   = $shiftAX ? $line >> 2 : 0;
+        my $padB   = $shiftBX ? $line >> 2 : 0;
+        my $srcSet = $j & 3;
+
+        my $lds = sprintf "--:-:%d:-:1  %s LDS.U.128 j%d", $setBar, $pred, $dstSet;
+        $insert{"j${j}c0"} = $lds . sprintf("Ay0, [readAs + 4x<%d*32  + 00 + %d*8>];\n", $line, $padA);
+        $insert{"j${j}c2"} = $lds . sprintf("Bx0, [readBs + 4x<%d*128 + 00 + %d*8>];\n", $line, $padB);
+        $insert{"j${j}c4"} = $lds . sprintf("Ay4, [readAs + 4x<%d*32  + 16 + %d*8>];\n", $line, $padA);
+
+        for my $c (0 .. 31)
+        {
+            my ($x, $y) = @{ $order[$c] };
+            my $extra   = $insert{"j${j}c$c"} || '';
+
+            # stall count is 0 when the first inserted line holds one of the
+            # listed ops, otherwise 1; yield is only requested at slot 16
+            my $stall = (split "\n", $extra)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+            my $wait  = $c ? '--' : "0$setBar";
+            my $yield = ($c == 16 && $stall) ? 'Y' : '-';
+
+            $asm .= "$wait:-:-:$yield:$stall      FFMA cx${x}y${y}, j${srcSet}Bx${x}, j${srcSet}Ay${y}, cx${x}y${y};\n$extra";
+        }
+    }
+    return $asm;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+// Epilogue setup: rebase the shared-memory read offsets and compute the C address, strides and predicates used by STORE_C.
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;  // P0 = swapBuf > 0
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;  // presumably returns to the first buffer - TODO confirm
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+
+// writeCs = (readAs / 4) * 128 + readBs;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 5;
+
+// readCs = tid * 4;
+--:-:-:-:1      SHL readCs, tid, 2;
+
+// cx = blkB*128 + tid;
+--:-:-:-:1      ISCADD cx, blkB, tid, 7;
+
+// cy = blkA*32
+--:-:-:-:1      SHL cy00, blkA, 5;
+
+// C00y = param_C + (cy*ldc + cx + blkZ*ldcz) * 4;
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+
+--:-:-:-:1      XMAD.LO  ci, cy00, ldc, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, param_C[0],     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, param_C[1], RZ, 2;
+
+// P6 = cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+
+// P5 = (beta != 0) && P6
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+// Apply relu: P4 = (flags & 2) != 0
+--:-:-:-:1      LOP.AND.NZ P4, RZ, flags, 2;
+// Byte strides: ldc1 = ldc*4 (1 row), ldc4 = ldc*16 (4 rows), ldc12 = ldc*64 - ldc4 (12 rows).
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc12, ldc, -ldc4, 6;
+
+</SCHEDULE_BLOCK>
+// Derive the C04/C08/C12 row pointers from C00 in steps of ldc4, together with the cy04/cy08/cy12 row indices.
+--:-:-:-:5      IADD   C04y0.CC, C00y0, ldc4;   // low word, sets the carry
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;     // high word, adds the carry
+--:-:-:-:5      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:5      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:0      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+
+    # Write-out driver: for each of the eight accumulator rows, scale cx0-cx3
+    # of that row by alpha into c0-c3 and call STORE_C. Ahead of row 4 all
+    # four C row pointers and cy row counters are advanced by ldc12 / 12.
+    my $advance = '';
+    foreach my $tag (qw(00 04 08 12))
+    {
+        $advance .= "--:-:-:-:5      IADD   C${tag}y0.CC, C${tag}y0, ldc12;\n"
+                  . "--:-:-:-:1      IADD   cy${tag},     cy${tag},  12;\n"
+                  . "--:-:-:-:1      IADD.X C${tag}y1,    C${tag}y1, RZ;\n";
+    }
+    $advance .= "\n";
+
+    my $asm = '';
+    foreach my $row (0 .. 7)
+    {
+        $asm .= $advance if $row == 4;
+
+        foreach my $col (0 .. 3)
+        {
+            # only the final FMUL of a row is emitted with stall count 0
+            my $stall = $col == 3 ? 0 : 1;
+            $asm .= sprintf "--:-:-:-:%d      FMUL c%d, cx%dy%d, alpha;\n", $stall, $col, $col, $row;
+        }
+
+        $asm .= "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return $asm;
+
+</CODE>
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Stores c0-c3 (scaled by alpha by the caller) to C rows cy00/cy04/cy08/cy12, then steps each row index and row pointer by one row.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+// Load the existing C values only where P0-P3 hold (row < m and P5); d0-d3 are zeroed otherwise.
+--:-:1:-:1  @P0 LDG.E d0, [C00y];
+--:-:2:-:1  @P1 LDG.E d1, [C04y];
+--:-:3:-:1  @P2 LDG.E d2, [C08y];
+--:-:4:-:1  @P3 LDG.E d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// Re-derive P0-P3 from P6 instead of P5: these predicate the stores below.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:3      IADD cy12, cy12, 1;
+// Apply relu when P4 is set.
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+// Exchange through shared memory: one 128-bit write at writeCs, four scalar reads at readCs.
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*128>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*128>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*128>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*128>];
+</SCHEDULE_BLOCK>
+// c += beta * d when P5; the wait masks cover the LDG and LDS barriers set above.
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:0  @P5 FFMA c3, d3, beta, c3;
+
+--:1:-:-:1  @P0 STG.E.CG [C00y], c0;
+--:2:-:-:1  @P1 STG.E.CG [C04y], c1;
+--:3:-:-:1  @P2 STG.E.CG [C08y], c2;
+--:4:-:-:1  @P3 STG.E.CG [C12y], c3;
+// Advance each C row pointer by ldc1, each add waiting on the barrier set by the matching STG.
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:0      IADD.X C12y1,    C12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/sgemm_nn_128x128.sass b/Kernel/SGEMM/Pascal/sgemm_nn_128x128.sass
new file mode 100644
index 0000000..22b8782
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_nn_128x128.sass
@@ -0,0 +1,327 @@
+# Kernel: sgemm_nn_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-95   ~ blkA, blkB, blkZ, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, txa, xmad_ta, xmad_tb, tid31, tid128
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   ~ k<1-3>, x<1-3>
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-107  : loadA<0-7>, loadB<0-3>
+
+    108-111 : trackA<0-1>, trackB<0-1>
+
+    112-121 ~ writeAs, writeBs, k, txb, tidAY, tidBY, ta, tb, loop
+    122-127 ~ readAs, readBs, tid
+
+    64-75   ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    86-121  ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+// Prologue: fetch thread/block ids, zero registers czero00-63 through shared memory, and derive the global track* and shared read*/write* addresses.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,   param_k;
+--:-:-:-:1      MOV lda, param_lda;
+--:-:-:-:1      MOV ldb, param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;  // ldb = param_ldb8 >> 5
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      MOV loop, RZ;
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+
+// tidAY  = (tid & 1) << 2
+01:-:-:-:1      LOP.AND tid1,  tid,  1;
+--:-:-:-:1      SHL     tidAY, tid1, 2;
+
+// tidAX = tid >> 1
+--:-:-:-:1      SHR.U32 tidAX, tid, 1;
+
+// trackA = param_A + 4 * ((blkA*128 + tidAX) * lda + tidAY + blkZ*ldaz)
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,   tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ,  ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x2;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// tidBX = (tid & 31) << 2
+// tidBY = (tid >> 5) & 7
+--:-:-:-:1      LOP.AND tid31, tid,  31;
+--:-:-:-:1      SHL     tidBX, tid31, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid,  0x305; // 3 bits at position 5
+
+// trackB = param_B + 4 * (blkB*128 + tidBX + ldb*tidBY + blkZ*ldbz)
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 7;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x2;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeAs = 4 * (128 * tidAY + tidAX) + 4x<128*8*2>
+--:-:-:-:1      ISCADD  writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD  writeAs, writeAs, 4x<128*8*2>, 2;
+
+// writeBs = (128*tidBY + tidBX) * 4 + 4x<128*8*3>
+--:-:-:-:1      ISCADD  writeBs, tidBY, tidBX, 7;
+--:-:-:-:1      ISCADD  writeBs, writeBs, 4x<128*8*3>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096;
+--:-:-:-:1      LOP.AND tid128, tid,    128;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+// Loads one tile of A and B from global memory and stores it to shared memory at writeAs/writeBs; the non-vector loop branches back here under P1.
+<CODE>
+    our $vec; # declared elsewhere (not shown); true selects the LDG.E.CI.128 variant below
+    return $vec ? q{
+
+// k must be multiple of 8
+--:-:1:-:1  @P6 LDG.E.CI.128 loadB0, [trackB];
+
+--:-:2:-:1  @P5 LDG.E.CI.128 loadA0, [trackA + 4x<0>];
+--:5:6:-:1  @P5 LDG.E.CI.128 loadA4, [trackA + 4x<8>];
+
+--:-:3:-:1 @!P6 LDS.U.128 loadB0, [addr_zero];
+--:-:4:-:1 @!P5 LDS.U.128 loadA0, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.128 loadA4, [addr_zero];
+
+--:-:-:-:0      PSETP.AND.AND P1, PT, PT, PT, PT;
+
+05:-:-:-:1      STS.128 [writeBs], loadB0;
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+0a:-:-:-:1      STS [writeAs + 4x<0*128>], loadA0;
+--:-:-:-:1      STS [writeAs + 4x<1*128>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*128>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*128>], loadA3;
+
+10:-:-:-:6      IADD   trackA0.CC, trackA0, 4x<16>;
+--:-:-:-:1      IADD.X trackA1, trackA1, RZ;
+
+    } : q{
+
+<SCHEDULE_BLOCK>
+
+// doLoad0 = tidBY < k
+--:-:-:-:1      IADD x1, txb, 1;
+--:-:-:-:1      IADD x2, txb, 2;
+--:-:-:-:1      IADD x3, txb, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_n, P0;
+
+--:-:6:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:6:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:6:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:6:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+
+--:-:-:-:1      IADD k1, tidAY, 1;
+--:-:-:-:1      IADD k2, tidAY, 2;
+--:-:-:-:1      IADD k3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:2:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:5:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+</SCHEDULE_BLOCK>
+
+// bDoRemainder = k > 8
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+20:-:-:-:1      STS.128 [writeBs], loadB0;
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<0*128>], loadA0;
+04:-:-:-:1      STS [writeAs + 4x<1*128>], loadA1;
+08:-:-:-:1      STS [writeAs + 4x<2*128>], loadA2;
+10:-:-:-:1      STS [writeAs + 4x<3*128>], loadA3;
+
+--:-:-:-:6      IADD   trackA0.CC, trackA0, 4x<8>;
+--:-:-:-:1      IADD.X trackA1, trackA1, RZ;
+    };
+</CODE>
+// Toggle the read offsets by 4x<128*8*2>, synchronise, then toggle the write offsets the same way.
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      LOP.XOR writeAs, writeAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR writeBs, writeBs, 4x<128*8*2>;
+
+<CODE>
+    our $vec;
+    my $k_end = $vec ? 16 : 24; # k threshold interpolated into the ISETP.GE predicates below
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P6;\n");
+    our %insert = # keys are j<step>c<slot>; presumably consumed by the loop generator in the included common file
+    (
+        ($vec ?
+            (
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P1, PT, !P1, PT, PT;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND  P0, PT, k, $k_end, PT;\n",
+        j0c15 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P5;\n",
+        # global loads, predicated on P3 (B) and P2 (A)
+        j0c10 => "--:-:2:-:1  \@P3 LDG.E.CI.128 loadB0, [trackB];\n",
+
+        j0c28 => "--:-:3:-:1  \@P2 LDG.E.CI.128 loadA0, [trackA + 4x<0>];\n",
+        j0c30 => "20:5:6:-:1  \@P2 LDG.E.CI.128 loadA4, [trackA + 4x<8>];\n",
+        # shared stores: loadA4-7 when P1 is clear, loadA0-3 when P1 is set
+        j4c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128>], loadA4;\n",
+        j4c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128>], loadA5;\n",
+        j4c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128>], loadA6;\n",
+        j4c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128>], loadA7;\n",
+
+        j5c35 => "02:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j6c29 => "04:-:-:-:1  \@P1 STS [writeAs + 4x<0*128>], loadA0;\n",
+        j6c31 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<1*128>], loadA1;\n",
+        j6c33 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<2*128>], loadA2;\n",
+        j6c35 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<3*128>], loadA3;\n",
+
+        j6c46 => "10:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 4x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+            ) :
+            (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P5;\n",
+        j0c8  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        # scalar global loads, predicated on P3 (B) and P2 (A)
+        j0c10 => "--:-:6:-:1  \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+        j0c29 => "--:-:6:-:1  \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+        j0c31 => "--:-:6:-:1  \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+        j0c33 => "--:-:6:-:1  \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+
+        j0c35 => "--:-:2:-:1  \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+        j1c29 => "--:-:3:-:1  \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+        j1c31 => "--:-:4:-:1  \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+        j1c33 => "--:-:5:-:1  \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+
+        j5c39 => "20:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j6c29 => "02:-:-:-:1  \@P0 STS [writeAs + 4x<0*128>], loadA0;\n",
+        j6c31 => "04:-:-:-:1  \@P0 STS [writeAs + 4x<1*128>], loadA1;\n",
+        j6c33 => "08:-:-:-:1  \@P0 STS [writeAs + 4x<2*128>], loadA2;\n",
+        j6c35 => "10:-:-:-:1  \@P0 STS [writeAs + 4x<3*128>], loadA3;\n",
+
+        j6c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 4x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+        # loop back under P0, otherwise fall to the REMAINDER path under P1
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+            )
+        ),
+        # entries common to both variants
+        j5c46 => "--:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j5c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+        # end-of-step bookkeeping: k -= 8, barrier, then toggle all four shared offsets under P0
+        j6c63 => "--:-:-:-:0      IADD32I k, k, -8;\n" .
+                 "--:-:-:-:5      BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeAs, writeAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeBs, writeBs, 4x<128*8*2>;\n",
+    );
+    return;
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Pascal/sgemm_nn_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_nn_128x32.sass
new file mode 100644
index 0000000..8194777
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_nn_128x32.sass
@@ -0,0 +1,485 @@
+# Kernel: sgemm_nn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*2 + 32*16*2>
+    szShareA  : (128*16 + 32)
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, ldb, ldaz, lda32, ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, shiftAX
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+// Prologue: fetch thread/block ids, zero registers czero00-31 through shared memory, and derive the global track* and shared read*/write* addresses.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;  // ldb = param_ldb8 >> 5
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb16, ldb, 6;
+--:-:-:-:1      SHL lda32, lda, 5;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 2
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,  2;
+01:-:-:-:1      LOP.AND tid3,    tid,  3;
+--:-:-:-:1      SHL     tidAY,   tid3, 2;
+--:-:-:-:1      SHL     shiftAX, tid3, 3;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// track<n>A = param_A + 4 * ((blkA*128 + tidAX + 32*n) * lda + tidAY + blkZ*ldaz), n = 0..3
+02:-:-:-:1      ISCADD txa00, blkA, tidAX, 7;
+--:-:-:-:1      IADD   txa32, txa00, 32;
+--:-:-:-:1      IADD   txa64, txa00, 64;
+--:-:-:-:1      IADD   txa96, txa00, 96;
+
+--:-:-:-:1      XMAD.LO  ta00, lda,  txa00,   tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta00, ldaz, blkZ,  ta00;
+--:-:-:-:1      IADD     ta32, ta00, lda32;
+--:-:-:-:1      IADD     ta64, ta32, lda32;
+--:-:-:-:1      IADD     ta96, ta64, lda32;
+
+--:-:-:-:1      LEA      track0A0.CC, ta00, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track0A1,    ta00, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track1A0.CC, ta32, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track1A1,    ta32, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track2A0.CC, ta64, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track2A1,    ta64, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track3A0.CC, ta96, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track3A1,    ta96, param_A[1], RZ, 2;
+
+// trackB = param_B + 4 * (blkB*32 + tidBX + ldb*tidBY + blkZ*ldbz)
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 5;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 2;
+
+// writeAs = (tidAY*128 + tidAX + shiftAX) * 4 + 4x<szShareA + szShareB>
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+// REMAINDER: branch target of the "@P1 BRA.U REMAINDER" emitted by the loop code further down.
+// Loads one slice of A (four tracks) and B with full bounds checks: a lane loads only when its
+// txa*/txb index is below param_m/param_n and its tidAY/tidBY index is below k.
+// Lanes that do not load are filled instead (vector path: LDS from addr_zero, presumably a
+// zeroed shared-memory slot -- confirm; scalar path: MOV from RZ).
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+<CODE>
+    # $vec is a package global set outside this block. When true the returned text uses one
+    # 128-bit load per track; when false it uses four 32-bit loads per track, each with its
+    # own predicate (P0-P3 are recomputed for every track).
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY, k, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY, k, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:2:-:1  @P3 LDG.E.CI.128 load1A, [track1A];
+--:-:3:-:1  @P4 LDG.E.CI.128 load2A, [track2A];
+--:-:4:-:1  @P5 LDG.E.CI.128 load3A, [track3A];
+--:-:5:-:1  @P6 LDG.E.CI.128 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load1A, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128 load2A, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.128 load3A, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.128 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI load0A0, [track0A + 4x<0>];
+--:-:1:-:1  @P1 LDG.E.CI load0A1, [track0A + 4x<1>];
+--:-:1:-:1  @P2 LDG.E.CI load0A2, [track0A + 4x<2>];
+--:-:1:-:1  @P3 LDG.E.CI load0A3, [track0A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI load1A0, [track1A + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI load1A1, [track1A + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI load1A2, [track1A + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI load1A3, [track1A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI load2A0, [track2A + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI load2A1, [track2A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load2A2, [track2A + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI load2A3, [track2A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI load3A0, [track3A + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI load3A1, [track3A + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI load3A2, [track3A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load3A3, [track3A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:5:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:5:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:5:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+    };
+</CODE>
+
+// P2-P6 additionally require k >= 32 from here on; they predicate the next round of
+// global loads that is issued after the BAR.SYNC following this block.
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P1, RZ, k, 15;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P1;
+
+</SCHEDULE_BLOCK>
+
+// Store the loaded A and B registers to shared memory and advance the global track pointers.
+// Track g of A writes its four scalars loadgA0..3 at offsets 4x<r*128 + g*32> (r = 0..3)
+// from writeAs; B is written with a single 128-bit store at writeBs.
+// Each A pointer is bumped by 4x<16> as a 64-bit add: the .CC add sets the carry that the
+// following .X add on the high register consumes.
+21:-:-:-:1      STS [writeAs + 4x<0*128 + 0*32>], load0A0;
+--:-:-:-:0      IADD   track0A0.CC, track0A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 0*32>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 0*32>], load0A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 0*32>], load0A3;
+
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<0*128 + 1*32>], load1A0;
+--:-:-:-:0      IADD   track1A0.CC, track1A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 1*32>], load1A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 1*32>], load1A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 1*32>], load1A3;
+
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS [writeAs + 4x<0*128 + 2*32>], load2A0;
+--:-:-:-:0      IADD   track2A0.CC, track2A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 2*32>], load2A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 2*32>], load2A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 2*32>], load2A3;
+
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS [writeAs + 4x<0*128 + 3*32>], load3A0;
+--:-:-:-:0      IADD   track3A0.CC, track3A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 3*32>], load3A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 3*32>], load3A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 3*32>], load3A3;
+
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+// B: one 128-bit store, then start the 64-bit advance of trackB by ldb16.
+10:-:-:-:1      STS.128 [writeBs], loadB;
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16;
+
+// Flip the shared-memory buffers around the barrier: read pointers move by -swapBuf before
+// BAR.SYNC, write pointers by +swapBuf after it, then swapBuf itself is negated.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+// High half of the trackB advance started at "IADD trackB0.CC" above; none of the IADDs in
+// between carries a .CC suffix.
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+<CODE>
+    # Returns the SASS text for the next round of global loads, predicated on P2-P6
+    # (one predicate per track: P2-P5 for A tracks 0-3, P6 for B).
+    # With $vec set: one 128-bit load per track. Otherwise: four 32-bit loads per track at
+    # offsets 4x<0>..4x<3>, all sharing the track's predicate and third control-code field.
+    # Tracks 2A and 3A use the same value (5) in that field in both variants.
+    our $vec;
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
+--:-:4:-:1  @P3 LDG.E.CI.128 load1A, [track1A];
+--:-:5:-:1  @P4 LDG.E.CI.128 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.CI.128 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.CI.128 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P2 LDG.E.CI load0A0, [track0A + 4x<0>];
+--:-:3:-:1  @P2 LDG.E.CI load0A1, [track0A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load0A2, [track0A + 4x<2>];
+--:-:3:-:1  @P2 LDG.E.CI load0A3, [track0A + 4x<3>];
+
+--:-:4:-:1  @P3 LDG.E.CI load1A0, [track1A + 4x<0>];
+--:-:4:-:1  @P3 LDG.E.CI load1A1, [track1A + 4x<1>];
+--:-:4:-:1  @P3 LDG.E.CI load1A2, [track1A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load1A3, [track1A + 4x<3>];
+
+--:-:5:-:1  @P4 LDG.E.CI load2A0, [track2A + 4x<0>];
+--:-:5:-:1  @P4 LDG.E.CI load2A1, [track2A + 4x<1>];
+--:-:5:-:1  @P4 LDG.E.CI load2A2, [track2A + 4x<2>];
+--:-:5:-:1  @P4 LDG.E.CI load2A3, [track2A + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI load3A0, [track3A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load3A1, [track3A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load3A2, [track3A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load3A3, [track3A + 4x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:6:-:1  @P6 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:6:-:1  @P6 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:6:-:1  @P6 LDG.E.CI loadB3, [trackB + 4x<3>];
+    };
+</CODE>
+
+<CODE>
+    # Defines the package globals $shiftAX, $shiftBX and %insert.
+    # %insert maps slot keys of the form jNcM to SASS instruction text. The keys are
+    # presumably consumed by the included sgemm_common_128x32.sass to splice these
+    # instructions into the main loop at fixed positions -- confirm against that file.
+    our $vec;
+    our $shiftAX = 1;
+    our $shiftBX = 0;
+    our %insert =
+    (
+        # Consume 16 from k and set P0 = (k >= 16); P0 predicates the shared-memory stores,
+        # the buffer swap and the LOOP branch below.
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        # Write the previously loaded A values (tracks 0-3) and B to shared memory.
+        j3c6   => "04:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n",
+        j3c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n",
+        j3c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 0*32>], load0A2;\n",
+        j3c12  => "--:3:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n",
+
+        j5c6   => "08:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n",
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n",
+        j5c12  => "--:4:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n",
+
+        j7c6   => "10:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n",
+        j7c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n",
+        j7c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n",
+        j7c12  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n",
+
+        j9c6   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n",
+        j9c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n",
+        j9c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n",
+        j9c12  => "--:5:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n",
+
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",
+
+        # Advance each global track pointer (64-bit add via the .CC/.X pair), only where
+        # that track's load predicate is set.
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 4x<16>;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, 4x<16>;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P4 IADD   track2A0.CC, track2A0, 4x<16>;\n",
+        j7c13  => "--:-:-:-:1  \@P4 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, 4x<16>;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+
+        # Refresh the load predicates: a track keeps loading only while k >= 32.
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j7c14  => "--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        # Barrier, then move the read pointers by -swapBuf and the write pointers by
+        # +swapBuf, and negate swapBuf for the next iteration.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        # Next global loads: one 128-bit load per track when $vec is set, otherwise four
+        # 32-bit loads per track spread over consecutive slots.
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.128 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.CI.128 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.128 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.128 loadB,  [trackB];\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI load0A0, [track0A + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI load0A1, [track0A + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI load0A2, [track0A + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI load0A3, [track0A + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI load1A0, [track1A + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI load1A1, [track1A + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI load1A2, [track1A + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI load1A3, [track1A + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.CI load2A0, [track2A + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P4 LDG.E.CI load2A1, [track2A + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P4 LDG.E.CI load2A2, [track2A + 4x<2>];\n",
+                j10c3  => "--:-:5:-:1  \@P4 LDG.E.CI load2A3, [track2A + 4x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI load3A0, [track3A + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI load3A1, [track3A + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI load3A2, [track3A + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI load3A3, [track3A + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+
+        # Branch back to LOOP while P0 holds; otherwise go to REMAINDER when P1 is set.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    # This block only defines the variables above; it returns an empty string.
+    return '';
+</CODE>
+
+<INCLUDE file="sgemm_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Pascal/sgemm_nn_128x64.sass b/Kernel/SGEMM/Pascal/sgemm_nn_128x64.sass
new file mode 100644
index 0000000..2fca939
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_nn_128x64.sass
@@ -0,0 +1,414 @@
+# Kernel: sgemm_nn_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Symbolic names used by the code below.
+# addr_zero is the shared-memory address 4x<128*8*2 + 64*8*2 + 0>: the prologue stores RZ
+# there (STS.128) and reads it back to clear registers. The offset equals two A buffers of
+# 128*8 plus two B buffers of 64*8, i.e. it lies just past the buffers addressed by
+# writeAs/writeBs/readAs/readBs.
+# The gridDim* and param_* names are locations in constant bank 0 (c[0x0][...]); the
+# kernel parameters occupy 0x140-0x188.
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*2 + 64*8*2 + 0>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+# Register names for this kernel.
+# Registers 0-63 are named czero<00-63> (cleared in the prologue) and are also addressable
+# as cx<0-7>y<0-7> -- presumably the accumulator tile; confirm against the common file.
+# Register numbers from 64 upward are mapped to several name sets (prologue temporaries,
+# j0/j1 A/B fragments, load/track registers, and the c/d/C.. names), so those names
+# presumably are live in different phases of the kernel -- confirm before reusing one.
+<REGISTER_MAPPING>
+
+    64-95   ~ tid, blkA, blkB, blkZ, txb, tidAY, tidBY, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, txa, ta, xmad_ta, tb, tid15, xmad_tb, k<1-3>, x<1-3>
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-115  : loadAA<0-7>, loadA<0-7>, loadB<0-3>
+
+    116-121 : track0A<0-1>, track1A<0-1>, trackB<0-1>
+
+    122-125 ~ writeAs, writeBs, k, swapBuf
+    126-127 ~ readAs, readBs
+
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    86-125  ~ tid_2, blockA, blockB, blockZ, ldc, ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, ci, xmad_c, alpha, beta, flags, tid31, tid96
+
+</REGISTER_MAPPING>
+
+// Prologue: read the thread/block ids, clear the czero registers, compute the global
+// track pointers and bounds predicates for A and B, and set up the shared-memory
+// read/write pointers.
+// blkA comes from CTAID.Y, blkB from CTAID.Z and blkZ from CTAID.X.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,   param_k;
+--:-:-:-:1      MOV lda, param_lda;
+// ldb = param_ldb8 >> 5 (presumably param_ldb8 is ldb * 8 rows * 4 bytes -- confirm with the caller)
+--:-:-:-:1      MOV ldb, param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// Write 128 bits of zero to addr_zero, then read it back into every fourth czero register
+// (czero00, czero04, ... czero60) with 16 128-bit loads, clearing czero00-63.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;
+</CODE>
+
+// tidAY = (tid & 1) << 2
+// tidAX = tid >> 1
+01:-:-:-:1      LOP.AND tid1,  tid,  1;
+--:-:-:-:1      SHL     tidAY, tid1, 2;
+01:-:-:-:1      SHR.U32 tidAX, tid,  1;
+
+// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY)
+// track1A = track0A + (lda << 8)
+02:-:-:-:1      ISCADD  txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO  ta, lda,  txa,   tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ,  ta;
+--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track1A0.CC, lda, track0A0,      8;
+--:-:-:-:1      LEA.HI.X track1A1,    lda, track0A1, RZ,  8;
+
+// P4 = txa < param_m (guards track0A loads); P5 = txa + 64 < param_m (guards track1A loads)
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa, param_m, PT;
+--:-:-:-:1      IADD txa, txa, 64;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+
+// tidBX = (tid & 15) << 2
+// tidBY = (tid >> 4) & 7
+--:-:-:-:1      LOP.AND tid15, tid,   15;
+--:-:-:-:1      SHL     tidBX, tid15, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid,   0x304; // 3 bits at position 4
+
+// trackB += (blkB*64 + tidBX + ldb*tidBY) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 6;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x2;
+--:-:-:-:2      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x2;
+
+// P6 = txb < param_n (guards trackB loads)
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// Start the write buffers high
+// writeAs = (128*tidAY + tidAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2;
+
+// writeBs = (64*tidBY + tidBX) * 4
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2;
+
+// Start the read buffers low
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+// swapBuf starts negative: the first swap moves the read pointers up and the write pointers down.
+--:-:-:-:1      MOV32I swapBuf, -4x<64*8 + 128*8>;
+</SCHEDULE_BLOCK>
+
+// REMAINDER: reached by fall-through from the prologue and, in the non-vector build, by the
+// "@P1 BRA.U REMAINDER" in the loop code. Loads one slice of A and B, stores it to shared
+// memory, advances the track pointers and flips the shared-memory buffers.
+REMAINDER:
+
+<CODE>
+    # $vec is a package global set outside this block; it selects which text is returned.
+    #
+    # $vec true: 128-bit loads guarded only by the m/n bounds predicates P4/P5/P6 (no check
+    # against k). Each A track loads two vectors (loadA* at +4x<0>, loadAA* at +4x<8>);
+    # only loadA0-7 are stored here, and the A tracks advance by 4x<16>. Lanes whose
+    # predicate is false read from addr_zero instead. P1 is set to true.
+    #
+    # $vec false: tid and blkB are re-read and the tid-derived indices recomputed
+    # (presumably because those registers are reused elsewhere -- confirm). Every 32-bit
+    # load has its own predicate combining the m/n bound with a "< k" test, and unloaded
+    # lanes are set from RZ. The A tracks advance by 4x<8>, and P1 ends up as k > 8.
+    our $vec;
+    return $vec ? q{
+
+// k must be multiple of 8
+--:-:2:-:1  @P6 LDG.E.CI.128 loadB0, [trackB];
+
+--:-:3:-:1  @P4 LDG.E.CI.128 loadA0,  [track0A + 4x<0>];
+--:-:3:-:1  @P4 LDG.E.CI.128 loadAA0, [track0A + 4x<8>];
+
+--:-:4:-:1  @P5 LDG.E.CI.128 loadA4,  [track1A + 4x<0>];
+--:-:4:-:1  @P5 LDG.E.CI.128 loadAA4, [track1A + 4x<8>];
+
+--:-:-:-:1 @!P6 LDS.U.128 loadB0, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128 loadA0, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.128 loadA4, [addr_zero];
+--:-:-:-:1 @!P4 LDS.U.128 loadAA0, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.128 loadAA4, [addr_zero];
+
+--:-:-:-:0      PSETP.AND.AND P1, PT, PT, PT, PT;
+
+22:-:-:-:1      STS.128 [writeBs], loadB0;
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+04:-:-:-:1      STS [writeAs + 4x<0*128 + 00>], loadA0;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 00>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 00>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*128 + 00>], loadA3;
+
+--:-:-:-:6      IADD   track0A0.CC, track0A0, 4x<16>;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+08:-:-:-:1      STS [writeAs + 4x<0*128 + 64>], loadA4;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 64>], loadA5;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 64>], loadA6;
+--:-:-:-:1      STS [writeAs + 4x<3*128 + 64>], loadA7;
+
+--:-:-:-:6      IADD   track1A0.CC, track1A0, 4x<16>;
+--:-:-:-:1      IADD.X track1A1,    track1A1, RZ;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;
+
+    } : q{
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkB, SR_CTAID.Z;
+<SCHEDULE_BLOCK>
+01:-:-:-:1      LOP.AND tid1,  tid,   1;
+--:-:-:-:1      SHL     tidAY, tid1,  2;
+--:-:-:-:1      LOP.AND tid15, tid,   15;
+--:-:-:-:1      SHL     tidBX, tid15, 2;
+--:-:-:-:1      BFE.U32 tidBY, tid,   0x304; // 3 bits at position 4
+02:-:-:-:1      ISCADD  txb, blkB, tidBX, 6;
+
+// doLoad0 = tidBY < k
+--:-:-:-:1      IADD x1, txb, 1;
+--:-:-:-:1      IADD x2, txb, 2;
+--:-:-:-:1      IADD x3, txb, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_n, P0;
+
+--:-:2:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      IADD k1, tidAY, 1;
+--:-:-:-:1      IADD k2, tidAY, 2;
+--:-:-:-:1      IADD k3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P4;
+
+--:-:3:-:1  @P0 LDG.E.CI loadA0, [track0A + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI loadA1, [track0A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI loadA2, [track0A + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI loadA3, [track0A + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:4:-:1  @P0 LDG.E.CI loadA4, [track1A + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI loadA5, [track1A + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI loadA6, [track1A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI loadA7, [track1A + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA4, RZ;
+--:-:-:-:1 @!P1 MOV loadA5, RZ;
+--:-:-:-:1 @!P2 MOV loadA6, RZ;
+--:-:-:-:1 @!P3 MOV loadA7, RZ;
+</SCHEDULE_BLOCK>
+
+02:-:-:-:1      STS.128 [writeBs], loadB0;
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8;
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ;
+
+04:-:-:-:1      STS [writeAs + 4x<0*128 + 00>], loadA0;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 00>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 00>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*128 + 00>], loadA3;
+
+--:-:-:-:6      IADD   track0A0.CC, track0A0, 4x<8>;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+08:-:-:-:1      STS [writeAs + 4x<0*128 + 64>], loadA4;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 64>], loadA5;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 64>], loadA6;
+--:-:-:-:1      STS [writeAs + 4x<3*128 + 64>], loadA7;
+
+--:-:-:-:6      IADD   track1A0.CC, track1A0, 4x<8>;
+--:-:-:-:1      IADD.X track1A1,    track1A1, RZ;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;
+
+    };
+</CODE>
+
+<CODE>
+    # Defines the package globals @top and %insert for the main loop.
+    # %insert maps slot keys of the form jNcM to SASS instruction text; @top holds one
+    # instruction. Both are presumably consumed by the included sgemm_common_128x64.sass
+    # (placement of @top and meaning of the keys not visible here -- confirm).
+    our $vec;
+    # Threshold for the "another iteration follows" tests: 16 in the vector build, 24 otherwise.
+    my $k_end = $vec ? 16 : 24;
+    # P0 = (k >= $k_end) && P6: gates the B load of the iteration.
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, P6;\n");
+    our %insert =
+    (
+        ($vec ?
+            (
+        # Vector build. P1 is inverted on every iteration. The A loads fetch two vectors
+        # per track (loadA* and loadAA*) and are issued only when P0 and P1 hold (P2/P3);
+        # iterations with P1 set store loadA*, iterations with P1 clear store loadAA*.
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P1, PT, !P1, PT, PT;\n",
+
+        j0c11 => "--:-:2:-:1  \@P0 LDG.E.CI.128 loadB0, [trackB];\n",
+
+        # P0 is recomputed without the P6 term, then k is reduced by 8.
+        j0c12 => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        j0c13 => "--:-:-:-:1      IADD32I k, k, -8;\n",
+
+        j0c23 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P4;\n",
+        j0c24 => "--:-:-:-:1      PSETP.AND.AND P3, PT, P0, P1, P5;\n",
+
+        j0c35 => "--:-:3:-:1  \@P2 LDG.E.CI.128 loadA0,  [track0A + 4x<0>];\n",
+        j0c37 => "--:-:3:-:1  \@P2 LDG.E.CI.128 loadAA0, [track0A + 4x<8>];\n",
+
+        j0c39 => "--:-:4:-:1  \@P3 LDG.E.CI.128 loadA4,  [track1A + 4x<0>];\n",
+        j0c41 => "10:6:5:-:1  \@P3 LDG.E.CI.128 loadAA4, [track1A + 4x<8>];\n",
+
+        j2c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128 + 00>], loadAA0;\n",
+        j2c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128 + 00>], loadAA1;\n",
+        j2c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128 + 00>], loadAA2;\n",
+        j2c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128 + 00>], loadAA3;\n",
+
+        j3c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128 + 64>], loadAA4;\n",
+        j3c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128 + 64>], loadAA5;\n",
+        j3c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128 + 64>], loadAA6;\n",
+        j3c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128 + 64>], loadAA7;\n",
+
+        j5c29 => "04:-:-:-:1  \@P1 STS [writeAs + 4x<0*128 + 00>], loadA0;\n",
+        j5c31 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<1*128 + 00>], loadA1;\n",
+        j5c33 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<2*128 + 00>], loadA2;\n",
+        j5c35 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<3*128 + 00>], loadA3;\n",
+
+        # A tracks advance by 4x<16>, only on iterations that issued the A loads.
+        j5c46 => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 4x<16>;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+
+        j6c29 => "08:-:-:-:1  \@P1 STS [writeAs + 4x<0*128 + 64>], loadA4;\n",
+        j6c31 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<1*128 + 64>], loadA5;\n",
+        j6c33 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<2*128 + 64>], loadA6;\n",
+        j6c35 => "--:2:-:-:1  \@P1 STS [writeAs + 4x<3*128 + 64>], loadA7;\n",
+
+        j6c46 => "20:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, 4x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+
+        # The vector build only loops; it has no branch to REMAINDER.
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+            ) :
+            (
+        # Scalar build. P2/P3 = (k >= $k_end) && P4/P5 gate the A loads of each track.
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P4;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P5;\n",
+
+        # NOTE(review): these B loads use .CS while every other global load in this
+        # kernel uses .CI -- confirm that the difference is intentional.
+        j0c10 => "--:-:2:-:1  \@P0 LDG.E.CS loadB0, [trackB + 4x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P0 LDG.E.CS loadB1, [trackB + 4x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P0 LDG.E.CS loadB2, [trackB + 4x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P0 LDG.E.CS loadB3, [trackB + 4x<3>];\n",
+
+        # P0 is recomputed without the P6 term, then k is reduced by 8.
+        j0c18 => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+        j0c20 => "--:-:-:-:1      IADD32I k, k, -8;\n",
+
+        j0c33 => "--:-:3:-:1  \@P2 LDG.E.CI loadA0, [track0A + 4x<0>];\n",
+        j0c35 => "--:-:3:-:1  \@P2 LDG.E.CI loadA1, [track0A + 4x<1>];\n",
+        j0c37 => "--:-:3:-:1  \@P2 LDG.E.CI loadA2, [track0A + 4x<2>];\n",
+        j0c39 => "--:-:3:-:1  \@P2 LDG.E.CI loadA3, [track0A + 4x<3>];\n",
+
+        j1c29 => "--:-:4:-:1  \@P3 LDG.E.CI loadA4, [track1A + 4x<0>];\n",
+        j1c31 => "--:-:4:-:1  \@P3 LDG.E.CI loadA5, [track1A + 4x<1>];\n",
+        j1c33 => "--:-:4:-:1  \@P3 LDG.E.CI loadA6, [track1A + 4x<2>];\n",
+        j1c35 => "--:-:4:-:1  \@P3 LDG.E.CI loadA7, [track1A + 4x<3>];\n",
+
+        j5c29 => "04:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 00>], loadA0;\n",
+        j5c31 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 00>], loadA1;\n",
+        j5c33 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 00>], loadA2;\n",
+        j5c35 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 00>], loadA3;\n",
+
+        # A tracks advance by 4x<8> on every looping iteration.
+        j5c46 => "--:-:-:-:1  \@P0 IADD   track0A0.CC, track0A0, 4x<8>;\n",
+        j5c54 => "--:-:-:-:1  \@P0 IADD.X track0A1,    track0A1, RZ;\n",
+
+        j6c29 => "08:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 64>], loadA4;\n",
+        j6c31 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 64>], loadA5;\n",
+        j6c33 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 64>], loadA6;\n",
+        j6c35 => "--:2:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 64>], loadA7;\n",
+
+        j6c46 => "--:-:-:-:1  \@P0 IADD   track1A0.CC, track1A0, 4x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P0 IADD.X track1A1,    track1A1, RZ;\n",
+
+        # Branch back to LOOP while P0 holds; otherwise go to REMAINDER when P1 is set.
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+            )
+        ),
+
+        # Entries shared by both builds: store B, advance trackB by param_ldb8 (64-bit add
+        # via the .CC/.X pair), then barrier and flip the shared-memory pointers by swapBuf.
+        j4c21 => "02:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",
+
+        j4c22 => "--:-:-:-:1  \@P0 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j4c27 => "--:-:-:-:1  \@P0 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j6c63 => "02:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+
+    );
+    # This block only defines the variables above; it returns no value.
+    return;
+</CODE>
+
+<INCLUDE file="sgemm_common_128x64.sass"/>
diff --git a/Kernel/SGEMM/Pascal/sgemm_nn_32x128.sass b/Kernel/SGEMM/Pascal/sgemm_nn_32x128.sass
new file mode 100644
index 0000000..e25c3a9
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_nn_32x128.sass
@@ -0,0 +1,458 @@
+# Kernel: sgemm_nn_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Constant map: shared-memory sizes plus kernel parameters as c[0x0] offsets; addr_zero = 4x<2*szShareB + 2*szShareA>.
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*16*2 + (32*16 + 32)*2>
+    szShareA : (32*16 + 32)
+    szShareB : (128*16)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+# Register map: ranges overlap (32-79 names setup temporaries and the j0-j3 A/B fragments; 32-47 and 48-120 name the C-output values), so those names share registers - presumably their live ranges are disjoint; confirm before adding uses.
+<REGISTER_MAPPING>
+
+    32-79 ~ tidAX, tidBX, lda, ldb, ldb4, ldaz, ldbz, tid1, tid3, tid96, ta, tb0, tb1, tb2, tb3, xmad_ta, xmad_tb, shiftAX, tidAY<1-3>, tidBY<1-3>, txb<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadA<0-3>
+      84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3>
+
+    100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+
+    110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txa, txb
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;   // thread index; its first consumers below carry wait mask 01
+--:-:2:-:1      S2R blkB, SR_CTAID.Z; // block index along B; its consumer (ISCADD txb) carries wait mask 02
+--:-:3:-:1      S2R blkA, SR_CTAID.Y; // block index along A; its consumer (ISCADD txa) carries wait mask 04
+--:-:4:-:1      S2R blkZ, SR_CTAID.X; // multiplied by ldaz/ldbz below; those XMAD.LO2 lines carry wait mask 08
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;  // ldb = param_ldb8 >> 5
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb4,  ldb, 2;    // ldb * 4: offset between tb0..tb3 (rows tidBY, +4, +8, +12)
+--:-:-:-:1      SHL ldb16, ldb, 6;    // ldb * 64: added to each track*B pointer after a tile is stored
+// Write zeros to addr_zero; the LDS.U.128 [addr_zero] loads that follow read them back as zero fill.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join q{}, map { sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", 4 * $_) } 0 .. 7;  # eight loads: czero00, czero04, ..., czero28
+</CODE>
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 2
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,  2;
+01:-:-:-:1      LOP.AND tid3,    tid,  3;
+--:-:-:-:1      SHL     tidAY,   tid3, 2;
+--:-:-:-:1      SHL     shiftAX, tid3, 3;
+
+// tidBX = (tid & 31) << 2
+// tidBY = (tid >> 5)
+01:-:-:-:1      LOP.AND tidBX, tid,   31;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   5;
+
+// txa = blkA*32 + tidAX;  trackA = param_A + (txa*lda + tidAY + blkZ*ldaz) * 4
+04:-:-:-:1      ISCADD   txa, blkA, tidAX, 5;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,  tidAY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 2;
+
+// txb = blkB*128 + tidBX;  tb0 = txb + ldb*tidBY + blkZ*ldbz;  tb<n> = tb<n-1> + ldb4
+02:-:-:-:1      ISCADD   txb, blkB, tidBX, 7;
+--:-:-:-:1      XMAD.LO2 tb0, ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb0, ldbz, blkZ,  tb0;
+--:-:-:-:1      IADD     tb1, tb0, ldb4;
+--:-:-:-:1      IADD     tb2, tb1, ldb4;
+--:-:-:-:1      IADD     tb3, tb2, ldb4;
+// track<n>B = param_B + tb<n> * 4 (64-bit: low word sets .CC, high word consumes it)
+--:-:-:-:1      LEA      track0B0.CC, tb0, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track0B1,    tb0, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track1B0.CC, tb1, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track1B1,    tb1, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track2B0.CC, tb2, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track2B1,    tb2, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track3B0.CC, tb3, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track3B1,    tb3, param_B[1], RZ, 2;
+
+// writeAs = (tidAY*32 + tidAX + shiftAX) * 4 + 4x<szShareA + szShareB>
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*128 + tidBX) * 4 + 4x<szShareA*2 + szShareB>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 7;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    16;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4) + 4x<szShareA>
+01:-:-:-:1      LOP.AND tid96,  tid,    96;
+--:-:-:-:1      SHR.U32 tid96,  tid96,  2;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readBs, readBs, tid96;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+// swapBuf starts at minus the writeAs base offset; it is added to the write pointers, subtracted from the read pointers, then negated.
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+// tidBY1..tidBY3 = tidBY + 4/8/12: the k-indices tested against k for load1B..load3B in the generated code below.
+--:-:-:-:1      IADD tidBY1, tidBY, 4;
+--:-:-:-:1      IADD tidBY2, tidBY, 8;
+--:-:-:-:1      IADD tidBY3, tidBY, 12;
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidBY3, k, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY,  k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.128 load0B, [track0B];
+--:-:2:-:1  @P1 LDG.E.CI.128 load1B, [track1B];
+--:-:3:-:1  @P2 LDG.E.CI.128 load2B, [track2B];
+--:-:4:-:1  @P3 LDG.E.CI.128 load3B, [track3B];
+--:-:5:-:1  @P4 LDG.E.CI.128 loadA,  [trackA];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P0 LDS.U.128 load0B, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 load1B, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.128 load2B, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load3B, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128 loadA,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI load0B0, [track0B + 4x<0>];
+--:-:1:-:1  @P1 LDG.E.CI load0B1, [track0B + 4x<1>];
+--:-:1:-:1  @P2 LDG.E.CI load0B2, [track0B + 4x<2>];
+--:-:1:-:1  @P3 LDG.E.CI load0B3, [track0B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY1, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI load1B0, [track1B + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI load1B1, [track1B + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI load1B2, [track1B + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI load1B3, [track1B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY2, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P6;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI load2B0, [track2B + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI load2B1, [track2B + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load2B2, [track2B + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI load2B3, [track2B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B0, RZ;
+--:-:-:-:1 @!P1 MOV load2B1, RZ;
+--:-:-:-:1 @!P2 MOV load2B2, RZ;
+--:-:-:-:1 @!P3 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY3, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI load3B0, [track3B + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI load3B1, [track3B + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI load3B2, [track3B + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load3B3, [track3B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B0, RZ;
+--:-:-:-:1 @!P1 MOV load3B1, RZ;
+--:-:-:-:1 @!P2 MOV load3B2, RZ;
+--:-:-:-:1 @!P3 MOV load3B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:5:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:5:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:5:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P5; // P5 here is txb < param_n (both generated variants leave it so); P2 gates the next load0B
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P5; // P3 gates the next load1B
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5; // P5 gates the next load2B and load3B
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6; // P6 here is txa < param_m; afterwards it gates the next loadA
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P0, RZ, k, 15;
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 16, P0; // P1 = bDoRemainder; it predicates the BRA.U REMAINDER in the %insert table
+
+</SCHEDULE_BLOCK>
+
+21:-:-:-:1      STS.128 [writeBs + 4x<0*128>], load0B; // wait mask 21: presumably barriers 1 (LDG load0B) and 6 (zero-fill LDS) - confirm
+--:-:-:-:6      IADD   track0B0.CC, track0B0, ldb16;
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ;
+// Rows tidBY+4, +8 and +12 of the B tile follow, 4*128 elements apart in shared memory.
+02:-:-:-:1      STS.128 [writeBs + 4x<4*128>], load1B;
+--:-:-:-:6      IADD   track1B0.CC, track1B0, ldb16;
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;
+
+04:-:-:-:1      STS.128 [writeBs + 4x<8*128>], load2B;
+--:-:-:-:6      IADD   track2B0.CC, track2B0, ldb16;
+--:-:-:-:0      IADD.X track2B1,    track2B1, RZ;
+
+08:-:-:-:1      STS.128 [writeBs + 4x<12*128>], load3B;
+--:-:-:-:6      IADD   track3B0.CC, track3B0, ldb16;
+--:-:-:-:0      IADD.X track3B1,    track3B1, RZ;
+// A is written as four scalars 32 elements apart, even when it was fetched with one 128-bit load.
+10:-:-:-:1      STS [writeAs + 4x<0*32>], loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 4x<16>; // carry is consumed by the IADD.X on the last line of this section
+--:-:-:-:1      STS [writeAs + 4x<1*32>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*32>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*32>], loadA3;
+// Swap buffers: read pointers move by -swapBuf, write pointers by +swapBuf, then swapBuf is negated.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD.X trackA1,    trackA1, RZ; // relies on no instruction since IADD trackA0.CC having written .CC
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.CI.128 load0B, [track0B];
+--:-:4:-:1  @P3 LDG.E.CI.128 load1B, [track1B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load2B, [track2B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load3B, [track3B];
+--:-:6:-:1  @P6 LDG.E.CI.128 loadA,  [trackA];
+    } : q{
+--:-:3:-:1  @P2 LDG.E.CI load0B0, [track0B + 4x<0>];
+--:-:3:-:1  @P2 LDG.E.CI load0B1, [track0B + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load0B2, [track0B + 4x<2>];
+--:-:3:-:1  @P2 LDG.E.CI load0B3, [track0B + 4x<3>];
+
+--:-:4:-:1  @P3 LDG.E.CI load1B0, [track1B + 4x<0>];
+--:-:4:-:1  @P3 LDG.E.CI load1B1, [track1B + 4x<1>];
+--:-:4:-:1  @P3 LDG.E.CI load1B2, [track1B + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load1B3, [track1B + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI load2B0, [track2B + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load2B1, [track2B + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load2B2, [track2B + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load2B3, [track2B + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI load3B0, [track3B + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load3B1, [track3B + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load3B2, [track3B + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load3B3, [track3B + 4x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI loadA0,  [trackA + 4x<0>];
+--:-:6:-:1  @P6 LDG.E.CI loadA1,  [trackA + 4x<1>];
+--:-:6:-:1  @P6 LDG.E.CI loadA2,  [trackA + 4x<2>];
+--:-:6:-:1  @P6 LDG.E.CI loadA3,  [trackA + 4x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec;            # declared but not assigned here; true selects the 128-bit LDG entries below
+    our $shiftAX = 1;    # package globals; presumably read by the included sgemm_common_32x128.sass - confirm
+    our $shiftBX = 0;
+    our %insert =        # keys look like jNcM; values are SASS lines (presumably spliced into the main loop by the include - confirm)
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+        # While P0 (k >= 16 after the decrement above) holds, store the load* registers to shared memory.
+        j3c6   => "04:3:-:-:1  \@P0 STS.128 [writeBs + 4x< 0*128>], load0B;\n",
+        j5c6   => "08:4:-:-:1  \@P0 STS.128 [writeBs + 4x< 4*128>], load1B;\n",
+        j7c6   => "10:-:-:-:1  \@P0 STS.128 [writeBs + 4x< 8*128>], load2B;\n",
+        j9c6   => "--:5:-:-:1  \@P0 STS.128 [writeBs + 4x<12*128>], load3B;\n",
+        j11c6  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<0*32>], loadA0;\n",
+        j11c8  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], loadA1;\n",
+        j11c10 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], loadA2;\n",
+        j11c12 => "--:6:-:-:1  \@P0 STS [writeAs + 4x<3*32>], loadA3;\n",
+        # Advance each global pointer under the same predicate that gated its load (64-bit add via .CC / .X).
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0B0.CC, track0B0, ldb16;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0B1,    track0B1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, ldb16;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P5 IADD   track2B0.CC, track2B0, ldb16;\n",
+        j7c13  => "--:-:-:-:1  \@P5 IADD.X track2B1,    track2B1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3B0.CC, track3B0, ldb16;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3B1,    track3B1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackA0.CC,  trackA0, 4x<16>;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackA1,     trackA1, RZ;\n",
+        # Refresh the load predicates (k >= 32 and previous value). There is no j7c14: P5 serves load2B and load3B and is updated once at j9c14.
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+        # Barrier, then the same buffer swap as the straight-line code above: reads -= swapBuf, writes += swapBuf, negate swapBuf.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Global loads for the next tile: one 128-bit load per operand when $vec is true, otherwise four 32-bit loads.
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.128 load0B, [track0B];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.128 load1B, [track1B];\n",
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.128 load2B, [track2B];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.128 load3B, [track3B];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.128 loadA,  [trackA];\n",
+            ) :
+            (
+                # Scalar path: only the last load of a group sets a barrier; load2B sets none and load3B's last load sets barrier 5.
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI load0B0, [track0B + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI load0B1, [track0B + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI load0B2, [track0B + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI load0B3, [track0B + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI load1B0, [track1B + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI load1B1, [track1B + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI load1B2, [track1B + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI load1B3, [track1B + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI load2B0, [track2B + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P5 LDG.E.CI load2B1, [track2B + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P5 LDG.E.CI load2B2, [track2B + 4x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P5 LDG.E.CI load2B3, [track2B + 4x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI load3B0, [track3B + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI load3B1, [track3B + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI load3B2, [track3B + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI load3B3, [track3B + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+            )
+        ),
+        # Loop while P0 (k >= 16); otherwise take the REMAINDER path when P1 (bDoRemainder) is set.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="sgemm_common_32x128.sass"/>
diff --git a/Kernel/SGEMM/Pascal/sgemm_nn_rnn_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_nn_rnn_128x32.sass
new file mode 100644
index 0000000..21b493d
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_nn_rnn_128x32.sass
@@ -0,0 +1,512 @@
+# Kernel: sgemm_nn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Constant map: shared-memory sizes plus kernel parameters as c[0x0] offsets; addr_zero = 4x<2*szShareA + 2*szShareB>.
+<CONSTANT_MAPPING>
+    addr_zero : 4x<(128*16 + 32)*2 + 32*16*2>
+    szShareA  : (128*16 + 32)
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]        : c[0x0][0x140]
+    param_C[1]        : c[0x0][0x144]
+    param_A[0]        : c[0x0][0x148]
+    param_A[1]        : c[0x0][0x14c]
+    param_B[0]        : c[0x0][0x150]
+    param_B[1]        : c[0x0][0x154]
+    param_bias[0]     : c[0x0][0x158]
+    param_bias[1]     : c[0x0][0x15c]
+    param_lockAddr[0] : c[0x0][0x160]
+    param_lockAddr[1] : c[0x0][0x164]
+    param_alpha       : c[0x0][0x168]
+    param_beta        : c[0x0][0x16c]
+    param_xcutoff     : c[0x0][0x170]
+    param_flags       : c[0x0][0x174]
+    param_lda         : c[0x0][0x178]
+    param_ldb8        : c[0x0][0x17c]
+    param_ldc         : c[0x0][0x180]
+    param_m           : c[0x0][0x184]
+    param_n           : c[0x0][0x188]
+    param_k           : c[0x0][0x18c]
+    param_ldaz        : c[0x0][0x190]
+    param_ldbz        : c[0x0][0x194]
+    param_ldcz        : c[0x0][0x198]
+    param_loops       : c[0x0][0x19c]
+    param_dimB        : c[0x0][0x1a0]
+    param_dimC        : c[0x0][0x1a4]
+    param_unrolling   : c[0x0][0x1a8]
+    param_numBlks     : c[0x0][0x1ac]
+    param_numAblks    : c[0x0][0x1b0]
+</CONSTANT_MAPPING>
+# Register map: ranges overlap (e.g. baseB<0-1> at 80-81 and loadB<0-3> at 80-83; 32-79 is declared several times), so those names share registers - presumably their live ranges are disjoint; confirm before adding uses.
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, ldb, ldaz, lda32, ldbz, ta00, ta32, ta64, ta96, tb, tid1, tid3, tidAX, tidBX, tidAY<1-3>, txb<1-3>, xmad_ta, offsetB, shiftAX
+    80-81 : baseB<0-1>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, ldb16, k, tidAY, tidBY, txb, txa00, txa32, txa64, txa96
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, time_step
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+    48-61 : bias00y<0-1>, bias04y<0-1>, bias08y<0-1>, bias12y<0-1>, b0, b1, b2, b3, baseC<0-1>
+    62-66 : blkId, nextBlk, lockAddr<0-1>, lockVal
+   67-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags, xcutoff, offsetC, numBlk
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;   // thread index; its first consumers below carry wait mask 01
+--:-:2:-:1      S2R blkA, SR_CTAID.Y; // block index along A; its consumer (ISCADD txa00) carries wait mask 02
+--:-:3:-:1      S2R blkB, SR_CTAID.Z; // block index along B; its consumer (ISCADD txb) carries wait mask 04
+// NOTE(review): both MOVs sit before RNN_LOOP, so they run once; flags comes from the 67-120 pool, which overlaps other mapped ranges - confirm it is still valid if RNN_LOOP is re-entered.
+--:-:-:-:1      MOV time_step, RZ;
+--:-:-:-:1      MOV flags, param_flags;
+
+RNN_LOOP:
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;  // ldb = param_ldb8 >> 5
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL ldb16, ldb, 6;    // ldb * 64: added to trackB after a tile is stored
+--:-:-:-:1      SHL lda32, lda, 5;    // lda * 32: offset between ta00/ta32/ta64/ta96 (rows txa00, +32, +64, +96)
+// Write zeros to addr_zero; the LDS.U.128 [addr_zero] loads that follow read them back as zero fill.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join q{}, map { sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", 4 * $_) } 0 .. 7;  # eight loads: czero00, czero04, ..., czero28
+</CODE>
+// offsetB = (flags & 4) ? param_unrolling - time_step - 1 : time_step
+--:-:-:-:6      LOP.AND.NZ   P0, RZ, flags, 4;
+--:-:-:-:6  @P0 IADD offsetB, -time_step, param_unrolling;
+--:-:-:-:6  @P0 IADD offsetB, offsetB,    -1;
+--:-:-:-:6 @!P0 MOV  offsetB, time_step;
+
+// baseB = param_B + (offsetB * param_dimB) * 4.  NOTE(review): plain XMAD presumably multiplies only the low 16 bits of each operand - confirm param_dimB fits.
+--:-:-:-:1      XMAD     offsetB,   offsetB,   param_dimB, RZ;
+--:-:-:-:1      LEA      baseB0.CC, offsetB,   param_B[0],     2;
+--:-:-:-:1      LEA.HI.X baseB1,    offsetB,   param_B[1], RZ, 2;
+
+// tidAX   = tid >> 2
+// tidAY   = (tid & 3) << 2
+// shiftAX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidAX,   tid,  2;
+01:-:-:-:1      LOP.AND tid3,    tid,  3;
+--:-:-:-:1      SHL     tidAY,   tid3, 2;
+--:-:-:-:1      SHL     shiftAX, tid3, 3;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// txa00 = blkA*128 + tidAX;  track<n>A = param_A + 4 * (txa00*lda + tidAY + n*lda32)
+02:-:-:-:1      ISCADD txa00, blkA, tidAX, 7;
+--:-:-:-:1      IADD   txa32, txa00, 32;
+--:-:-:-:1      IADD   txa64, txa00, 64;
+--:-:-:-:1      IADD   txa96, txa00, 96;
+
+--:-:-:-:1      XMAD.LO  ta00, lda,  txa00,   tidAY, xmad_ta;
+--:-:-:-:1      XMAD.LO2 ta00, ldaz, RZ,      ta00; // ldaz is multiplied by RZ here, so this adds nothing to ta00
+--:-:-:-:1      IADD     ta32, ta00, lda32;
+--:-:-:-:1      IADD     ta64, ta32, lda32;
+--:-:-:-:1      IADD     ta96, ta64, lda32;
+
+--:-:-:-:1      LEA      track0A0.CC, ta00, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track0A1,    ta00, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track1A0.CC, ta32, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track1A1,    ta32, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track2A0.CC, ta64, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track2A1,    ta64, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track3A0.CC, ta96, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track3A1,    ta96, param_A[1], RZ, 2;
+
+// txb = blkB*32 + tidBX;  trackB = baseB + (txb + ldb*tidBY) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidBX, 5;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+--:-:-:-:1      XMAD.LO2 tb,  ldbz, RZ,    tb; // ldbz is multiplied by RZ here, so this adds nothing to tb
+--:-:-:-:1      LEA      trackB0.CC, tb, baseB0,     2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, baseB1, RZ, 2;
+
+// writeAs = (tidAY*128 + tidAX + shiftAX) * 4 + 4x<szShareA + szShareB>
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+// swapBuf starts at minus the writeAs base offset; it is added to the write pointers, subtracted from the read pointers, then negated.
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY, k, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY, k, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P2 LDG.E.128 load0A, [track0A];
+--:-:2:-:1  @P3 LDG.E.128 load1A, [track1A];
+--:-:3:-:1  @P4 LDG.E.128 load2A, [track2A];
+--:-:4:-:1  @P5 LDG.E.128 load3A, [track3A];
+--:-:5:-:1  @P6 LDG.E.128 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load1A, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128 load2A, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.128 load3A, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.128 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidAY1, tidAY, 1;
+--:-:-:-:1      IADD tidAY2, tidAY, 2;
+--:-:-:-:1      IADD tidAY3, tidAY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E load0A0, [track0A + 4x<0>];
+--:-:1:-:1  @P1 LDG.E load0A1, [track0A + 4x<1>];
+--:-:1:-:1  @P2 LDG.E load0A2, [track0A + 4x<2>];
+--:-:1:-:1  @P3 LDG.E load0A3, [track0A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E load1A0, [track1A + 4x<0>];
+--:-:2:-:1  @P1 LDG.E load1A1, [track1A + 4x<1>];
+--:-:2:-:1  @P2 LDG.E load1A2, [track1A + 4x<2>];
+--:-:2:-:1  @P3 LDG.E load1A3, [track1A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa64, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P4;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E load2A0, [track2A + 4x<0>];
+--:-:3:-:1  @P1 LDG.E load2A1, [track2A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E load2A2, [track2A + 4x<2>];
+--:-:3:-:1  @P3 LDG.E load2A3, [track2A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa96, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E load3A0, [track3A + 4x<0>];
+--:-:4:-:1  @P1 LDG.E load3A1, [track3A + 4x<1>];
+--:-:4:-:1  @P2 LDG.E load3A2, [track3A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E load3A3, [track3A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E loadB0, [trackB + 4x<0>];
+--:-:5:-:1  @P1 LDG.E loadB1, [trackB + 4x<1>];
+--:-:5:-:1  @P2 LDG.E loadB2, [trackB + 4x<2>];
+--:-:5:-:1  @P3 LDG.E loadB3, [trackB + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa00, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa32, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb,   param_n, PT;
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2; // P2..P5 then gate the next load0A..load3A
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6; // P6 then gates the next loadB
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P1, RZ, k, 15;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P1; // P1 = bDoRemainder
+
+</SCHEDULE_BLOCK>
+
+21:-:-:-:1      STS [writeAs + 4x<0*128 + 0*32>], load0A0; // wait mask 21: presumably barriers 1 (LDG load0A) and 6 (zero-fill LDS) - confirm
+--:-:-:-:0      IADD   track0A0.CC, track0A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 0*32>], load0A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 0*32>], load0A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 0*32>], load0A3;
+// Each A group is stored as four scalars 128 elements apart; groups 0-3 are offset by 0/32/64/96 elements.
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+02:-:-:-:1      STS [writeAs + 4x<0*128 + 1*32>], load1A0;
+--:-:-:-:0      IADD   track1A0.CC, track1A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 1*32>], load1A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 1*32>], load1A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 1*32>], load1A3;
+
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS [writeAs + 4x<0*128 + 2*32>], load2A0;
+--:-:-:-:0      IADD   track2A0.CC, track2A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 2*32>], load2A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 2*32>], load2A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 2*32>], load2A3;
+
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS [writeAs + 4x<0*128 + 3*32>], load3A0;
+--:-:-:-:0      IADD   track3A0.CC, track3A0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*128 + 3*32>], load3A1;
+--:-:-:-:1      STS [writeAs + 4x<2*128 + 3*32>], load3A2;
+--:-:-:-:4      STS [writeAs + 4x<3*128 + 3*32>], load3A3;
+
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+10:-:-:-:1      STS.128 [writeBs], loadB;
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16; // carry is consumed by the IADD.X on the last line of this section
+// Swap buffers: read pointers move by -swapBuf, write pointers by +swapBuf, then swapBuf is negated.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ; // relies on no instruction since IADD trackB0.CC having written .CC
+
+<CODE>
+    # Returns the SASS text for the predicated global loads: P2-P5 guard the four
+    # A sub-tiles (load0A..load3A) and P6 guards B.
+    # When $vec is true each operand is fetched with one LDG.E.128; otherwise the
+    # same registers are filled by four scalar LDG.E at offsets 4x<0>..4x<3>.
+    our $vec;
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.128 load0A, [track0A];
+--:-:4:-:1  @P3 LDG.E.128 load1A, [track1A];
+--:-:5:-:1  @P4 LDG.E.128 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.128 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.128 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P2 LDG.E load0A0, [track0A + 4x<0>];
+--:-:3:-:1  @P2 LDG.E load0A1, [track0A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E load0A2, [track0A + 4x<2>];
+--:-:3:-:1  @P2 LDG.E load0A3, [track0A + 4x<3>];
+
+--:-:4:-:1  @P3 LDG.E load1A0, [track1A + 4x<0>];
+--:-:4:-:1  @P3 LDG.E load1A1, [track1A + 4x<1>];
+--:-:4:-:1  @P3 LDG.E load1A2, [track1A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E load1A3, [track1A + 4x<3>];
+
+--:-:5:-:1  @P4 LDG.E load2A0, [track2A + 4x<0>];
+--:-:5:-:1  @P4 LDG.E load2A1, [track2A + 4x<1>];
+--:-:5:-:1  @P4 LDG.E load2A2, [track2A + 4x<2>];
+--:-:5:-:1  @P4 LDG.E load2A3, [track2A + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E load3A0, [track3A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E load3A1, [track3A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E load3A2, [track3A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E load3A3, [track3A + 4x<3>];
+
+--:-:6:-:1  @P6 LDG.E loadB0, [trackB + 4x<0>];
+--:-:6:-:1  @P6 LDG.E loadB1, [trackB + 4x<1>];
+--:-:6:-:1  @P6 LDG.E loadB2, [trackB + 4x<2>];
+--:-:6:-:1  @P6 LDG.E loadB3, [trackB + 4x<3>];
+    };
+</CODE>
+
+<CODE>
+    # Loop schedule for this kernel. The block returns the empty string, so it
+    # emits no text itself; it only sets the package variables $shiftAX, $shiftBX
+    # and %insert. These are not consumed in this block -- presumably the included
+    # sgemm_rnn_common_128x32.sass reads them when it builds the main loop
+    # (confirm there).
+    #
+    # %insert maps a position key of the form j<N>c<M> to SASS text.
+    our $vec;
+    our $shiftAX = 1;
+    our $shiftBX = 0;
+    our %insert =
+    (
+        # Loop counter: k -= 16, then P0 = (k >= 16).
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        # Stores of the loaded A sub-tiles to shared memory, all guarded by P0.
+        j3c6   => "04:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 0*32>], load0A0;\n",
+        j3c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 0*32>], load0A1;\n",
+        j3c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 0*32>], load0A2;\n",
+        j3c12  => "--:3:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 0*32>], load0A3;\n",
+
+        j5c6   => "08:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 1*32>], load1A0;\n",
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 1*32>], load1A1;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 1*32>], load1A2;\n",
+        j5c12  => "--:4:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 1*32>], load1A3;\n",
+
+        j7c6   => "10:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 2*32>], load2A0;\n",
+        j7c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 2*32>], load2A1;\n",
+        j7c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 2*32>], load2A2;\n",
+        j7c12  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 2*32>], load2A3;\n",
+
+        j9c6   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 3*32>], load3A0;\n",
+        j9c8   => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 3*32>], load3A1;\n",
+        j9c10  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 3*32>], load3A2;\n",
+        j9c12  => "--:5:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 3*32>], load3A3;\n",
+
+        # B is stored with one 128-bit STS.
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",
+
+        # Advance each track pointer (low word with .CC, high word with IADD.X),
+        # guarded by the same predicate that guards the matching load below.
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 4x<16>;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, 4x<16>;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P4 IADD   track2A0.CC, track2A0, 4x<16>;\n",
+        j7c13  => "--:-:-:-:1  \@P4 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, 4x<16>;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+
+        # Refresh the load predicates: Pn stays set only while k >= 32.
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j7c14  => "--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        # Barrier, then flip the read/write pointers by -/+ swapBuf and negate swapBuf.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        # Global loads: one 128-bit LDG per operand when $vec is true,
+        # otherwise four scalar LDGs per operand spread over more positions.
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.128 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.128 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.128 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.128 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.128 loadB,  [trackB];\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E load0A0, [track0A + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E load0A1, [track0A + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E load0A2, [track0A + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E load0A3, [track0A + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E load1A0, [track1A + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E load1A1, [track1A + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E load1A2, [track1A + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E load1A3, [track1A + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E load2A0, [track2A + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P4 LDG.E load2A1, [track2A + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P4 LDG.E load2A2, [track2A + 4x<2>];\n",
+                j10c3  => "--:-:5:-:1  \@P4 LDG.E load2A3, [track2A + 4x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E load3A0, [track3A + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E load3A1, [track3A + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E load3A2, [track3A + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E load3A3, [track3A + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E loadB0, [trackB + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E loadB1, [trackB + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E loadB2, [trackB + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+
+        # Branch back to LOOP while P0 holds; otherwise go to REMAINDER when P1 holds.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="sgemm_rnn_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Pascal/sgemm_nt_128x128.sass b/Kernel/SGEMM/Pascal/sgemm_nt_128x128.sass
new file mode 100644
index 0000000..e01b4b5
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_nt_128x128.sass
@@ -0,0 +1,339 @@
+# Kernel: sgemm_nt_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Constant map for sgemm_nt_128x128.
+# addr_zero is the shared-memory address the setup code fills with RZ and reads
+# back to clear registers. The remaining names (gridDim*, param_*) alias fixed
+# word offsets in constant bank c[0x0]; pointer parameters occupy two consecutive
+# words ([0] and [1]).
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+# Register map for sgemm_nt_128x128: binds symbolic names to register numbers.
+# Lines use either ':' or '~' between the register list and the names; the exact
+# difference is defined by the assembler (presumably ':' binds positionally and
+# '~' lets the assembler choose within the range -- confirm in the assembler docs).
+# Several ranges overlap, so the same registers carry different names in
+# different parts of the kernel: 64-95 holds both the setup temporaries and the
+# j0/j1 operand registers, and 64-121 is named again for the ldc/c<0-7>/C00y...
+# group. The eight cx<0-7>y<N> rows list explicit register numbers that fall
+# inside the 0-63 range also named czero<00-63>.
+<REGISTER_MAPPING>
+
+    64-95   ~ blkA, blkB, blkZ, tidX, blk, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, tid127, txa, txb, xmad_ta, xmad_tb, tid128
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-67   ~ k1, k2, k3
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-111  : loadA<0-7>,  loadB<0-7>
+    112-115 : trackA<0-1>, trackB<0-1>
+
+    116-121 ~ writeS, k, tidY, ta, tb, loop
+    122-127 ~ readAs, readBs, tid
+
+    64-75   ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    86-121  ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+// Kernel setup: fetch the thread id and the three block ids. Each S2R carries a
+// different value (1-4) in the third control field, and the first consumer of each
+// result below carries the matching mask (01/02/04/08) in the first field --
+// presumably scoreboard barriers; confirm against the assembler's control-code docs.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,  param_k;
+--:-:-:-:1      LOP.AND tid1, tid,  1;
+
+// Clear czero00..czero63: store RZ at addr_zero, then read it back with sixteen
+// 128-bit loads (czero00, czero04, ..., czero60) generated by the Perl snippet.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+--:-:-:-:1      MOV loop, RZ;
+--:-:-:-:1      MOV lda, param_lda;
+--:-:-:-:1      MOV ldb, param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// tidY  = tid1 << 2
+--:-:-:-:1      SHL tidY, tid1, 2;
+
+// tidX = tid >> 1
+01:-:-:-:1      SHR.U32 tidX, tid, 1;
+
+// trackA = param_A + 4 * ((blkA*128 + tidX) * lda + tidY + blkZ*ldaz)
+// (the blkZ*ldaz term comes from the XMAD.LO2 line; the 64-bit address is built
+// as a low/high pair with LEA .CC followed by LEA.HI.X)
+02:-:-:-:1      ISCADD   txa, blkA, tidX, 7;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,  tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0], 0x2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x2;
+
+// trackB = param_B + 4 * ((blkB*128 + tidX) * ldb + tidY + blkZ*ldbz)
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 7;
+--:-:-:-:1      XMAD.LO  tb,  ldb,  txb,  tidY, xmad_tb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0], 0x2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x2;
+
+// Bounds predicates: P5 = txa < m, P6 = txb < n
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+// writeS = 4 * (128 * tidY + tidX)
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+
+// Toggle writeS by 4x<128*8*2>, the same constant that is XORed into readAs,
+// readBs and writeS at every buffer swap later in this file.
+--:-:-:-:1      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((((tid & 128) >> 4) | ((tid >> 1) & 7)) << 4) + 4x<128*8>   (i.e. + 4096)
+--:-:-:-:1      LOP.AND tid128, tid,  128;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<CODE>
+    # Tile load at the REMAINDER label. Execution falls into it after setup, and
+    # the scalar loop schedule later in this file branches back here with
+    # "BRA.U REMAINDER". Returns one of two SASS fragments:
+    #
+    #  $vec true : 128-bit loads of loadA0/loadA4 (guarded by P5) and
+    #              loadB0/loadB4 (guarded by P6); when the guard is false the
+    #              registers are filled from [addr_zero] instead. P1 is written
+    #              by a PSETP whose inputs are all PT. loadA0-3 and loadB0-3 are
+    #              stored to shared memory and both track pointers advance by
+    #              4x<16>.
+    #  otherwise : scalar loads in which element i is additionally guarded by
+    #              (tidY + i) < k; elements that are not loaded are set to RZ.
+    #              P1 ends up as (k & 7) != 0 && k > 8, and the track pointers
+    #              advance by 4x<8>.
+    #
+    # Both fragments finish by XORing readAs, readBs and writeS with
+    # 4x<128*8*2> around a BAR.SYNC.
+    our $vec;
+    return $vec ? q{
+
+// k must be multiple of 8
+--:-:2:-:1  @P5 LDG.E.CI.128 loadA0, [trackA + 4x<0>];
+--:-:2:-:1  @P5 LDG.E.CI.128 loadA4, [trackA + 4x<8>];
+
+--:-:3:-:1  @P6 LDG.E.CI.128 loadB0, [trackB + 4x<0>];
+--:5:4:-:1  @P6 LDG.E.CI.128 loadB4, [trackB + 4x<8>];
+
+--:-:-:-:1 @!P5 LDS.U.128 loadA0, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.128 loadB0, [addr_zero];
+--:-:-:-:1 @!P5 LDS.U.128 loadA4, [addr_zero];
+--:-:-:-:1 @!P6 LDS.U.128 loadB4, [addr_zero];
+
+--:-:-:-:0      PSETP.AND.AND P1, PT, PT, PT, PT;
+
+22:-:-:-:1      STS [writeS + 4x<0*128>], loadA0;
+--:-:-:-:1      STS [writeS + 4x<1*128>], loadA1;
+--:-:-:-:1      STS [writeS + 4x<2*128>], loadA2;
+--:-:-:-:1      STS [writeS + 4x<3*128>], loadA3;
+
+--:-:-:-:6      IADD   trackA0.CC, trackA0, 4x<16>;
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+24:-:-:-:1      STS [writeS + 4x< 8*128>], loadB0;
+--:-:-:-:1      STS [writeS + 4x< 9*128>], loadB1;
+--:-:-:-:1      STS [writeS + 4x<10*128>], loadB2;
+--:-:-:-:1      STS [writeS + 4x<11*128>], loadB3;
+
+10:-:-:-:6      IADD   trackB0.CC, trackB0, 4x<16>;
+--:-:-:-:1      IADD.X trackB1, trackB1, RZ;
+
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:0      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+    } : q{
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      IADD k1, tidY, 1;
+--:-:-:-:1      IADD k2, tidY, 2;
+--:-:-:-:1      IADD k3, tidY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;
+
+--:-:2:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P6;
+
+--:-:3:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+</SCHEDULE_BLOCK>
+
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 7;
+
+02:-:-:-:1      STS [writeS + 4x<0*128>], loadA0;
+--:-:-:-:1      STS [writeS + 4x<1*128>], loadA1;
+--:-:-:-:1      STS [writeS + 4x<2*128>], loadA2;
+--:-:-:-:1      STS [writeS + 4x<3*128>], loadA3;
+
+--:-:-:-:6      IADD   trackA0.CC, trackA0, 4x<8>;
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+04:-:-:-:1      STS [writeS + 4x< 8*128>], loadB0;
+--:-:-:-:1      STS [writeS + 4x< 9*128>], loadB1;
+--:-:-:-:1      STS [writeS + 4x<10*128>], loadB2;
+--:-:-:-:1      STS [writeS + 4x<11*128>], loadB3;
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, 4x<8>;
+--:-:-:-:1      IADD.X trackB1, trackB1, RZ;
+
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, P1;
+    };
+</CODE>
+
+<CODE>
+    # Loop schedule for sgemm_nt_128x128. The block emits no text (it returns
+    # the empty string); it only sets the package variables @top and %insert.
+    # Neither is consumed inside this block -- presumably the included
+    # sgemm_common_128x128.sass splices them into the main loop (confirm there).
+    #
+    #   @top    - one ISETP that derives a "k >= 16" predicate
+    #             (P0 when $vec is true, P2 combined with P5 otherwise).
+    #   %insert - SASS text keyed by a position of the form j<N>c<M>.
+    #
+    # $vec selects between the 128-bit (LDG.E.CI.128) schedule and the scalar
+    # (LDG.E.CI) schedule.
+    our $vec;
+    our @top = $vec ?
+        ("--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n") :
+        ("--:-:-:-:1      ISETP.GE.AND P2, PT, k, 16, P5;\n");
+    our %insert =
+    (
+        ($vec ?
+            (
+        # Vector schedule. P1 is inverted on every pass; loads and pointer
+        # updates run under P2/P3 = P0 & P1 & (P5 | P6 respectively), the
+        # @!P1 stores write loadA4-7/loadB4-7 and the @P1 stores write
+        # loadA0-3/loadB0-3.
+        j0c1  => "--:-:-:-:1      PSETP.AND.AND P1, PT, !P1, PT, PT;\n",
+        j0c13 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P5;\n",
+        j0c14 => "--:-:-:-:1      PSETP.AND.AND P3, PT, P0, P1, P6;\n",
+
+        j0c27 => "--:-:2:-:1  \@P2 LDG.E.CI.128 loadA0, [trackA + 4x<0>];\n",
+        j0c29 => "--:-:2:-:1  \@P2 LDG.E.CI.128 loadA4, [trackA + 4x<8>];\n",
+
+        j0c31 => "--:-:3:-:1  \@P3 LDG.E.CI.128 loadB0, [trackB + 4x<0>];\n",
+        j0c33 => "08:5:4:-:1  \@P3 LDG.E.CI.128 loadB4, [trackB + 4x<8>];\n",
+
+        j3c29 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<0*128>], loadA4;\n",
+        j3c31 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<1*128>], loadA5;\n",
+        j3c33 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<2*128>], loadA6;\n",
+        j3c35 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<3*128>], loadA7;\n",
+
+        j4c29 => "--:-:-:-:1 \@!P1 STS [writeS + 4x< 8*128>], loadB4;\n",
+        j4c31 => "--:-:-:-:1 \@!P1 STS [writeS + 4x< 9*128>], loadB5;\n",
+        j4c33 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<10*128>], loadB6;\n",
+        j4c35 => "--:-:-:-:1 \@!P1 STS [writeS + 4x<11*128>], loadB7;\n",
+
+        j5c29 => "02:-:-:-:1  \@P1 STS [writeS + 4x<0*128>], loadA0;\n",
+        j5c31 => "--:-:-:-:1  \@P1 STS [writeS + 4x<1*128>], loadA1;\n",
+        j5c33 => "--:-:-:-:1  \@P1 STS [writeS + 4x<2*128>], loadA2;\n",
+        j5c35 => "--:-:-:-:1  \@P1 STS [writeS + 4x<3*128>], loadA3;\n",
+
+        j6c29 => "04:-:-:-:1  \@P1 STS [writeS + 4x< 8*128>], loadB0;\n",
+        j6c31 => "--:-:-:-:1  \@P1 STS [writeS + 4x< 9*128>], loadB1;\n",
+        j6c33 => "--:-:-:-:1  \@P1 STS [writeS + 4x<10*128>], loadB2;\n",
+        j6c35 => "--:2:-:-:1  \@P1 STS [writeS + 4x<11*128>], loadB3;\n",
+
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 4x<16>;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c46 => "10:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, 4x<16>;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
+            ) :
+            (
+        # Scalar schedule. P3 = (k >= 16) & P6 guards the B loads, P2 (set in
+        # @top) guards the A loads, and P0 = (k >= 16) guards the stores and
+        # the loop branch. When P0 is false, P1 sends control to REMAINDER.
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 16, P6;\n",
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+        j0c29 => "--:-:2:-:1  \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+        j0c31 => "--:-:2:-:1  \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+        j0c33 => "--:-:2:-:1  \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+
+        j0c35 => "--:-:3:-:1  \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+        j1c27 => "--:-:3:-:1  \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+        j1c29 => "--:-:3:-:1  \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+        j1c31 => "--:-:3:-:1  \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+
+        j5c29 => "02:-:-:-:1  \@P0 STS [writeS + 4x<0*128>], loadA0;\n",
+        j5c31 => "--:-:-:-:1  \@P0 STS [writeS + 4x<1*128>], loadA1;\n",
+        j5c33 => "--:-:-:-:1  \@P0 STS [writeS + 4x<2*128>], loadA2;\n",
+        j5c35 => "--:-:-:-:1  \@P0 STS [writeS + 4x<3*128>], loadA3;\n",
+
+        j6c29 => "04:-:-:-:1  \@P0 STS [writeS + 4x< 8*128>], loadB0;\n",
+        j6c31 => "--:-:-:-:1  \@P0 STS [writeS + 4x< 9*128>], loadB1;\n",
+        j6c33 => "--:-:-:-:1  \@P0 STS [writeS + 4x<10*128>], loadB2;\n",
+        j6c35 => "--:2:-:-:1  \@P0 STS [writeS + 4x<11*128>], loadB3;\n",
+
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, 4x<8>;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c46 => "--:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, 4x<8>;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+            ),
+        ),
+
+        # Common to both schedules: barrier, flip the shared-memory buffers by
+        # XORing the pointers with 4x<128*8*2>, then k -= 8 (unpredicated).
+        j6c63 => "02:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1      IADD32I k, k, -8;\n",
+    );
+    # Return an explicit empty string (not undef) so the block expands to
+    # nothing, matching the other schedule-only CODE blocks in this patch.
+    return '';
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Pascal/sgemm_nt_32x128.sass b/Kernel/SGEMM/Pascal/sgemm_nt_32x128.sass
new file mode 100644
index 0000000..339c825
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_nt_32x128.sass
@@ -0,0 +1,483 @@
+# Kernel: sgemm_nt_32x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Constant map for sgemm_nt_32x128.
+# szShareA and szShareB are the two shared-memory region sizes used in the
+# address arithmetic below (they appear inside 4x<...> expressions); addr_zero is
+# 4x<2*szShareB + 2*szShareA> written out numerically, and is the address the
+# setup code fills with RZ and reads back to clear registers.
+# The remaining names (gridDim*, param_*) alias fixed word offsets in constant
+# bank c[0x0]; pointer parameters occupy two consecutive words ([0] and [1]).
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<(128*16 + 32)*2 + (32*16 + 32)*2>
+    szShareA : (32*16 + 32)
+    szShareB : (128*16 + 32)
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda       : c[0x0][0x164]
+    param_ldb       : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+# Register map for sgemm_nt_32x128: binds symbolic names to register numbers.
+# Lines use either ':' or '~' between the register list and the names; the exact
+# difference is defined by the assembler (presumably ':' binds positionally and
+# '~' lets the assembler choose within the range -- confirm in the assembler docs).
+# Several ranges overlap, so the same registers carry different names in
+# different parts of the kernel: 32-79 holds both the setup temporaries and the
+# j0..j3 operand registers, and 32-120 is named again for the C00y.../c<0-3>/ldc
+# group. The eight cx<0-3>y<N> rows list explicit register numbers that fall
+# inside the 0-31 range also named czero<00-31>.
+<REGISTER_MAPPING>
+
+    32-79 ~ tidX, lda, ldb, ldaz, ldbz, ldb32, tid1, tid3, tid96, ta, tb00, tb32, tb64, tb96, xmad_ta, xmad_tb, shiftX, tidY<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadA<0-3>
+      84-99 : load0B<0-3>, load1B<0-3>, load2B<0-3>, load3B<0-3>
+
+    100-109 : trackA<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>
+
+    110-120 ~ writeAs, writeBs, k, tidY, txa, txb00, txb32, txb64, txb96
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc12, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+// Kernel setup: fetch the thread id and the three block ids. Each S2R carries a
+// different value (1-4) in the third control field, and the first consumer of each
+// result below carries the matching mask (01/02/04/08) in the first field --
+// presumably scoreboard barriers; confirm against the assembler's control-code docs.
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkB, SR_CTAID.Z;
+--:-:3:-:1      S2R blkA, SR_CTAID.Y;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda;
+--:-:-:-:1      MOV ldb,  param_ldb;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+// ldb32 = ldb * 32, the step between the four B row groups (tb00/tb32/tb64/tb96)
+--:-:-:-:1      SHL ldb32, ldb, 5;
+
+// Clear czero00..czero31: store RZ at addr_zero, then read it back with eight
+// 128-bit loads (czero00, czero04, ..., czero28) generated by the Perl snippet.
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidX   = tid >> 2
+// tidY   = (tid & 3) << 2
+// shiftX = (tid & 3) << 3
+01:-:-:-:1      SHR.U32 tidX, tid,  2;
+01:-:-:-:1      LOP.AND tid3, tid,  3;
+--:-:-:-:1      SHL     tidY, tid3, 2;
+--:-:-:-:1      SHL     shiftX, tid3, 3;
+
+// trackA = param_A + ((blkA*32 + tidX) * lda + tidY + blkZ*ldaz) * 4
+// (the blkZ*ldaz term comes from the XMAD.LO2 line; the 64-bit address is built
+// as a low/high pair with LEA .CC followed by LEA.HI.X)
+04:-:-:-:1      ISCADD   txa, blkA, tidX, 5;
+--:-:-:-:1      XMAD.LO  ta,  lda,  txa,  tidY, xmad_ta;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 2;
+
+// track0B = param_B + ((blkB*128 + tidX) * ldb + tidY + blkZ*ldbz) * 4
+// track1B..track3B address the rows 32, 64 and 96 further on (tb += ldb32 each time).
+02:-:-:-:1      ISCADD txb00, blkB, tidX, 7;
+--:-:-:-:1      IADD   txb32, txb00, 32;
+--:-:-:-:1      IADD   txb64, txb00, 64;
+--:-:-:-:1      IADD   txb96, txb00, 96;
+
+--:-:-:-:1      XMAD.LO  tb00, ldb,  txb00, tidY, xmad_tb;
+08:-:-:-:1      XMAD.LO2 tb00, ldbz, blkZ, tb00;
+--:-:-:-:1      IADD     tb32, tb00, ldb32;
+--:-:-:-:1      IADD     tb64, tb32, ldb32;
+--:-:-:-:1      IADD     tb96, tb64, ldb32;
+
+--:-:-:-:1      LEA      track0B0.CC, tb00, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track0B1,    tb00, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track1B0.CC, tb32, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track1B1,    tb32, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track2B0.CC, tb64, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track2B1,    tb64, param_B[1], RZ, 2;
+--:-:-:-:1      LEA      track3B0.CC, tb96, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X track3B1,    tb96, param_B[1], RZ, 2;
+
+// writeAs = (tidY*32 + tidX + shiftX) * 4 + 4x<szShareA + szShareB>
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 5;
+--:-:-:-:1      IADD   writeAs, writeAs, shiftX;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidY*128 + tidX + shiftX) * 4 + 4x<szShareA*2 + szShareB>
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 7;
+--:-:-:-:1      IADD   writeBs, writeBs, shiftX;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    16;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+// readBs = ((((tid >> 1) & 7) | ((tid & 96) >> 2)) << 4) + 4x<szShareA>
+01:-:-:-:1      LOP.AND tid96,  tid,    96;
+--:-:-:-:1      SHR.U32 tid96,  tid96,  2;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      LOP.OR  readBs, readBs, tid96;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+// swapBuf starts negative; each buffer swap adds/subtracts it and then negates it.
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+<CODE>
+    # Tile load at the REMAINDER label (execution falls into it after setup).
+    # Returns one of two SASS fragments; both leave P2..P6 holding the row
+    # bounds checks for txb00, txb32, txb64, txb96 (against param_n) and txa
+    # (against param_m), which the ISETP.GE lines after this block then
+    # combine with k >= 32.
+    #
+    #  $vec true : P2..P6 are the bounds checks combined with tidY < k; each
+    #              operand is fetched with one LDG.E.CI.128, or filled from
+    #              [addr_zero] when its predicate is false.
+    #  otherwise : for each operand in turn, P0..P3 guard elements 0..3 with
+    #              (tidY + i) < k combined with that operand's bounds
+    #              predicate; elements that are not loaded are set to RZ.
+    #              P4 and P5 are reused alternately for the B bounds checks,
+    #              so they end up holding the txb64 and txb96 results, and P6
+    #              holds the txa result. P2 and P3 were overwritten as
+    #              per-element predicates, so the fragment recomputes them
+    #              for txb00 and txb32 at its end.
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb32, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb64, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb96, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa,   param_m, PT;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY, k, P2;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY, k, P3;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidY, k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P2 LDG.E.CI.128 load0B, [track0B];
+--:-:2:-:1  @P3 LDG.E.CI.128 load1B, [track1B];
+--:-:3:-:1  @P4 LDG.E.CI.128 load2B, [track2B];
+--:-:4:-:1  @P5 LDG.E.CI.128 load3B, [track3B];
+--:-:5:-:1  @P6 LDG.E.CI.128 loadA,  [trackA];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P2 LDS.U.128 load0B, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load1B, [addr_zero];
+--:-:6:-:1 @!P4 LDS.U.128 load2B, [addr_zero];
+--:-:6:-:1 @!P5 LDS.U.128 load3B, [addr_zero];
+--:-:6:-:1 @!P6 LDS.U.128 loadA,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD tidY1, tidY, 1;
+--:-:-:-:1      IADD tidY2, tidY, 2;
+--:-:-:-:1      IADD tidY3, tidY, 3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI load0B0, [track0B + 4x<0>];
+--:-:1:-:1  @P1 LDG.E.CI load0B1, [track0B + 4x<1>];
+--:-:1:-:1  @P2 LDG.E.CI load0B2, [track0B + 4x<2>];
+--:-:1:-:1  @P3 LDG.E.CI load0B3, [track0B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0B0, RZ;
+--:-:-:-:1 @!P1 MOV load0B1, RZ;
+--:-:-:-:1 @!P2 MOV load0B2, RZ;
+--:-:-:-:1 @!P3 MOV load0B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb32, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI load1B0, [track1B + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI load1B1, [track1B + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI load1B2, [track1B + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI load1B3, [track1B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1B0, RZ;
+--:-:-:-:1 @!P1 MOV load1B1, RZ;
+--:-:-:-:1 @!P2 MOV load1B2, RZ;
+--:-:-:-:1 @!P3 MOV load1B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, txb64, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P4;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI load2B0, [track2B + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI load2B1, [track2B + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load2B2, [track2B + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI load2B3, [track2B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2B0, RZ;
+--:-:-:-:1 @!P1 MOV load2B1, RZ;
+--:-:-:-:1 @!P2 MOV load2B2, RZ;
+--:-:-:-:1 @!P3 MOV load2B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txb96, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI load3B0, [track3B + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI load3B1, [track3B + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI load3B2, [track3B + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load3B3, [track3B + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3B0, RZ;
+--:-:-:-:1 @!P1 MOV load3B1, RZ;
+--:-:-:-:1 @!P2 MOV load3B2, RZ;
+--:-:-:-:1 @!P3 MOV load3B3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY,  k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY1, k, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY2, k, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY3, k, P6;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:5:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:5:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:5:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb00, param_n, PT;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb32, param_n, PT;
+    };
+</CODE>
+
+// Keep each load predicate only while at least 32 values of k remain.
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;
+--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:1      LOP.AND.NZ P0, RZ, k, 15;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P0;
+
+</SCHEDULE_BLOCK>
+
+// Write the loaded B sub-tiles (load0B..load3B) and A (loadA0..loadA3) to shared
+// memory and advance the global track pointers by 4x<16>. Each track pointer is a
+// low/high register pair: IADD with .CC updates the low word, the following IADD.X
+// adds RZ to the high word to absorb the carry.
+21:-:-:-:1      STS [writeBs + 4x<0*128 + 0*32>], load0B0;
+--:-:-:-:0      IADD   track0B0.CC, track0B0, 4x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 0*32>], load0B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 0*32>], load0B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 0*32>], load0B3;
+
+--:-:-:-:0      IADD.X track0B1,    track0B1, RZ;
+
+02:-:-:-:1      STS [writeBs + 4x<0*128 + 1*32>], load1B0;
+--:-:-:-:0      IADD   track1B0.CC, track1B0, 4x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 1*32>], load1B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 1*32>], load1B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 1*32>], load1B3;
+
+--:-:-:-:0      IADD.X track1B1,    track1B1, RZ;
+
+04:-:-:-:1      STS [writeBs + 4x<0*128 + 2*32>], load2B0;
+--:-:-:-:0      IADD   track2B0.CC, track2B0, 4x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 2*32>], load2B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 2*32>], load2B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 2*32>], load2B3;
+
+--:-:-:-:0      IADD.X track2B1,    track2B1, RZ;
+
+08:-:-:-:1      STS [writeBs + 4x<0*128 + 3*32>], load3B0;
+--:-:-:-:0      IADD   track3B0.CC, track3B0, 4x<16>;
+--:-:-:-:1      STS [writeBs + 4x<1*128 + 3*32>], load3B1;
+--:-:-:-:1      STS [writeBs + 4x<2*128 + 3*32>], load3B2;
+--:-:-:-:4      STS [writeBs + 4x<3*128 + 3*32>], load3B3;
+
+--:-:-:-:0      IADD.X track3B1,    track3B1, RZ;
+
+// A is written as four 32-bit stores, 4x<32> apart.
+10:-:-:-:1      STS [writeAs + 4x<0*32>], loadA0;
+--:-:-:-:0      IADD   trackA0.CC, trackA0, 4x<16>;
+--:-:-:-:1      STS [writeAs + 4x<1*32>], loadA1;
+--:-:-:-:1      STS [writeAs + 4x<2*32>], loadA2;
+--:-:-:-:1      STS [writeAs + 4x<3*32>], loadA3;
+
+// Flip buffers: the read pointers move by -swapBuf before the barrier, the write
+// pointers move by +swapBuf after it, and swapBuf is then negated for the next flip.
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;
+
+// High word of the trackA update whose low word (trackA0.CC) was issued above.
+--:-:-:-:0      IADD.X trackA1,    trackA1, RZ;
+
+<CODE>
+    our $vec; # package flag, not assigned in this block: true returns the one-LDG.E.CI.128-per-tile text, false the four-LDG.E.CI-per-tile text
+    return $vec ? q{
+--:-:3:-:1  @P2 LDG.E.CI.128 load0B, [track0B];
+--:-:4:-:1  @P3 LDG.E.CI.128 load1B, [track1B];
+--:-:5:-:1  @P4 LDG.E.CI.128 load2B, [track2B];
+--:-:5:-:1  @P5 LDG.E.CI.128 load3B, [track3B];
+--:-:6:-:1  @P6 LDG.E.CI.128 loadA,  [trackA];
+    } : q{
+--:-:3:-:1  @P2 LDG.E.CI load0B0, [track0B + 4x<0>];
+--:-:3:-:1  @P2 LDG.E.CI load0B1, [track0B + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load0B2, [track0B + 4x<2>];
+--:-:3:-:1  @P2 LDG.E.CI load0B3, [track0B + 4x<3>];
+
+--:-:4:-:1  @P3 LDG.E.CI load1B0, [track1B + 4x<0>];
+--:-:4:-:1  @P3 LDG.E.CI load1B1, [track1B + 4x<1>];
+--:-:4:-:1  @P3 LDG.E.CI load1B2, [track1B + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load1B3, [track1B + 4x<3>];
+
+--:-:5:-:1  @P4 LDG.E.CI load2B0, [track2B + 4x<0>];
+--:-:5:-:1  @P4 LDG.E.CI load2B1, [track2B + 4x<1>];
+--:-:5:-:1  @P4 LDG.E.CI load2B2, [track2B + 4x<2>];
+--:-:5:-:1  @P4 LDG.E.CI load2B3, [track2B + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI load3B0, [track3B + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load3B1, [track3B + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load3B2, [track3B + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load3B3, [track3B + 4x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:6:-:1  @P6 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:6:-:1  @P6 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:6:-:1  @P6 LDG.E.CI loadA3, [trackA + 4x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec; # package flag, not assigned in this block; picks the LDG entry set near the end of the table
+    our $shiftAX = 1; # NOTE(review): presumably consumed by the included sgemm_common_32x128.sass - confirm
+    our $shiftBX = 1; # NOTE(review): presumably consumed by the included sgemm_common_32x128.sass - confirm
+    our %insert = # keyed "j<J>c<C>"; each value is instruction text for that slot (placement is done outside this block)
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+        # load0B0-3 -> writeBs stores, under P0 (k >= 16 after the k -= 16 above); the first STS waits on mask 04.
+        j3c6   => "04:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 0*32>], load0B0;\n",
+        j3c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 0*32>], load0B1;\n",
+        j3c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 0*32>], load0B2;\n",
+        j3c12  => "--:3:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 0*32>], load0B3;\n",
+        # load1B0-3 -> writeBs stores; the first STS waits on mask 08.
+        j5c6   => "08:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 1*32>], load1B0;\n",
+        j5c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 1*32>], load1B1;\n",
+        j5c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 1*32>], load1B2;\n",
+        j5c12  => "--:4:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 1*32>], load1B3;\n",
+        # load2B0-3 -> writeBs stores; the first STS waits on mask 10. No barrier slot is set here (load3B sets slot 5 below).
+        j7c6   => "10:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 2*32>], load2B0;\n",
+        j7c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 2*32>], load2B1;\n",
+        j7c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 2*32>], load2B2;\n",
+        j7c12  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 2*32>], load2B3;\n",
+        # load3B0-3 -> writeBs stores; no wait of their own, the last one sets slot 5.
+        j9c6   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<0*128 + 3*32>], load3B0;\n",
+        j9c8   => "--:-:-:-:1  \@P0 STS [writeBs + 4x<1*128 + 3*32>], load3B1;\n",
+        j9c10  => "--:-:-:-:1  \@P0 STS [writeBs + 4x<2*128 + 3*32>], load3B2;\n",
+        j9c12  => "--:5:-:-:1  \@P0 STS [writeBs + 4x<3*128 + 3*32>], load3B3;\n",
+        # loadA0-3 -> writeAs stores; the first STS waits on mask 20.
+        j11c6  => "20:-:-:-:1  \@P0 STS [writeAs + 4x<0*32>], loadA0;\n",
+        j11c8  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], loadA1;\n",
+        j11c10 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], loadA2;\n",
+        j11c12 => "--:6:-:-:1  \@P0 STS [writeAs + 4x<3*32>], loadA3;\n",
+        # Advance each track pointer by 4x<16> (IADD with .CC, then IADD.X on the paired register), under the predicate that guards the matching LDG.
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0B0.CC, track0B0, 4x<16>;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0B1,    track0B1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, 4x<16>;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P4 IADD   track2B0.CC, track2B0, 4x<16>;\n",
+        j7c13  => "--:-:-:-:1  \@P4 IADD.X track2B1,    track2B1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3B0.CC, track3B0, 4x<16>;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3B1,    track3B1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackA0.CC, trackA0, 4x<16>;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackA1,    trackA1, RZ;\n",
+        # Narrow the load predicates: P2-P6 stay set only while k >= 32.
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j7c14  => "--:-:-:-:1      ISETP.GE.AND P4, PT, k, 32, P4;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+        # Under P0: BAR.SYNC, then move the read pointers by -swapBuf and the write pointers by +swapBuf, and negate swapBuf.
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+        # Next-tile global loads: one LDG.E.CI.128 per tile when $vec is true, otherwise four LDG.E.CI per tile spread over more slots.
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.128 load0B, [track0B];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.128 load1B, [track1B];\n",
+                j9c29  => "10:-:5:-:1  \@P4 LDG.E.CI.128 load2B, [track2B];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.128 load3B, [track3B];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.128 loadA,  [trackA];\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI load0B0, [track0B + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI load0B1, [track0B + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI load0B2, [track0B + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI load0B3, [track0B + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI load1B0, [track1B + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI load1B1, [track1B + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI load1B2, [track1B + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI load1B3, [track1B + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P4 LDG.E.CI load2B0, [track2B + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P4 LDG.E.CI load2B1, [track2B + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P4 LDG.E.CI load2B2, [track2B + 4x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P4 LDG.E.CI load2B3, [track2B + 4x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI load3B0, [track3B + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI load3B1, [track3B + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI load3B2, [track3B + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI load3B3, [track3B + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+            )
+        ),
+        # Loop exit: branch to LOOP while P0 holds, otherwise to REMAINDER when P1 holds.
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return ''; # this section only fills %insert; it contributes no text of its own
+</CODE>
+
+<INCLUDE file="sgemm_common_32x128.sass"/>
diff --git a/Kernel/SGEMM/Pascal/sgemm_rnn_bprop_common_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_rnn_bprop_common_128x32.sass
new file mode 100644
index 0000000..9f5919a
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_rnn_bprop_common_128x32.sass
@@ -0,0 +1,362 @@
+# sgemm_rnn_bprop_common_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>];
+
+LOOP:
+
+<CODE>
+    # Generates the unrolled loop body: 16 slices (j) of 32 FFMAs (c), each FFMA followed by the %insert text keyed "j<j>c<c>", if any.
+    our @top;
+    our %insert;
+    our $shiftAX;
+    our $shiftBX;
+    # @cOrder gives the (x, y) accumulator used by FFMA c: 2 x-bases * 4 y-bases * 4 @swirl offsets = 32 entries, with @y reversed for the second x-base.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    # The output starts with whatever the package array @top holds (it is not assigned in this block).
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 15)
+    {
+        my $barrier   = $j & 1 ? 2 : 1;
+        my $rsPred    = $j >= 14 ? '@P0' : '   ';
+        my $loadReg   = ($j + 2) & 3;
+        my $shareLine = ($j + 2) & 15;
+        my $shiftA    = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB    = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute   = $j & 3;
+        # Queue three LDS.U.128 loads for slice j+2 (register set ($j+2)&3, shared line ($j+2)&15) at slots c0, c2 and c4,
+        # replacing any %insert entries already stored under those keys; for j >= 14 they are predicated on P0.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Only FFMA 0 of a slice gets a wait field: "01" or "02", the same number this slice parity's LDS inserts use as their barrier slot.
+            my $wait   = $c == 0 ? "0$barrier" : '--';
+            # Stall 0 when the first line of the insert mentions LDS/F2F/I2I/LDG/STS/BAR/BRA, otherwise 1 (also when there is no insert).
+            my $stall  = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+            # Yield flag only on FFMA 16, and only when its stall came out as 1.
+            my $yield  = $c == 16 && $stall ? 'Y' : '-';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+--:-:-:-:5      MOV xcutoff, param_xcutoff;
+
+--:-:-:-:6      LOP.AND.NZ   P0, RZ, flags, 4;
+--:-:-:-:6  @P0 IADD offsetC, -time_step, param_unrolling;
+--:-:-:-:6  @P0 IADD offsetC, offsetC,    -1;
+--:-:-:-:6 @!P0 MOV  offsetC, time_step;
+
+// baseH = param_H + (dimH * offsetC) * 4, where offsetC is time_step, or unrolling - 1 - time_step when flags & 4
+--:-:-:-:1      XMAD     offsetH,   offsetC,   param_dimH, RZ;
+--:-:-:-:1      LEA      baseH0.CC, offsetH,   param_H[0],     2;
+--:-:-:-:1      LEA.HI.X baseH1,    offsetH,   param_H[1], RZ, 2;
+
+// baseC = param_C + (dimC * offsetC) * 4, where offsetC is time_step, or unrolling - 1 - time_step when flags & 4
+--:-:-:-:1      XMAD     offsetC,   offsetC,   param_dimC, RZ;
+--:-:-:-:1      LEA      baseC0.CC, offsetC,   param_C[0],     2;
+--:-:-:-:1      LEA.HI.X baseC1,    offsetC,   param_C[1], RZ, 2;
+
+// writeCs = (readAs / 4) * 32 + readBs;
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 3;
+
+// readCs = (((tid & 96) << 2) | (tid & 31)) << 2;
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 2;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*32 + tid31;
+--:-:-:-:1      ISCADD cx, blkB, tid31, 5;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+--:-:-:-:1      ISCADD  cy00, blkA, cy00, 7;
+
+// C += (cy*ldc + cx) * 4;
+// C += (ldcz*0 + ldc*cy + cx) * 4;  ldcz is multiplied by RZ below, not by a block Z index
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, ldc,  cy00, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, RZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, baseC0,     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, baseC1, RZ, 2;
+
+// Apply relu
+--:-:-:-:0      LOP.AND.NZ   P4, RZ, flags, 2;
+// cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+// beta != 0
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
+
+--:-:-:-:1      MOV  ldh1, param_ldh;
+
+// H += (ldh*cy + cx) * 4
+--:-:-:-:1      XMAD.LO  ci, ldh1,  cy00, cx, xmad_c;
+--:-:-:-:1      LEA      H00y0.CC, ci, baseH0,     2;
+--:-:-:-:1      LEA.HI.X H00y1,    ci, baseH1, RZ, 2;
+
+--:-:-:-:1      SHL  ldh1, ldh1, 2;
+--:-:-:-:1      SHL  ldh4, ldh1, 2;
+--:-:-:-:1      SHL  ldh60, ldh1, 6;
+--:-:-:-:1      IADD ldh60, ldh60, -ldh4;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:4      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      MOV d0, RZ;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:4      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      MOV d1, RZ;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:3      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      MOV d2, RZ;
+--:-:-:-:1      MOV d3, RZ;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:1      IADD.X C12y1,    C08y1, RZ;
+
+--:-:-:-:6      IADD   H04y0.CC, H00y0, ldh4;
+--:-:-:-:1      IADD.X H04y1,    H00y1, RZ;
+--:-:-:-:6      IADD   H08y0.CC, H04y0, ldh4;
+--:-:-:-:1      IADD.X H08y1,    H04y1, RZ;
+--:-:-:-:6      IADD   H12y0.CC, H08y0, ldh4;
+--:-:-:-:0      IADD.X H12y1,    H08y1, RZ;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+    # Epilogue generator: for each of the eight accumulator rows y, scale
+    # cx<0-3>y by alpha into c0-c3 and call STORE_C.  Just before row 4 the
+    # C/cy cursors are bumped by ldc60/60 and the H cursors by ldh60.
+    my @cursors = qw(00 04 08 12);
+
+    # One-off cursor bump, assembled once and spliced in ahead of row 4.
+    my $bump = '';
+    foreach my $id (@cursors)
+    {
+        $bump .= sprintf(
+            "--:-:-:-:5      IADD   C%sy0.CC, C%sy0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy%s,     cy%s,  60;\n" .
+            "--:-:-:-:1      IADD.X C%sy1,    C%sy1, RZ;\n",
+            ($id) x 6);
+    }
+    $bump .= "\n";
+    foreach my $id (@cursors)
+    {
+        $bump .= sprintf(
+            "--:-:-:-:6      IADD   H%sy0.CC, H%sy0, ldh60;\n" .
+            "--:-:-:-:1      IADD.X H%sy1,    H%sy1, RZ;\n",
+            ($id) x 4);
+    }
+
+    my @chunks;
+    foreach my $row (0 .. 7)
+    {
+        push @chunks, $bump if $row == 4;
+        foreach my $col (0 .. 3)
+        {
+            # The last FMUL of each group carries stall 0, the others stall 1.
+            my $stall = $col == 3 ? 0 : 1;
+            push @chunks, "--:-:-:-:$stall      FMUL c$col, cx${col}y$row, alpha;\n";
+        }
+        push @chunks, "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return join '', @chunks;
+</CODE>
+
+--:-:-:-:1      MOV lockAddr0, param_lockAddr[0];
+--:-:-:-:1      MOV lockAddr1, param_lockAddr[1];
+
+// time_step = time_step + 1
+--:-:-:-:6      IADD time_step, time_step, 1;
+--:-:-:-:1      ISETP.LT.AND P0, PT, time_step, param_unrolling, PT;
+
+// Synchronize all blocks
+--:-:-:-:1      ISETP.NE.AND P1, PT, tid, RZ, PT;
+--:-:-:-:6      XMAD blkId, blkB, param_numAblks, blkA;
+--:-:-:-:6      IADD nextBlk, blkId, 1;
+--:-:-:-:8      ISETP.EQ.OR P2, PT, nextBlk, param_numBlks, P1;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      SSY SSY_TARGET1;
+--:-:-:-:d  @P1 SYNC;
+--:-:-:-:6  @P2 MOV nextBlk, RZ;
+
+SPINLOCK1:
+--:-:1:Y:2      ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk;
+01:-:-:Y:d      ISETP.NE.AND P1, PT, lockVal, blkId, PT;
+--:-:-:-:d  @P1 BRA.U SPINLOCK1;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET1:
+--:-:-:-:1      SSY SSY_TARGET2;
+--:-:-:-:d  @P2 SYNC;
+--:-:-:-:6      MOV nextBlk, RZ;
+
+SPINLOCK2:
+--:-:1:Y:2      ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk;
+01:-:-:Y:d      ISETP.NE.AND P1, PT, lockVal, RZ, PT;
+--:-:-:-:5  @P1 BRA.U SPINLOCK2;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET2:
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:f      MEMBAR.GL;
+
+//Loop back to beginning of GEMM loop
+--:-:-:Y:5  @P0 BRA.U RNN_LOOP;
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Writes c0-c3 to the four C cursors (C00y/C04y/C08y/C12y), then advances the C cursors by ldc1 and the H cursors by ldh1.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LDG.E h0, [H00y];
+--:-:-:-:1      LDG.E h1, [H04y];
+--:-:-:-:1      LDG.E h2, [H08y];
+--:-:-:-:1      LDG.E h3, [H12y];
+</SCHEDULE_BLOCK>
+// NOTE(review): the h0-h3 loads carry no predicate and no barrier slot, and the FSETP lines that read them have an empty wait field - confirm this is safe.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+// d0-d3 are loaded from the C cursors where cy < param_m and P5 holds, and set to RZ otherwise.
+--:-:1:-:1  @P0 LDG.E d0, [C00y];
+--:-:2:-:1  @P1 LDG.E d1, [C04y];
+--:-:3:-:1  @P2 LDG.E d2, [C08y];
+--:-:4:-:1  @P3 LDG.E d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// P0-P3 are recomputed against P6 instead of P5; these values guard the STG lines at the end.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+// The cy counters move on by one for the next call.
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:3      IADD cy12, cy12, 1;
+// Applied only when P4 is set.
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+// Exchange through writeCs/readCs: one STS.128 starting at c0, then four LDS at offsets 4x<0*32> .. 4x<3*32>.
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*32>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*32>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*32>];
+</SCHEDULE_BLOCK>
+// P0-P3 (mask 0x0f) are parked in predSave because the FSETP block further down overwrites them.
+--:-:-:-:1  P2R predSave, PR, RZ, 0x0f;
+// c += d * beta when P5; wait masks 11/02/24/08 together cover barrier slots 1-6 set by the LDG d and LDS c lines.
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, d3, beta, c3;
+
+//Bprop for activation: Rectlinclip - c is kept only where RZ < h < xcutoff, otherwise replaced by RZ
+<SCHEDULE_BLOCK>
+--:-:-:-:1  FSETP.LT.AND P0, PT, RZ, h0, PT;
+--:-:-:-:1  FSETP.LT.AND P1, PT, RZ, h1, PT;
+--:-:-:-:1  FSETP.LT.AND P2, PT, RZ, h2, PT;
+--:-:-:-:1  FSETP.LT.AND P3, PT, RZ, h3, PT;
+--:-:-:-:1  FSETP.LT.AND P0, PT, h0, xcutoff, P0;
+--:-:-:-:1  FSETP.LT.AND P1, PT, h1, xcutoff, P1;
+--:-:-:-:1  FSETP.LT.AND P2, PT, h2, xcutoff, P2;
+--:-:-:-:1  FSETP.LT.AND P3, PT, h3, xcutoff, P3;
+--:-:-:-:1  SEL c0, c0, RZ, P0;
+--:-:-:-:1  SEL c1, c1, RZ, P1;
+--:-:-:-:1  SEL c2, c2, RZ, P2;
+--:-:-:-:1  SEL c3, c3, RZ, P3;
+</SCHEDULE_BLOCK>
+// Bring back the P0-P3 values parked in predSave before the stores.
+--:-:-:Y:d  R2P PR, predSave, 0x0f;
+
+--:1:-:-:1  @P0 STG.E [C00y], c0;
+--:2:-:-:1  @P1 STG.E [C04y], c1;
+--:3:-:-:1  @P2 STG.E [C08y], c2;
+--:4:-:-:1  @P3 STG.E [C12y], c3;
+// Each C cursor advances by ldc1 after waiting on the slot set by its own STG (masks 01/02/04/08).
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;
+// Each H cursor advances by ldh1.
+--:-:-:-:6      IADD   H00y0.CC, H00y0, ldh1;
+--:-:-:-:1      IADD.X H00y1,    H00y1, RZ;
+--:-:-:-:6      IADD   H04y0.CC, H04y0, ldh1;
+--:-:-:-:1      IADD.X H04y1,    H04y1, RZ;
+--:-:-:-:6      IADD   H08y0.CC, H08y0, ldh1;
+--:-:-:-:1      IADD.X H08y1,    H08y1, RZ;
+--:-:-:-:6      IADD   H12y0.CC, H12y0, ldh1;
+--:-:-:-:0      IADD.X H12y1,    H12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/sgemm_rnn_common_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_rnn_common_128x32.sass
new file mode 100644
index 0000000..67bda6f
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_rnn_common_128x32.sass
@@ -0,0 +1,348 @@
+# sgemm_rnn_common_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+--:-:1:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*32  + 00 + 0*8>];
+--:-:1:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay0, [readAs + 4x<1*128 + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Bx0, [readBs + 4x<1*32  + 00 + 0*8>];
+--:-:2:-:1      LDS.U.128 j1Ay4, [readAs + 4x<1*128 + 64 + 0*8>];
+
+LOOP:
+
+<CODE>
+    # Generates the unrolled loop body: 16 slices (j) of 32 FFMAs (c), each FFMA followed by the %insert text keyed "j<j>c<c>", if any.
+    our @top;
+    our %insert;
+    our $shiftAX;
+    our $shiftBX;
+    # @cOrder gives the (x, y) accumulator used by FFMA c: 2 x-bases * 4 y-bases * 4 @swirl offsets = 32 entries, with @y reversed for the second x-base.
+    my @cOrder;
+    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
+    my @y = (0,1,4,5);
+    foreach my $x (0,2)
+    {
+        foreach my $y (@y)
+        {
+            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
+        }
+        @y = reverse @y;
+    }
+    # The output starts with whatever the package array @top holds (it is not assigned in this block).
+    my $out = join '', @top;
+
+    foreach my $j (0 .. 15)
+    {
+        my $barrier   = $j & 1 ? 2 : 1;
+        my $rsPred    = $j >= 14 ? '@P0' : '   ';
+        my $loadReg   = ($j + 2) & 3;
+        my $shareLine = ($j + 2) & 15;
+        my $shiftA    = $shiftAX ? $shareLine >> 2 : 0;
+        my $shiftB    = $shiftBX ? $shareLine >> 2 : 0;
+        my $compute   = $j & 3;
+        # Queue three LDS.U.128 loads for slice j+2 (register set ($j+2)&3, shared line ($j+2)&15) at slots c0, c2 and c4,
+        # replacing any %insert entries already stored under those keys; for j >= 14 they are predicated on P0.
+        $insert{"j${j}c0"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*128 + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+        $insert{"j${j}c2"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*32  + 00 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftB;
+        $insert{"j${j}c4"} = sprintf "--:-:%d:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*128 + 64 + %d*8>];\n", $barrier, $rsPred, $loadReg, $shareLine, $shiftA;
+
+        foreach my $c (0 .. 31)
+        {
+            my ($x,$y) = @{$cOrder[$c]};
+
+            my $ins    = $insert{"j${j}c$c"} || '';
+            # Only FFMA 0 of a slice gets a wait field: "01" or "02", the same number this slice parity's LDS inserts use as their barrier slot.
+            my $wait   = $c == 0 ? "0$barrier" : '--';
+            # Stall 0 when the first line of the insert mentions LDS/F2F/I2I/LDG/STS/BAR/BRA, otherwise 1 (also when there is no insert).
+            my $stall  = (split "\n", $ins)[0] =~ /LDS|F2F|I2I|LDG|STS|BAR|BRA/ ? 0 : 1;
+            # Yield flag only on FFMA 16, and only when its stall came out as 1.
+            my $yield  = $c == 16 && $stall ? 'Y' : '-';
+
+            my $ctrl   = "$wait:-:-:$yield:$stall";
+
+            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $compute,$x,  $compute,$y,  $x,$y,  $ins;
+        }
+    }
+    return $out;
+
+</CODE>
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      MOV alpha, param_alpha;
+--:-:-:-:1      MOV beta,  param_beta;
+--:-:-:-:1      MOV flags, param_flags;
+--:-:-:-:5      MOV xcutoff, param_xcutoff;
+
+--:-:-:-:6      LOP.AND.NZ   P0, RZ, flags, 4;
+--:-:-:-:6  @P0 IADD offsetC, -time_step, param_unrolling;
+--:-:-:-:6  @P0 IADD offsetC, offsetC,    -1;
+--:-:-:-:6 @!P0 MOV  offsetC, time_step;
+
+// baseC = param_C + (dimC * offsetC) * 4, where offsetC is time_step, or unrolling - 1 - time_step when flags & 4
+--:-:-:-:1      XMAD     offsetC,   offsetC,   param_dimC, RZ;
+--:-:-:-:1      LEA      baseC0.CC, offsetC,   param_C[0],     2;
+--:-:-:-:1      LEA.HI.X baseC1,    offsetC,   param_C[1], RZ, 2;
+
+// writeCs = (readAs / 4) * 32 + readBs;
+--:-:-:-:1      ISETP.GT.AND P0, PT, swapBuf, RZ, PT;
+--:-:-:-:1      IADD readBs,  readBs, -4x<szShareA>;
+--:-:-:-:1  @P0 IADD readAs,  readAs, -swapBuf;
+--:-:-:-:1  @P0 IADD readBs,  readBs, -swapBuf;
+--:-:-:-:1      ISCADD  writeCs, readAs, readBs, 3;
+
+// readCs = (((tid & 96) << 2) | (tid & 31)) << 2;
+--:-:-:-:1      LOP.AND tid31, tid, 31;
+--:-:-:-:1      LOP.AND tid96, tid, 96;
+--:-:-:-:1      ISCADD readCs, tid96, tid31, 2;
+--:-:-:-:1      SHL    readCs, readCs, 2;
+
+// cx = blkB*32 + tid31;
+--:-:-:-:1      ISCADD cx, blkB, tid31, 5;
+
+// cy = blkA*128 + (tid96 >> 1)
+--:-:-:-:1      SHR.U32 cy00, tid96, 1;
+--:-:-:-:1      ISCADD  cy00, blkA, cy00, 7;
+
+// C += (cy*ldc + cx) * 4;
+// C += (ldcz*0 + ldc*cy + cx) * 4;  ldcz is multiplied by RZ below, not by a block Z index
+--:-:-:-:1      MOV  ldc,  param_ldc;
+--:-:-:-:1      MOV  ldcz, param_ldcz;
+--:-:-:-:1      XMAD.LO  ci, ldc,  cy00, cx, xmad_c;
+--:-:-:-:1      XMAD.LO2 ci, ldcz, RZ, ci;
+--:-:-:-:1      LEA      C00y0.CC, ci, baseC0,     2;
+--:-:-:-:1      LEA.HI.X C00y1,    ci, baseC1, RZ, 2;
+
+// Apply relu
+--:-:-:-:0      LOP.AND.NZ   P4, RZ, flags, 2;
+// cx < n
+--:-:-:-:1      ISETP.LT.AND P6, PT, cx, param_n, PT;
+// beta != 0
+--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P6;
+
+
+--:-:-:-:1      SHL  ldc1, ldc, 2;
+--:-:-:-:1      SHL  ldc4, ldc, 4;
+--:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:4      IADD   C04y0.CC, C00y0, ldc4;
+--:-:-:-:1      MOV d0, RZ;
+--:-:-:-:1      IADD   cy04, cy00,  4;
+--:-:-:-:1      IADD.X C04y1,    C00y1, RZ;
+--:-:-:-:4      IADD   C08y0.CC, C04y0, ldc4;
+--:-:-:-:1      MOV d1, RZ;
+--:-:-:-:1      IADD   cy08, cy00,  8;
+--:-:-:-:1      IADD.X C08y1,    C04y1, RZ;
+--:-:-:-:3      IADD   C12y0.CC, C08y0, ldc4;
+--:-:-:-:1      MOV d2, RZ;
+--:-:-:-:1      MOV d3, RZ;
+--:-:-:-:1      IADD   cy12, cy00,  12;
+--:-:-:-:1      IADD.X C12y1,    C08y1, RZ;
+
+<SCHEDULE_BLOCK>
+// bias_track = bias + cy
+--:-:-:-:1      LEA      bias00y0.CC, cy00, param_bias[0],     2;
+--:-:-:-:1      LEA.HI.X bias00y1,    cy00, param_bias[1], RZ, 2;
+--:-:-:-:1      LEA      bias04y0.CC, cy04, param_bias[0],     2;
+--:-:-:-:1      LEA.HI.X bias04y1,    cy04, param_bias[1], RZ, 2;
+--:-:-:-:1      LEA      bias08y0.CC, cy08, param_bias[0],     2;
+--:-:-:-:1      LEA.HI.X bias08y1,    cy08, param_bias[1], RZ, 2;
+--:-:-:-:1      LEA      bias12y0.CC, cy12, param_bias[0],     2;
+--:-:-:-:1      LEA.HI.X bias12y1,    cy12, param_bias[1], RZ, 2;
+</SCHEDULE_BLOCK>
+
+--:-:-:-:5      BAR.SYNC 0;
+
+<CODE>
+    # Epilogue generator: for each of the eight accumulator rows y, scale
+    # cx<0-3>y by alpha into c0-c3 and call STORE_C.  Just before row 4 the
+    # C/cy cursors are bumped by ldc60/60 and the bias cursors by 240.
+    my @cursors = qw(00 04 08 12);
+
+    # One-off cursor bump, assembled once and spliced in ahead of row 4.
+    my $bump = '';
+    foreach my $id (@cursors)
+    {
+        $bump .= sprintf(
+            "--:-:-:-:5      IADD   C%sy0.CC, C%sy0, ldc60;\n" .
+            "--:-:-:-:1      IADD   cy%s,     cy%s,  60;\n" .
+            "--:-:-:-:1      IADD.X C%sy1,    C%sy1, RZ;\n",
+            ($id) x 6);
+    }
+    $bump .= "\n";
+    foreach my $id (@cursors)
+    {
+        $bump .= sprintf(
+            "--:-:-:-:6      IADD   bias%sy0.CC, bias%sy0, 240;\n" .
+            "--:-:-:-:1      IADD.X bias%sy1, bias%sy1, RZ;\n",
+            ($id) x 4);
+    }
+
+    my @chunks;
+    foreach my $row (0 .. 7)
+    {
+        push @chunks, $bump if $row == 4;
+        foreach my $col (0 .. 3)
+        {
+            # The last FMUL of each group carries stall 0, the others stall 1.
+            my $stall = $col == 3 ? 0 : 1;
+            push @chunks, "--:-:-:-:$stall      FMUL c$col, cx${col}y$row, alpha;\n";
+        }
+        push @chunks, "--:-:-:-:5      CAL STORE_C;\n\n";
+    }
+    return join '', @chunks;
+</CODE>
+
+--:-:-:-:1      MOV lockAddr0, param_lockAddr[0];
+--:-:-:-:1      MOV lockAddr1, param_lockAddr[1];
+
+// time_step = time_step + 1
+--:-:-:-:6      IADD time_step, time_step, 1;
+--:-:-:-:1      ISETP.LT.AND P0, PT, time_step, param_unrolling, PT;
+
+// Synchronize all blocks
+--:-:-:-:1      ISETP.NE.AND P1, PT, tid, RZ, PT;
+--:-:-:-:6      XMAD blkId, blkB, param_numAblks, blkA;
+--:-:-:-:6      IADD nextBlk, blkId, 1;
+--:-:-:-:8      ISETP.EQ.OR P2, PT, nextBlk, param_numBlks, P1;
+
+--:-:-:-:5      BAR.SYNC 0;
+
+--:-:-:-:1      SSY SSY_TARGET1;
+--:-:-:-:d  @P1 SYNC;
+--:-:-:-:6  @P2 MOV nextBlk, RZ;
+
+SPINLOCK1:
+--:-:1:Y:2      ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk;
+01:-:-:Y:d      ISETP.NE.AND P1, PT, lockVal, blkId, PT;
+--:-:-:-:d  @P1 BRA.U SPINLOCK1;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET1:
+--:-:-:-:1      SSY SSY_TARGET2;
+--:-:-:-:d  @P2 SYNC;
+--:-:-:-:6      MOV nextBlk, RZ;
+
+SPINLOCK2:
+--:-:1:Y:2      ATOM.E.CAS lockVal, [lockAddr], blkId, nextBlk;
+01:-:-:Y:d      ISETP.NE.AND P1, PT, lockVal, RZ, PT;
+--:-:-:-:5  @P1 BRA.U SPINLOCK2;
+--:-:-:-:d      SYNC;
+
+SSY_TARGET2:
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:f      MEMBAR.GL;
+
+//Loop back to beginning of GEMM loop
+--:-:-:Y:5  @P0 BRA.U RNN_LOOP;
+
+--:-:-:-:5      EXIT;
+
+STORE_C:
+// Adds b0-b3 to c0-c3, applies the Rectlinclip step, writes the result to the four C cursors, then advances the C cursors by ldc1 and the bias cursors by 4.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      LDG.E.CI b0, [bias00y];
+--:-:-:-:1      LDG.E.CI b1, [bias04y];
+--:-:-:-:1      LDG.E.CI b2, [bias08y];
+--:-:-:-:1      LDG.E.CI b3, [bias12y];
+</SCHEDULE_BLOCK>
+// NOTE(review): the b0-b3 loads carry no predicate and no barrier slot, and the FADD lines that read them have an empty wait field - confirm this is safe.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P5;
+// d0-d3 are loaded from the C cursors where cy < param_m and P5 holds, and set to RZ otherwise.
+--:-:1:-:1  @P0 LDG.E d0, [C00y];
+--:-:2:-:1  @P1 LDG.E d1, [C04y];
+--:-:3:-:1  @P2 LDG.E d2, [C08y];
+--:-:4:-:1  @P3 LDG.E d3, [C12y];
+--:-:-:-:1 @!P0 MOV d0, RZ;
+--:-:-:-:1 @!P1 MOV d1, RZ;
+--:-:-:-:1 @!P2 MOV d2, RZ;
+--:-:-:-:1 @!P3 MOV d3, RZ;
+// P0-P3 are recomputed against P6 instead of P5; these values guard the STG lines at the end.
+--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, cy04, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, cy08, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, param_m, P6;
+// The cy counters move on by one for the next call.
+--:-:-:-:1      IADD cy00, cy00, 1;
+--:-:-:-:1      IADD cy04, cy04, 1;
+--:-:-:-:1      IADD cy08, cy08, 1;
+--:-:-:-:3      IADD cy12, cy12, 1;
+// Applied only when P4 is set.
+--:-:-:-:1  @P4 FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c2, c2, RZ, !PT;
+--:-:-:-:1  @P4 FMNMX c3, c3, RZ, !PT;
+// Exchange through writeCs/readCs: one STS.128 starting at c0, then four LDS at offsets 4x<0*32> .. 4x<3*32>.
+--:-:-:-:1      STS.128 [writeCs], c0;
+--:-:-:-:1      LDS c0, [readCs + 4x<0*32>];
+--:-:5:-:1      LDS c1, [readCs + 4x<1*32>];
+--:-:-:-:1      LDS c2, [readCs + 4x<2*32>];
+--:-:6:-:1      LDS c3, [readCs + 4x<3*32>];
+</SCHEDULE_BLOCK>
+// c += d * beta when P5; wait masks 11/02/24/08 together cover barrier slots 1-6 set by the LDG d and LDS c lines.
+11:-:-:-:1  @P5 FFMA c0, d0, beta, c0;
+02:-:-:-:1  @P5 FFMA c1, d1, beta, c1;
+24:-:-:-:1  @P5 FFMA c2, d2, beta, c2;
+08:-:-:-:3  @P5 FFMA c3, d3, beta, c3;
+// Add the b0-b3 values loaded at the top of the routine.
+--:-:-:-:1  FADD c0, c0, b0;
+--:-:-:-:1  FADD c1, c1, b1;
+--:-:-:-:1  FADD c2, c2, b2;
+--:-:-:-:3  FADD c3, c3, b3;
+
+//Activation function: Rectlinclip - FMNMX against RZ with !PT, then against xcutoff with PT (presumably max then min, i.e. a clamp to [0, xcutoff] - confirm)
+<SCHEDULE_BLOCK>
+--:-:-:-:1  FMNMX c0, c0, RZ, !PT;
+--:-:-:-:1  FMNMX c1, c1, RZ, !PT;
+--:-:-:-:1  FMNMX c2, c2, RZ, !PT;
+--:-:-:-:3  FMNMX c3, c3, RZ, !PT;
+
+--:-:-:-:1  FMNMX c0, c0, xcutoff, PT;
+--:-:-:-:1  FMNMX c1, c1, xcutoff, PT;
+--:-:-:-:1  FMNMX c2, c2, xcutoff, PT;
+--:-:-:-:3  FMNMX c3, c3, xcutoff, PT;
+</SCHEDULE_BLOCK>
+
+--:1:-:-:1  @P0 STG.E [C00y], c0;
+--:2:-:-:1  @P1 STG.E [C04y], c1;
+--:3:-:-:1  @P2 STG.E [C08y], c2;
+--:4:-:-:1  @P3 STG.E [C12y], c3;
+// Each C cursor advances by ldc1 after waiting on the slot set by its own STG (masks 01/02/04/08).
+01:-:-:-:6      IADD   C00y0.CC, C00y0, ldc1;
+--:-:-:-:1      IADD.X C00y1,    C00y1, RZ;
+02:-:-:-:6      IADD   C04y0.CC, C04y0, ldc1;
+--:-:-:-:1      IADD.X C04y1,    C04y1, RZ;
+04:-:-:-:6      IADD   C08y0.CC, C08y0, ldc1;
+--:-:-:-:1      IADD.X C08y1,    C08y1, RZ;
+08:-:-:-:6      IADD   C12y0.CC, C12y0, ldc1;
+--:-:-:-:1      IADD.X C12y1,    C12y1, RZ;
+// Each bias cursor advances by 4.
+--:-:-:-:6      IADD   bias00y0.CC, bias00y0, 4;
+--:-:-:-:1      IADD.X bias00y1,    bias00y1, RZ;
+--:-:-:-:6      IADD   bias04y0.CC, bias04y0, 4;
+--:-:-:-:1      IADD.X bias04y1,    bias04y1, RZ;
+--:-:-:-:6      IADD   bias08y0.CC, bias08y0, 4;
+--:-:-:-:1      IADD.X bias08y1,    bias08y1, RZ;
+--:-:-:-:6      IADD   bias12y0.CC, bias12y0, 4;
+--:-:-:-:0      IADD.X bias12y1,    bias12y1, RZ;
+
+--:-:-:-:5      RET;
diff --git a/Kernel/SGEMM/Pascal/sgemm_tn_128x128.sass b/Kernel/SGEMM/Pascal/sgemm_tn_128x128.sass
new file mode 100644
index 0000000..5099001
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_tn_128x128.sass
@@ -0,0 +1,279 @@
+# Kernel: sgemm_tn_128x128
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*4>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-95   ~ blkA, blkB, blkZ, lda, ldb, ldaz, ldbz, tid1, tid7, tidX, blk, tid31, tid128
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-95   ~ x<1-3>, y<1-3>
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+    96-103  : loadA<0-3>, loadB<0-3>
+
+    104-107 : trackA<0-1>, trackB<0-1>
+
+    108-121 ~ writeS, lda8, k, tidY, txa, txb, ta, tb, loop
+    122-127 ~ readAs, readBs, tid
+
+    64-75   ~ ldc, ci, xmad_c, tid_31, tid_96, tid_128, blockA, blockB, blockZ
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    86-121  ~ ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|64>, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k, param_k;
+--:-:-:-:1      MOV loop, RZ;
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+        join('', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15);
+</CODE>
+
+// tidX = (tid & 31) << 2
+// tidY = (tid >> 5) & 7
+01:-:-:-:1      LOP.AND tid31,  tid,  31;
+--:-:-:-:1      SHL     tidX,   tid31, 2;
+--:-:-:-:1      BFE.U32 tidY,   tid,  0x305; // 3 bits at position 5
+
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 5;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+// trackA = param_A + (blkA*128 + lda*tidY + ldaz*blkZ + tidX) * 4
+02:-:-:-:1      ISCADD   txa, blkA, tidX, 7;
+--:-:-:-:1      XMAD.LO2 ta,  lda,  tidY, txa;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta;
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x2; // ta << 2, i.e. ta * 4
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x2;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT; // P5 = txa < m
+
+// trackB = param_B + (blkB*128 + ldb*tidY + ldbz*blkZ + tidX) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 7;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x2; // tb << 2, i.e. tb * 4
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x2;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT; // P6 = txb < n
+
+// writeS = ((128*tidY + tidX) * 4) ^ 4x<128*8*2>
+--:-:-:-:1      ISCADD  writeS, tidY, tidX, 7;
+--:-:-:-:1      SHL     writeS, writeS, 2;
+--:-:-:-:1      LOP.XOR writeS, writeS, 4x<128*8*2>;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4
+--:-:-:-:1      LOP.AND tid1,   tid,  1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+
+
+// readBs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4 + 4096;
+--:-:-:-:1      LOP.AND tid128, tid,  128;
+--:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      SHR.U32 readBs, tid128, 4;
+--:-:-:-:1      LOP.OR  readBs, readBs, tid7;
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:1      LOP.AND.NZ P4, RZ, k, 7;
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 8, P4;
+
+// doLoad = tidY < k && txa|txb < m|n
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY, k, P6;
+
+--:-:2:-:1  @P2 LDG.E.CI.128 loadA, [trackA];
+--:-:3:-:1  @P3 LDG.E.CI.128 loadB, [trackB];
+
+--:-:5:-:1 @!P2 LDS.U.128 loadA, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 loadB, [addr_zero];
+
+    // Vec 4 and scalar loads
+    } : q{
+
+// doLoadA = tidY < k && txa < m
+// doLoadB = tidY < k && txb < n
+--:-:-:-:1      IADD x1, txa, 1;
+--:-:-:-:1      IADD x2, txa, 2;
+--:-:-:-:1      IADD x3, txa, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_m, P0;
+
+--:-:2:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      IADD y1, txb, 1;
+--:-:-:-:1      IADD y2, txb, 2;
+--:-:-:-:1      IADD y3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, y1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y3, param_n, P0;
+
+--:-:3:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 8, PT;
+    };
+</CODE>
+
+</SCHEDULE_BLOCK>
+
+12:-:-:-:1      STS.128 [writeS + 4x<0*128>], loadA0; // wait mask 0x12: barriers 2 and 5, set by the loadA LDG / LDS above
+
+--:-:-:-:6      IADD   trackA0.CC, trackA0, param_lda8;
+--:-:-:-:0      IADD.X trackA1, trackA1, RZ;
+
+24:-:-:-:1      STS.128 [writeS + 4x<8*128>], loadB0; // wait mask 0x24: barriers 3 and 6, set by the loadB LDG / LDS above
+
+--:-:-:-:1      IADD   trackB0.CC, trackB0, param_ldb8; // low word; the matching IADD.X is issued after the barrier below
+
+--:-:-:-:1      LOP.XOR readAs, readAs, 4x<128*8*2>;
+--:-:-:-:0      LOP.XOR readBs, readBs, 4x<128*8*2>;
+01:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      LOP.XOR writeS, writeS, 4x<128*8*2>; // same constant as the initial writeS XOR, so this toggles that bit back
+
+--:-:-:-:0      IADD.X trackB1, trackB1, RZ; // high word of trackB += param_ldb8
+
+
+<CODE>
+    our $vec;
+    my $k_end = $vec ? 16 : 24;    # k threshold used by the P2/P3/P0 ISETP.GE.AND predicates below
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P5;\n");
+
+    our %insert =    # instruction text keyed by j<N>c<M> names; not read in this file (presumably by the included common code -- TODO confirm)
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P6;\n",
+        j0c8  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
+
+        ($vec ?    # one 128-bit load per matrix when $vec, otherwise four scalar loads per matrix
+            (
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI.128 loadA, [trackA];\n",
+        j0c13 => "--:-:3:-:1  \@P3 LDG.E.CI.128 loadB, [trackB];\n",
+            ) :
+            (
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+        j0c29 => "--:-:2:-:1  \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+        j0c31 => "--:-:2:-:1  \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+        j0c33 => "--:-:2:-:1  \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+
+        j0c35 => "--:-:3:-:1  \@P3 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+        j1c29 => "--:-:3:-:1  \@P3 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+        j1c31 => "--:-:3:-:1  \@P3 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+        j1c33 => "--:-:3:-:1  \@P3 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+
+        j5c33 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*128>], loadA0;\n",    # wait mask 02: barrier 2, set by the loadA LDGs above
+
+        j5c46 => "--:-:-:-:1  \@P2 IADD   trackA0.CC, trackA0, param_lda8;\n",
+        j5c54 => "--:-:-:-:1  \@P2 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c33 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<8*128>], loadB0;\n",    # wait mask 04: barrier 3, set by the loadB LDGs above
+
+        j6c46 => "--:-:-:-:1  \@P3 IADD   trackB0.CC, trackB0, param_ldb8;\n",
+        j6c54 => "--:-:-:-:1  \@P3 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j6c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<128*8*2>;\n" .
+                 "--:-:-:-:1      IADD32I k, k, -8;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+    );
+    return;    # nothing is returned from this block; it only sets @top and %insert
+</CODE>
+
+<INCLUDE file="sgemm_common_128x128.sass"/>
diff --git a/Kernel/SGEMM/Pascal/sgemm_tn_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_tn_128x32.sass
new file mode 100644
index 0000000..0b9ffc1
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_tn_128x32.sass
@@ -0,0 +1,447 @@
+# Kernel: sgemm_tn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<128*16*2 + 32*16*2>
+    szShareA  : 128*16
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+   48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;
+--:-:2:-:1      S2R blkA, SR_CTAID.Y;
+--:-:3:-:1      S2R blkB, SR_CTAID.Z;
+--:-:4:-:1      S2R blkZ, SR_CTAID.X;
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k;
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 5;
+--:-:-:-:1      SHR.U32 ldb, ldb, 5;
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL lda16, lda, 6;
+--:-:-:-:1      SHL ldb16, ldb, 6;
+--:-:-:-:1      SHL lda4,  lda, 2;
+
+--:-:-:-:1      STS.128 [addr_zero], RZ;
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;
+</CODE>
+
+// tidAX = (tid & 31) << 2
+// tidAY = (tid >> 5)
+01:-:-:-:1      LOP.AND tidAX, tid,   31;
+--:-:-:-:1      SHL     tidAX, tidAX, 2;
+--:-:-:-:1      SHR.U32 tidAY, tid,   5;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// trackA += (blkA*128 + tidAX + lda*tidAY) * 4
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 7;
+--:-:-:-:1      XMAD.LO2 ta0, lda,  tidAY, txa;
+08:-:-:-:1      XMAD.LO2 ta0, ldaz, blkZ,  ta0;
+--:-:-:-:1      IADD     ta1, ta0, lda4;
+--:-:-:-:1      IADD     ta2, ta1, lda4;
+--:-:-:-:1      IADD     ta3, ta2, lda4;
+
+--:-:-:-:1      LEA      track0A0.CC, ta0, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track0A1,    ta0, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track1A0.CC, ta1, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track1A1,    ta1, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track2A0.CC, ta2, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track2A1,    ta2, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track3A0.CC, ta3, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track3A1,    ta3, param_A[1], RZ, 2;
+
+// trackB += (blkB*32 + ldb*tidBY + tidBX) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidBX,  5;
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 2;
+
+// writeAs = (tidAY*128 + tidAX) * 4 + 4x<szShareA + szShareB>
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;
+
+// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB>
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>;   (szShareA = 128*16)
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>;
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+
+--:-:-:-:1      IADD tidAY1, tidAY, 4;
+--:-:-:-:1      IADD tidAY2, tidAY, 8;
+--:-:-:-:1      IADD tidAY3, tidAY, 12;
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY,  k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI.128 load0A, [track0A];
+--:-:2:-:1  @P1 LDG.E.CI.128 load1A, [track1A];
+--:-:3:-:1  @P2 LDG.E.CI.128 load2A, [track2A];
+--:-:4:-:1  @P3 LDG.E.CI.128 load3A, [track3A];
+--:-:5:-:1  @P4 LDG.E.CI.128 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P0 LDS.U.128 load0A, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 load1A, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.128 load2A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load3A, [addr_zero];
+--:-:6:-:2 @!P4 LDS.U.128 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD txa1,  txa,  1;
+--:-:-:-:1      IADD txa2,  txa,  2;
+--:-:-:-:1      IADD txa3,  txa,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.CI load0A0, [track0A + 4x<0>];
+--:-:1:-:1  @P1 LDG.E.CI load0A1, [track0A + 4x<1>];
+--:-:1:-:1  @P2 LDG.E.CI load0A2, [track0A + 4x<2>];
+--:-:1:-:1  @P3 LDG.E.CI load0A3, [track0A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY1, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E.CI load1A0, [track1A + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI load1A1, [track1A + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI load1A2, [track1A + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI load1A3, [track1A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY2, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P6;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E.CI load2A0, [track2A + 4x<0>];
+--:-:3:-:1  @P1 LDG.E.CI load2A1, [track2A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E.CI load2A2, [track2A + 4x<2>];
+--:-:3:-:1  @P3 LDG.E.CI load2A3, [track2A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY3, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E.CI load3A0, [track3A + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI load3A1, [track3A + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI load3A2, [track3A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI load3A3, [track3A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:5:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:5:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:5:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P5; // P2 = k >= 32 && txa < m (P5 still holds txa < m here)
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P5; // P3 = k >= 32 && txa < m
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5; // P5 = k >= 32 && txa < m (overwrites the plain bounds check)
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6; // P6 = k >= 32 && txb < n
+
+// bDoRemainder = k & 15 && k > 16  (only the k & 15 term is set here; k > 16 is ANDed in after the shared stores below)
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 15;
+
+</SCHEDULE_BLOCK>
+
+21:-:-:-:1      STS.128 [writeAs + 4x<0*128>], load0A; // wait mask 0x21: barriers 1 (load0A LDG) and 6 (LDS zero-fill in the vec path)
+--:-:-:-:6      IADD   track0A0.CC, track0A0, lda16;
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;
+
+02:-:-:-:1      STS.128 [writeAs + 4x<4*128>], load1A; // wait on barrier 2 (load1A)
+--:-:-:-:6      IADD   track1A0.CC, track1A0, lda16;
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS.128 [writeAs + 4x<8*128>], load2A; // wait on barrier 3 (load2A)
+--:-:-:-:6      IADD   track2A0.CC, track2A0, lda16;
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS.128 [writeAs + 4x<12*128>], load3A; // wait on barrier 4 (load3A)
+--:-:-:-:6      IADD   track3A0.CC, track3A0, lda16;
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+10:-:-:-:1      STS.128 [writeBs], loadB; // wait mask 0x10: barrier 5 (loadB)
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16; // low word; the matching IADD.X is the last instruction of this group
+
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P1; // P1 = bDoRemainder: (k & 15) != 0 && k > 16
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0;
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf; // negate swapBuf so the next swap moves the pointers back
+
+--:-:-:-:0      IADD.X trackB1,    trackB1, RZ; // high word of trackB += ldb16
+
+<CODE>
+    our $vec;
+    return $vec ? q{
+--:-:3:-:1  @P5 LDG.E.CI.128 load0A, [track0A];
+--:-:4:-:1  @P5 LDG.E.CI.128 load1A, [track1A];
+--:-:5:-:1  @P5 LDG.E.CI.128 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.CI.128 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.CI.128 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P5 LDG.E.CI load0A0, [track0A + 4x<0>];
+--:-:3:-:1  @P5 LDG.E.CI load0A1, [track0A + 4x<1>];
+--:-:3:-:1  @P5 LDG.E.CI load0A2, [track0A + 4x<2>];
+--:-:3:-:1  @P5 LDG.E.CI load0A3, [track0A + 4x<3>];
+
+--:-:4:-:1  @P5 LDG.E.CI load1A0, [track1A + 4x<0>];
+--:-:4:-:1  @P5 LDG.E.CI load1A1, [track1A + 4x<1>];
+--:-:4:-:1  @P5 LDG.E.CI load1A2, [track1A + 4x<2>];
+--:-:4:-:1  @P5 LDG.E.CI load1A3, [track1A + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI load2A0, [track2A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load2A1, [track2A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load2A2, [track2A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load2A3, [track2A + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E.CI load3A0, [track3A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E.CI load3A1, [track3A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E.CI load3A2, [track3A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E.CI load3A3, [track3A + 4x<3>];
+
+--:-:6:-:1  @P6 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:6:-:1  @P6 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:6:-:1  @P6 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:6:-:1  @P6 LDG.E.CI loadB3, [trackB + 4x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec;
+    our $shiftAX = 0;
+    our $shiftBX = 0;
+    our %insert =
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",
+
+        j3c6   => "04:3:-:-:1  \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n",
+        j5c6   => "08:4:-:-:1  \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n",
+        j7c6   => "10:-:-:-:1  \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n",
+        j9c6   => "--:5:-:-:1  \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n",
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",
+
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, lda16;\n",
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, lda16;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P5 IADD   track2A0.CC, track2A0, lda16;\n",
+        j7c13  => "--:-:-:-:1  \@P5 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, lda16;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.CI.128 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI.128 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.CI.128 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.CI.128 loadB,  [trackB];\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E.CI load0A0, [track0A + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E.CI load0A1, [track0A + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E.CI load0A2, [track0A + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E.CI load0A3, [track0A + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E.CI load1A0, [track1A + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E.CI load1A1, [track1A + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E.CI load1A2, [track1A + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E.CI load1A3, [track1A + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.CI load2A0, [track2A + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P5 LDG.E.CI load2A1, [track2A + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P5 LDG.E.CI load2A2, [track2A + 4x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P5 LDG.E.CI load2A3, [track2A + 4x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E.CI load3A0, [track3A + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E.CI load3A1, [track3A + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E.CI load3A2, [track3A + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E.CI load3A3, [track3A + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';
+</CODE>
+
+<INCLUDE file="sgemm_common_128x32.sass"/>
diff --git a/Kernel/SGEMM/Pascal/sgemm_tn_128x64.sass b/Kernel/SGEMM/Pascal/sgemm_tn_128x64.sass
new file mode 100644
index 0000000..74f13cc
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_tn_128x64.sass
@@ -0,0 +1,326 @@
+# Kernel: sgemm_tn_128x64
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero  : 4x<128*8*2 + 64*8*2 + 0>
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]      : c[0x0][0x140]
+    param_C[1]      : c[0x0][0x144]
+    param_A[0]      : c[0x0][0x148]
+    param_A[1]      : c[0x0][0x14c]
+    param_B[0]      : c[0x0][0x150]
+    param_B[1]      : c[0x0][0x154]
+    param_alpha     : c[0x0][0x158]
+    param_beta      : c[0x0][0x15c]
+    param_flags     : c[0x0][0x160]
+    param_lda8      : c[0x0][0x164]
+    param_ldb8      : c[0x0][0x168]
+    param_ldc       : c[0x0][0x16c]
+    param_m         : c[0x0][0x170]
+    param_n         : c[0x0][0x174]
+    param_k         : c[0x0][0x178]
+    param_ldaz      : c[0x0][0x17c]
+    param_ldbz      : c[0x0][0x180]
+    param_ldcz      : c[0x0][0x184]
+    param_loops     : c[0x0][0x188]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    64-95   ~ lda, ldb, ldaz, ldbz, tid1, ta, tb, tid7, tid15, tidX, blk, txa64, xmad_tb, tid, blkA, blkB, blkZ
+
+    0-63    : czero<00-63>
+
+     3, 2,11,10,19,18,27,26 : cx<0-7>y0
+     7, 6,15,14,23,22,31,30 : cx<0-7>y1
+     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
+     5, 4,13,12,21,20,29,28 : cx<0-7>y3
+    35,34,43,42,51,50,59,58 : cx<0-7>y4
+    39,38,47,46,55,54,63,62 : cx<0-7>y5
+    33,32,41,40,49,48,57,56 : cx<0-7>y6
+    37,36,45,44,53,52,61,60 : cx<0-7>y7
+
+    64-95   ~ x<1-3>, x<65-67>, y<1-3>
+
+    64-79   : j0Ay<0-7>, j0Bx<0-7>
+    80-95   : j1Ay<0-7>, j1Bx<0-7>
+
+     96-107 : loadA<0-7>,  loadB<0-3>
+    108-111 : trackA<0-1>, trackB<0-1>
+
+    112-125 ~ writeAs, writeBs, k, tidY, txa, txb, swapBuf
+    126-127 ~ readAs, readBs
+
+    64-75   : c<0-7>, d3, d2, d1, d0
+    76-85   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    86-125  ~ tid_2, blockA, blockB, blockZ, ldc, ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, ci, xmad_c, alpha, beta, flags, tid31, tid96
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;   // prologue: thread index; sets write barrier 1 (waited on with mask 01 below)
+--:-:2:-:1      S2R blkA, SR_CTAID.Y; // block index for the A tile (used as blkA*128); write barrier 2
+--:-:3:-:1      S2R blkB, SR_CTAID.Z; // block index for the B tile (used as blkB*64); write barrier 3
+--:-:4:-:1      S2R blkZ, SR_CTAID.X; // block index multiplied by ldaz/ldbz below; write barrier 4
+
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k, param_k; // working copy of k; the main loop subtracts 8 per iteration
+--:-:-:-:1      STS.128 [addr_zero], RZ; // store RZ at addr_zero; the LDS.U.128 loads below read it back into czero<00-63>
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;  # 16 loads: czero00, czero04, ..., czero60
+</CODE>
+
+--:-:-:-:1      LOP.AND tid1,  tid, 1; // tid1 = tid & 1 (OR-ed into readAs below)
+01:-:-:-:1      LOP.AND tid15, tid, 15; // tid15 = tid & 15; mask 01 waits on barrier 1 (tid)
+
+// tidX = (tid & 15) << 2
+// tidY = (tid >> 4) & 7
+--:-:-:-:1      SHL     tidX, tid15, 2;
+--:-:-:-:1      BFE.U32 tidY, tid,   0x304; // 3 bits at position 4
+
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 5; // lda = param_lda8 >> 5; presumably the element stride, since param_lda8 is what advances trackA per tile - TODO confirm
+--:-:-:-:1      SHR.U32 ldb, ldb, 5; // ldb = param_ldb8 >> 5 (same assumption as lda)
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+
+
+// trackA += (blkA*128 + lda*tidY + tidX + ldaz*blkZ) * 4
+02:-:-:-:1      ISCADD   txa, blkA, tidX,  7; // txa = (blkA << 7) + tidX; mask 02 waits on barrier 2 (blkA)
+--:-:-:-:1      XMAD.LO2 ta,  lda,  tidY, txa;
+08:-:-:-:1      XMAD.LO2 ta,  ldaz, blkZ, ta; // mask 08 waits on barrier 4 (blkZ)
+--:-:-:-:1      LEA      trackA0.CC, ta, param_A[0],     0x2; // low word of param_A + (ta << 2), sets carry
+--:-:-:-:1      LEA.HI.X trackA1,    ta, param_A[1], RZ, 0x2; // high word, consumes the carry
+
+--:-:-:-:1      IADD txa64, txa, 64; // column of the second A vector (loaded from trackA + 4x<64>)
+--:-:-:-:1      ISETP.LT.AND P4, PT, txa,   param_m, PT; // P4 = txa   < m
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa64, param_m, PT; // P5 = txa64 < m
+
+// trackB += (blkB*64 + tidX + ldb*tidY + ldbz*blkZ) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidX, 6; // txb = (blkB << 6) + tidX; mask 04 waits on barrier 3 (blkB)
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ, tb; // mask 08 waits on barrier 4 (blkZ)
+--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x2;
+--:-:-:-:1      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x2;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT; // P6 = txb < n
+
+// Start the write buffers high
+// writeAs = (128*tidY + tidX) * 4
+--:-:-:-:1      ISCADD writeAs, tidY, tidX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2; // (writeAs << 2) plus the size of one A+B buffer (the same distance swapBuf uses)
+// writeBs = (64*tidY + tidX) * 4
+--:-:-:-:1      ISCADD writeBs, tidY, tidX, 6;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2; // (writeBs << 2) plus one A+B buffer plus one more A area (128*8)
+
+// Start the read buffers low
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<64*8 + 128*8>; // negative buffer size: added to write pointers, subtracted from read pointers, then negated at each swap
+</SCHEDULE_BLOCK>
+
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+// Tile load (also the REMAINDER re-entry point): fetch loadA0-7 and loadB0-3 under bounds predicates; lanes that fail get zeros (LDS from addr_zero in the vector build, MOV RZ in the scalar build).
+<CODE>
+    our $vec;  # build flag set outside this block: true selects the 128-bit load variant, false the four-scalar-loads variant
+    return $vec ? q{
+
+// doLoad = tidY < k && txa|txb < n|m
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidY, k, P6;
+
+<ORDERED>
+--:-:2:-:1  @P1 LDG.E.CI.128 loadA0, [trackA + 4x< 0>];
+--:-:3:-:1  @P2 LDG.E.CI.128 loadA4, [trackA + 4x<64>];
+--:-:4:-:1  @P3 LDG.E.CI.128 loadB0, [trackB];
+
+--:-:5:-:2 @!P1 LDS.U.128 loadA0, [addr_zero];
+--:-:5:-:2 @!P2 LDS.U.128 loadA4, [addr_zero];
+--:-:6:-:2 @!P3 LDS.U.128 loadB0, [addr_zero];
+</ORDERED>
+
+// bDoRemainder = k & 7 && k > 8
+--:-:-:-:1      LOP.AND.NZ P1, RZ, k, 7;
+
+    // Vec 4 and scalar loads
+    } : q{
+
+// doLoadA = tidY < k && txa < m
+// doLoadB = tidY < k && txb < n
+--:-:-:-:1      IADD x1,  txa, 1;
+--:-:-:-:1      IADD x2,  txa, 2;
+--:-:-:-:1      IADD x3,  txa, 3;
+--:-:-:-:1      IADD x65, txa, 65;
+--:-:-:-:1      IADD x66, txa, 66;
+--:-:-:-:1      IADD x67, txa, 67;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_m, P0;
+
+--:-:2:-:1  @P0 LDG.E.CI loadA0, [trackA + 4x<0>];
+--:-:2:-:1  @P1 LDG.E.CI loadA1, [trackA + 4x<1>];
+--:-:2:-:1  @P2 LDG.E.CI loadA2, [trackA + 4x<2>];
+--:-:2:-:1  @P3 LDG.E.CI loadA3, [trackA + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadA0, RZ;
+--:-:-:-:1 @!P1 MOV loadA1, RZ;
+--:-:-:-:1 @!P2 MOV loadA2, RZ;
+--:-:-:-:1 @!P3 MOV loadA3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, x65, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, x66, param_m, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, x67, param_m, P0;
+
+--:-:3:-:1  @P0 LDG.E.CI loadA4, [trackA + 4x<64>];
+--:-:3:-:1  @P1 LDG.E.CI loadA5, [trackA + 4x<65>];
+--:-:3:-:1  @P2 LDG.E.CI loadA6, [trackA + 4x<66>];
+--:-:3:-:1  @P3 LDG.E.CI loadA7, [trackA + 4x<67>];
+
+--:-:-:-:1 @!P0 MOV loadA4, RZ;
+--:-:-:-:1 @!P1 MOV loadA5, RZ;
+--:-:-:-:1 @!P2 MOV loadA6, RZ;
+--:-:-:-:1 @!P3 MOV loadA7, RZ;
+
+--:-:-:-:1      IADD y1, txb, 1;
+--:-:-:-:1      IADD y2, txb, 2;
+--:-:-:-:1      IADD y3, txb, 3;
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidY, k, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, y1, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P2, PT, y2, param_n, P0;
+--:-:-:-:1      ISETP.LT.AND P3, PT, y3, param_n, P0;
+
+--:-:4:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
+--:-:4:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
+--:-:4:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
+--:-:4:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];
+
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+    };
+</CODE>
+
+</SCHEDULE_BLOCK>
+
+12:-:-:-:1      STS.128 [writeAs + 4x< 0>], loadA0; // mask 12: barriers 2 (LDG of loadA0-3) and 5 (vector build's LDS zero-fill of loadA0/loadA4)
+04:-:-:-:1      STS.128 [writeAs + 4x<64>], loadA4; // mask 04: barrier 3 (LDG of loadA4-7)
+
+--:-:-:-:6      IADD   trackA0.CC, trackA0, param_lda8; // 64-bit trackA += param_lda8: low word, sets carry
+--:-:-:-:0      IADD.X trackA1,    trackA1, RZ; // high word += carry
+
+28:-:-:-:1      STS.128 [writeBs], loadB0; // mask 28: barriers 4 (LDG of loadB0-3) and 6 (vector build's LDS zero-fill of loadB0)
+
+--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8; // 64-bit trackB += param_ldb8
+--:-:-:-:1      IADD.X trackB1,    trackB1, RZ;
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf; // read pointers move to the buffer that was just written
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0; // synchronize the thread block before the stored tile is read
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf; // write pointers move to the other buffer
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf; // flip the sign so the next swap goes the other way
+
+<CODE>
+    # Emit the "P1 = k > 8" test.  The vector build ANDs in the P1 computed by
+    # the load block (k & 7 != 0); the scalar build ANDs with PT instead.
+    our $vec;
+    my $and_pred = $vec ? 'P1' : 'PT';
+    my $test     = "--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, $and_pred;";
+    # Leading newline and trailing indent reproduce the original q{} text exactly.
+    return "\n$test\n    ";
+</CODE>
+
+<CODE>
+    our $vec;  # build flag set outside this block: true selects the 128-bit load variant
+    my $k_end = $vec ? 16 : 24;  # prefetch threshold on k: 16 in the vector build, 24 in the scalar build
+    our @top = ("--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P4;\n");  # P2 = (k >= $k_end) && P4; @top is consumed outside this file
+    # %insert maps slot keys ("j<N>c<M>") to SASS text; the consumer is presumably the included common file, so the exact slot semantics are not visible here.
+    our %insert =
+    (
+        j0c1  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P5;\n",  # P3 = (k >= $k_end) && P5
+        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, P6;\n",  # P0 = (k >= $k_end) && P6, guards the B prefetch
+
+        ($vec ?
+            (
+        j0c13 => "--:-:2:-:1  \@P2 LDG.E.CI.128 loadA0, [trackA + 4x< 0>];\n",
+        j0c15 => "--:-:3:-:1  \@P3 LDG.E.CI.128 loadA4, [trackA + 4x<64>];\n",
+        j0c33 => "--:-:4:-:1  \@P0 LDG.E.CI.128 loadB0, [trackB];\n",
+            ) :
+            (
+        j0c10 => "--:-:2:-:1  \@P2 LDG.E.CI loadA0, [trackA + 4x<0>];\n",
+        j0c12 => "--:-:2:-:1  \@P2 LDG.E.CI loadA1, [trackA + 4x<1>];\n",
+        j0c14 => "--:-:2:-:1  \@P2 LDG.E.CI loadA2, [trackA + 4x<2>];\n",
+        j0c16 => "--:-:2:-:1  \@P2 LDG.E.CI loadA3, [trackA + 4x<3>];\n",
+
+        j0c29 => "--:-:3:-:1  \@P3 LDG.E.CI loadA4, [trackA + 4x<64>];\n",
+        j0c31 => "--:-:3:-:1  \@P3 LDG.E.CI loadA5, [trackA + 4x<65>];\n",
+        j0c33 => "--:-:3:-:1  \@P3 LDG.E.CI loadA6, [trackA + 4x<66>];\n",
+        j0c35 => "--:-:3:-:1  \@P3 LDG.E.CI loadA7, [trackA + 4x<67>];\n",
+
+        j1c29 => "--:-:4:-:1  \@P0 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
+        j1c31 => "--:-:4:-:1  \@P0 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
+        j1c33 => "--:-:4:-:1  \@P0 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
+        j1c35 => "--:-:4:-:1  \@P0 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+
+        j1c37 => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",  # P0 is redefined as (k >= $k_end) and from here on guards the stores and the loop branch
+
+        j1c39 => "--:-:-:-:1      IADD32I k, k, -8;\n",  # k -= 8
+
+        j5c31 => "02:-:-:-:1  \@P0 STS.128 [writeAs + 4x< 0>], loadA0;\n",  # mask 02: wait for the loadA0 prefetch (barrier 2)
+        j5c33 => "04:-:-:-:1  \@P0 STS.128 [writeAs + 4x<64>], loadA4;\n",  # mask 04: wait for the loadA4 prefetch (barrier 3)
+
+        j5c46 => "--:-:-:-:1  \@P0 IADD   trackA0.CC, trackA0, param_lda8;\n",  # 64-bit trackA += param_lda8 (low word, then carry)
+        j5c54 => "--:-:-:-:1  \@P0 IADD.X trackA1,    trackA1, RZ;\n",
+
+        j6c39 => "08:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",  # mask 08: wait for the loadB0 prefetch (barrier 4)
+
+        j6c46 => "--:-:-:-:1  \@P0 IADD   trackB0.CC, trackB0, param_ldb8;\n",  # 64-bit trackB += param_ldb8
+        j6c54 => "--:-:-:-:1  \@P0 IADD.X trackB1,    trackB1, RZ;\n",
+
+        j6c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .  # when looping: synchronize, then swap the read/write buffers
+                 "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .  # loop again while P0 holds, otherwise re-enter REMAINDER when P1 holds
+                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
+    );
+    return;  # the block only populates @top and %insert
+</CODE>
+
+<INCLUDE file="sgemm_common_128x64.sass"/>
diff --git a/Kernel/SGEMM/Pascal/sgemm_tn_rnn_bprop_128x32.sass b/Kernel/SGEMM/Pascal/sgemm_tn_rnn_bprop_128x32.sass
new file mode 100644
index 0000000..3db4612
--- /dev/null
+++ b/Kernel/SGEMM/Pascal/sgemm_tn_rnn_bprop_128x32.sass
@@ -0,0 +1,476 @@
+# Kernel: sgemm_tn_128x32
+
+# Copyright 2014 Nervana Systems Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+<CONSTANT_MAPPING>
+    addr_zero : 4x<128*16*2 + 32*16*2>
+    szShareA  : 128*16
+    szShareB  : 32*16
+
+    gridDimA : c[0x0][0x14]
+    gridDimB : c[0x0][0x18]
+
+    param_C[0]        : c[0x0][0x140]
+    param_C[1]        : c[0x0][0x144]
+    param_A[0]        : c[0x0][0x148]
+    param_A[1]        : c[0x0][0x14c]
+    param_B[0]        : c[0x0][0x150]
+    param_B[1]        : c[0x0][0x154]
+    param_H[0]        : c[0x0][0x158]
+    param_H[1]        : c[0x0][0x15c]
+    param_lockAddr[0] : c[0x0][0x160]
+    param_lockAddr[1] : c[0x0][0x164]
+    param_alpha       : c[0x0][0x168]
+    param_beta        : c[0x0][0x16c]
+    param_xcutoff     : c[0x0][0x170]
+    param_flags       : c[0x0][0x174]
+    param_lda8        : c[0x0][0x178]
+    param_ldb8        : c[0x0][0x17c]
+    param_ldc         : c[0x0][0x180]
+    param_ldh         : c[0x0][0x184]
+    param_m           : c[0x0][0x188]
+    param_n           : c[0x0][0x18c]
+    param_k           : c[0x0][0x190]
+    param_ldaz        : c[0x0][0x194]
+    param_ldbz        : c[0x0][0x198]
+    param_ldcz        : c[0x0][0x19c]
+    param_loops       : c[0x0][0x1a0]
+    param_dimB        : c[0x0][0x1a4]
+    param_dimC        : c[0x0][0x1a8]
+    param_dimH        : c[0x0][0x1ac]
+    param_unrolling   : c[0x0][0x1b0]
+    param_numBlks     : c[0x0][0x1b4]
+    param_numAblks    : c[0x0][0x1b8]
+</CONSTANT_MAPPING>
+
+<REGISTER_MAPPING>
+
+    32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3>, offsetB
+    80-81 : baseB<0-1>
+
+    0-31 : czero<00-31>
+
+     3, 2,11,10 : cx<0-3>y0
+     7, 6,15,14 : cx<0-3>y1
+     1, 0, 9, 8 : cx<0-3>y2
+     5, 4,13,12 : cx<0-3>y3
+    19,18,27,26 : cx<0-3>y4
+    23,22,31,30 : cx<0-3>y5
+    17,16,25,24 : cx<0-3>y6
+    21,20,29,28 : cx<0-3>y7
+
+      32-43 : j0Ay<0-7>, j0Bx<0-3>
+      44-55 : j1Ay<0-7>, j1Bx<0-3>
+      56-67 : j2Ay<0-7>, j2Bx<0-3>
+      68-79 : j3Ay<0-7>, j3Bx<0-3>
+
+      80-83 : loadB<0-3>
+      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>
+
+    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>
+
+    110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb
+    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, time_step
+
+    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
+    40-47 : c<0-3>, d3, d2, d1, d0
+    48-63 : H00y<0-1>, H04y<0-1>, H08y<0-1>, H12y<0-1>, h0, h1, h2, h3, baseC<0-1>, baseH<0-1>
+    64-68 : blkId, nextBlk, lockAddr<0-1>, lockVal
+   69-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags, xcutoff, offsetC, offsetH, numBlk, predSave, ldh1, ldh4, ldh60
+
+</REGISTER_MAPPING>
+
+--:-:1:-:1      S2R tid,  SR_TID.X;   // prologue: thread index; sets write barrier 1 (waited on with mask 01 below)
+--:-:2:-:1      S2R blkA, SR_CTAID.Y; // block index for the A tile (used as blkA*128); write barrier 2
+--:-:3:-:1      S2R blkB, SR_CTAID.Z; // block index for the B tile (used as blkB*32); write barrier 3
+
+--:-:-:-:1      MOV time_step, RZ; // time_step = 0 before the first pass through RNN_LOOP
+--:-:-:-:1      MOV flags, param_flags;
+
+RNN_LOOP:
+// Per-time-step setup. No branch to RNN_LOOP exists in this file; presumably the included common file jumps back here - TODO confirm.
+<SCHEDULE_BLOCK>
+--:-:-:-:1      MOV k,    param_k; // working copy of k; the main loop subtracts 16 per iteration
+--:-:-:-:1      MOV lda,  param_lda8;
+--:-:-:-:1      MOV ldb,  param_ldb8;
+--:-:-:-:1      SHR.U32 lda, lda, 5; // lda = param_lda8 >> 5 (presumably the element stride - TODO confirm)
+--:-:-:-:1      SHR.U32 ldb, ldb, 5; // ldb = param_ldb8 >> 5
+--:-:-:-:1      MOV ldaz, param_ldaz;
+--:-:-:-:1      MOV ldbz, param_ldbz;
+--:-:-:-:1      SHL lda16, lda, 6; // lda16 = lda << 6: amount added to each track<n>A pointer per tile
+--:-:-:-:1      SHL ldb16, ldb, 6; // ldb16 = ldb << 6: amount added to trackB per tile
+--:-:-:-:1      SHL lda4,  lda, 2; // lda4 = lda << 2: element offset between ta0, ta1, ta2, ta3
+
+--:-:-:-:1      STS.128 [addr_zero], RZ; // store RZ at addr_zero; the LDS.U.128 loads below read it back into czero<00-31>
+<CODE>
+    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7;  # 8 loads: czero00, czero04, ..., czero28
+</CODE>
+
+--:-:-:-:6      LOP.AND.NZ   P0, RZ, flags, 4; // P0 = (flags & 4) != 0
+--:-:-:-:6  @P0 IADD offsetB, -time_step, param_unrolling; // flag set: offsetB = param_unrolling - 1 - time_step (this line and the next)
+--:-:-:-:6  @P0 IADD offsetB, offsetB,    -1;
+--:-:-:-:6 @!P0 MOV  offsetB, time_step; // flag clear: offsetB = time_step
+
+// baseB = param_B + (dimB * offsetB) * 4, where offsetB is selected by flags & 4 above
+--:-:-:-:1      XMAD     offsetB,   offsetB,   param_dimB, RZ;
+--:-:-:-:1      LEA      baseB0.CC, offsetB,   param_B[0],     2;
+--:-:-:-:1      LEA.HI.X baseB1,    offsetB,   param_B[1], RZ, 2;
+
+// tidAX = (tid & 31) << 2
+// tidAY = (tid >> 5)
+01:-:-:-:1      LOP.AND tidAX, tid,   31;
+--:-:-:-:1      SHL     tidAX, tidAX, 2;
+--:-:-:-:1      SHR.U32 tidAY, tid,   5;
+
+// tidBX = (tid & 7) << 2
+// tidBY = (tid >> 3)
+01:-:-:-:1      LOP.AND tidBX, tid,   7;
+--:-:-:-:1      SHL     tidBX, tidBX, 2;
+--:-:-:-:1      SHR.U32 tidBY, tid,   3;
+
+// trackA += (blkA*128 + tidAX + lda*tidAY) * 4
+02:-:-:-:1      ISCADD   txa, blkA, tidAX, 7; // txa = (blkA << 7) + tidAX; mask 02 waits on barrier 2 (blkA)
+--:-:-:-:1      XMAD.LO2 ta0, lda,  tidAY, txa;
+08:-:-:-:1      XMAD.LO2 ta0, ldaz, RZ,    ta0; // multiplies ldaz by RZ; NOTE(review): mask 08 waits on barrier 4, which the S2R prologue (barriers 1-3) never sets - confirm it is a harmless leftover
+--:-:-:-:1      IADD     ta1, ta0, lda4;
+--:-:-:-:1      IADD     ta2, ta1, lda4;
+--:-:-:-:1      IADD     ta3, ta2, lda4;
+
+--:-:-:-:1      LEA      track0A0.CC, ta0, param_A[0],     2; // each track<n>A = param_A + (ta<n> << 2), as a low/high pair with carry
+--:-:-:-:1      LEA.HI.X track0A1,    ta0, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track1A0.CC, ta1, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track1A1,    ta1, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track2A0.CC, ta2, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track2A1,    ta2, param_A[1], RZ, 2;
+--:-:-:-:1      LEA      track3A0.CC, ta3, param_A[0],     2;
+--:-:-:-:1      LEA.HI.X track3A1,    ta3, param_A[1], RZ, 2;
+
+// trackB += (blkB*32 + ldb*tidBY + tidBX) * 4
+04:-:-:-:1      ISCADD   txb, blkB, tidBX,  5; // txb = (blkB << 5) + tidBX; mask 04 waits on barrier 3 (blkB)
+--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
+08:-:-:-:1      XMAD.LO2 tb,  ldbz, RZ,    tb; // multiplies ldbz by RZ; same barrier-4 wait as the ta0 line above
+--:-:-:-:1      LEA      trackB0.CC, tb, baseB0,     2; // trackB is based on baseB (the per-time-step B pointer), not param_B
+--:-:-:-:1      LEA.HI.X trackB1,    tb, baseB1, RZ, 2;
+
+// writeAs = (tidAY*128 + tidAX) * 4
+--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
+--:-:-:-:1      ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2; // (writeAs << 2) plus the size of one A+B buffer (the same distance swapBuf uses)
+
+// writeBs = (tidBY*32 + tidBX) * 4
+--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 5;
+--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2; // (writeBs << 2) plus one A+B buffer plus one more A area (szShareA)
+
+// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
+--:-:-:-:1      LOP.AND tid1,   tid,    1;
+--:-:-:-:1      LOP.AND readAs, tid,    0x70;
+--:-:-:-:1      SHR.U32 readAs, readAs, 3;
+--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
+--:-:-:-:1      SHL     readAs, readAs, 4;
+// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>;
+--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
+--:-:-:-:1      ISCADD  readBs, readBs, 4x<szShareA>, 4;
+
+--:-:-:-:1      MOV32I swapBuf, -4x<szShareA + szShareB>; // negative buffer size: added to write pointers, subtracted from read pointers, then negated at each swap
+</SCHEDULE_BLOCK>
+
+REMAINDER:
+
+<SCHEDULE_BLOCK>
+// Tile load (also the REMAINDER re-entry point): fetch load0A-load3A and loadB under bounds predicates; lanes that fail get zeros (LDS from addr_zero in the vector build, MOV RZ in the scalar build).
+--:-:-:-:1      IADD tidAY1, tidAY, 4;  // k indices checked for the track1A / track2A / track3A loads
+--:-:-:-:1      IADD tidAY2, tidAY, 8;
+--:-:-:-:1      IADD tidAY3, tidAY, 12;
+
+<CODE>
+    our $vec;  # build flag set outside this block: true selects the 128-bit load variant, false the four-scalar-loads variant
+    return $vec ? q{
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+
+--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY,  k, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, tidAY1, k, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY2, k, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY3, k, P5;
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY,  k, P6;
+
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E.128 load0A, [track0A];
+--:-:2:-:1  @P1 LDG.E.128 load1A, [track1A];
+--:-:3:-:1  @P2 LDG.E.128 load2A, [track2A];
+--:-:4:-:1  @P3 LDG.E.128 load3A, [track3A];
+--:-:5:-:1  @P4 LDG.E.128 loadB,  [trackB];
+</ORDERED>
+
+<ORDERED>
+--:-:6:-:1 @!P0 LDS.U.128 load0A, [addr_zero];
+--:-:6:-:1 @!P1 LDS.U.128 load1A, [addr_zero];
+--:-:6:-:1 @!P2 LDS.U.128 load2A, [addr_zero];
+--:-:6:-:1 @!P3 LDS.U.128 load3A, [addr_zero];
+--:-:6:-:2 @!P4 LDS.U.128 loadB,  [addr_zero];
+</ORDERED>
+
+    } : q{
+
+--:-:-:-:1      IADD txa1,  txa,  1;
+--:-:-:-:1      IADD txa2,  txa,  2;
+--:-:-:-:1      IADD txa3,  txa,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P4;
+<ORDERED>
+--:-:1:-:1  @P0 LDG.E load0A0, [track0A + 4x<0>];
+--:-:1:-:1  @P1 LDG.E load0A1, [track0A + 4x<1>];
+--:-:1:-:1  @P2 LDG.E load0A2, [track0A + 4x<2>];
+--:-:1:-:1  @P3 LDG.E load0A3, [track0A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load0A0, RZ;
+--:-:-:-:1 @!P1 MOV load0A1, RZ;
+--:-:-:-:1 @!P2 MOV load0A2, RZ;
+--:-:-:-:1 @!P3 MOV load0A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY1, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:2:-:1  @P0 LDG.E load1A0, [track1A + 4x<0>];
+--:-:2:-:1  @P1 LDG.E load1A1, [track1A + 4x<1>];
+--:-:2:-:1  @P2 LDG.E load1A2, [track1A + 4x<2>];
+--:-:2:-:1  @P3 LDG.E load1A3, [track1A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load1A0, RZ;
+--:-:-:-:1 @!P1 MOV load1A1, RZ;
+--:-:-:-:1 @!P2 MOV load1A2, RZ;
+--:-:-:-:1 @!P3 MOV load1A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY2, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P6;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P6;
+<ORDERED>
+--:-:3:-:1  @P0 LDG.E load2A0, [track2A + 4x<0>];
+--:-:3:-:1  @P1 LDG.E load2A1, [track2A + 4x<1>];
+--:-:3:-:1  @P2 LDG.E load2A2, [track2A + 4x<2>];
+--:-:3:-:1  @P3 LDG.E load2A3, [track2A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load2A0, RZ;
+--:-:-:-:1 @!P1 MOV load2A1, RZ;
+--:-:-:-:1 @!P2 MOV load2A2, RZ;
+--:-:-:-:1 @!P3 MOV load2A3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY3, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txa,  param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txa1, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txa2, param_m, P5;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txa3, param_m, P5;
+<ORDERED>
+--:-:4:-:1  @P0 LDG.E load3A0, [track3A + 4x<0>];
+--:-:4:-:1  @P1 LDG.E load3A1, [track3A + 4x<1>];
+--:-:4:-:1  @P2 LDG.E load3A2, [track3A + 4x<2>];
+--:-:4:-:1  @P3 LDG.E load3A3, [track3A + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV load3A0, RZ;
+--:-:-:-:1 @!P1 MOV load3A1, RZ;
+--:-:-:-:1 @!P2 MOV load3A2, RZ;
+--:-:-:-:1 @!P3 MOV load3A3, RZ;
+
+--:-:-:-:1      IADD txb1,  txb,  1;
+--:-:-:-:1      IADD txb2,  txb,  2;
+--:-:-:-:1      IADD txb3,  txb,  3;
+
+--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY, k, PT;
+--:-:-:-:1      ISETP.LT.AND P0, PT, txb,  param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P1, PT, txb1, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P2, PT, txb2, param_n, P4;
+--:-:-:-:1      ISETP.LT.AND P3, PT, txb3, param_n, P4;
+<ORDERED>
+--:-:5:-:1  @P0 LDG.E loadB0, [trackB + 4x<0>];
+--:-:5:-:1  @P1 LDG.E loadB1, [trackB + 4x<1>];
+--:-:5:-:1  @P2 LDG.E loadB2, [trackB + 4x<2>];
+--:-:5:-:1  @P3 LDG.E loadB3, [trackB + 4x<3>];
+</ORDERED>
+--:-:-:-:1 @!P0 MOV loadB0, RZ;
+--:-:-:-:1 @!P1 MOV loadB1, RZ;
+--:-:-:-:1 @!P2 MOV loadB2, RZ;
+--:-:-:-:1 @!P3 MOV loadB3, RZ;
+
+--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;
+--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;
+    };
+</CODE>
+
+--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P5; // P2, P3, P5 = (k >= 32) && (txa < m): guards for the next A prefetch
+--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;
+--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6; // P6 = (k >= 32) && (txb < n): guard for the next B prefetch
+
+// bDoRemainder = k & 15 && k > 16
+--:-:-:-:0      LOP.AND.NZ P1, RZ, k, 15;
+
+</SCHEDULE_BLOCK>
+
+21:-:-:-:1      STS.128 [writeAs + 4x<0*128>], load0A; // mask 21: barriers 1 (load0A) and 6 (vector build's LDS zero-fill of every load register)
+--:-:-:-:6      IADD   track0A0.CC, track0A0, lda16; // 64-bit track0A += lda16: low word, sets carry
+--:-:-:-:0      IADD.X track0A1,    track0A1, RZ; // high word += carry
+
+02:-:-:-:1      STS.128 [writeAs + 4x<4*128>], load1A; // mask 02: barrier 2 (load1A)
+--:-:-:-:6      IADD   track1A0.CC, track1A0, lda16;
+--:-:-:-:0      IADD.X track1A1,    track1A1, RZ;
+
+04:-:-:-:1      STS.128 [writeAs + 4x<8*128>], load2A; // mask 04: barrier 3 (load2A)
+--:-:-:-:6      IADD   track2A0.CC, track2A0, lda16;
+--:-:-:-:0      IADD.X track2A1,    track2A1, RZ;
+
+08:-:-:-:1      STS.128 [writeAs + 4x<12*128>], load3A; // mask 08: barrier 4 (load3A)
+--:-:-:-:6      IADD   track3A0.CC, track3A0, lda16;
+--:-:-:-:0      IADD.X track3A1,    track3A1, RZ;
+
+10:-:-:-:1      STS.128 [writeBs], loadB; // mask 10: barrier 5 (loadB)
+--:-:-:-:1      IADD   trackB0.CC, trackB0, ldb16; // low word of trackB += ldb16; the matching IADD.X is the last instruction of this section
+
+--:-:-:-:1      ISETP.GT.AND P1, PT, k, 16, P1; // completes bDoRemainder: (k & 15) && k > 16
+
+--:-:-:-:1      IADD readBs,  readBs, -swapBuf; // read pointers move to the buffer that was just written
+--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
+--:-:-:-:5      BAR.SYNC 0; // synchronize the thread block before the stored tile is read
+--:-:-:-:1      IADD writeBs, writeBs, swapBuf; // write pointers move to the other buffer
+--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
+--:-:-:-:1      IADD swapBuf, RZ, -swapBuf; // flip the sign so the next swap goes the other way
+
+--:-:-:-:0      IADD.X trackB1,    trackB1, RZ; // high word of the trackB add started above; relies on the carry surviving the instructions in between
+
+<CODE>
+    our $vec;  # build flag set outside this block; both variants prefetch the next tile under P5 (A) / P6 (B) and set barriers 3-6, which the %insert STS entries wait on
+    return $vec ? q{
+--:-:3:-:1  @P5 LDG.E.128 load0A, [track0A];
+--:-:4:-:1  @P5 LDG.E.128 load1A, [track1A];
+--:-:5:-:1  @P5 LDG.E.128 load2A, [track2A];
+--:-:5:-:1  @P5 LDG.E.128 load3A, [track3A];
+--:-:6:-:1  @P6 LDG.E.128 loadB,  [trackB];
+    } : q{
+--:-:3:-:1  @P5 LDG.E load0A0, [track0A + 4x<0>];
+--:-:3:-:1  @P5 LDG.E load0A1, [track0A + 4x<1>];
+--:-:3:-:1  @P5 LDG.E load0A2, [track0A + 4x<2>];
+--:-:3:-:1  @P5 LDG.E load0A3, [track0A + 4x<3>];
+
+--:-:4:-:1  @P5 LDG.E load1A0, [track1A + 4x<0>];
+--:-:4:-:1  @P5 LDG.E load1A1, [track1A + 4x<1>];
+--:-:4:-:1  @P5 LDG.E load1A2, [track1A + 4x<2>];
+--:-:4:-:1  @P5 LDG.E load1A3, [track1A + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E load2A0, [track2A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E load2A1, [track2A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E load2A2, [track2A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E load2A3, [track2A + 4x<3>];
+
+--:-:5:-:1  @P5 LDG.E load3A0, [track3A + 4x<0>];
+--:-:5:-:1  @P5 LDG.E load3A1, [track3A + 4x<1>];
+--:-:5:-:1  @P5 LDG.E load3A2, [track3A + 4x<2>];
+--:-:5:-:1  @P5 LDG.E load3A3, [track3A + 4x<3>];
+
+--:-:6:-:1  @P6 LDG.E loadB0, [trackB + 4x<0>];
+--:-:6:-:1  @P6 LDG.E loadB1, [trackB + 4x<1>];
+--:-:6:-:1  @P6 LDG.E loadB2, [trackB + 4x<2>];
+--:-:6:-:1  @P6 LDG.E loadB3, [trackB + 4x<3>];
+    };
+</CODE>
+
+<CODE>
+    our $vec;  # build flag set outside this block: true selects the 128-bit load variant
+    our $shiftAX = 0;  # package variables read outside this file (presumably by the common include); their meaning is not visible here
+    our $shiftBX = 0;
+    our %insert =  # maps slot keys ("j<N>c<M>") to SASS text; the consumer is presumably the included common file
+    (
+        j0c6   => "--:-:-:-:1      IADD k, k, -16;\n",  # k -= 16
+        j0c14  => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, 16, PT;\n",  # P0 = k >= 16: guards the stores, the buffer swap and the loop branch
+
+        j3c6   => "04:3:-:-:1  \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n",  # mask 04 waits for the load0A prefetch (barrier 3); the store sets read barrier 3
+        j5c6   => "08:4:-:-:1  \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n",  # mask 08 waits for load1A (barrier 4); sets read barrier 4
+        j7c6   => "10:-:-:-:1  \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n",  # mask 10 waits for load2A/load3A (barrier 5)
+        j9c6   => "--:5:-:-:1  \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n",  # sets read barrier 5
+        j11c6  => "20:6:-:-:1  \@P0 STS.128 [writeBs], loadB;\n",  # mask 20 waits for loadB (barrier 6); sets read barrier 6
+
+        j3c7   => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, lda16;\n",  # pointers advance only under the prefetch guards P2/P3/P5/P6
+        j3c13  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
+        j5c7   => "--:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, lda16;\n",
+        j5c13  => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",
+        j7c7   => "--:-:-:-:1  \@P5 IADD   track2A0.CC, track2A0, lda16;\n",
+        j7c13  => "--:-:-:-:1  \@P5 IADD.X track2A1,    track2A1, RZ;\n",
+        j9c7   => "--:-:-:-:1  \@P5 IADD   track3A0.CC, track3A0, lda16;\n",
+        j9c13  => "--:-:-:-:1  \@P5 IADD.X track3A1,    track3A1, RZ;\n",
+        j11c7  => "--:-:-:-:1  \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
+        j11c13 => "--:-:-:-:1  \@P6 IADD.X trackB1,     trackB1,  RZ;\n",
+
+        j3c14  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, 32, P2;\n",  # each guard becomes (k >= 32) && its previous value
+        j5c14  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, 32, P3;\n",
+        j9c14  => "--:-:-:-:1      ISETP.GE.AND P5, PT, k, 32, P5;\n",  # P5 covers both load2A and load3A
+        j11c14 => "--:-:-:-:1      ISETP.GE.AND P6, PT, k, 32, P6;\n",
+
+        j13c31 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .  # when looping: synchronize, then swap the read/write buffers
+                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
+                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",
+
+        ($vec ?
+            (
+                j3c29  => "04:-:3:-:1  \@P2 LDG.E.128 load0A, [track0A];\n",
+                j5c29  => "08:-:4:-:1  \@P3 LDG.E.128 load1A, [track1A];\n",
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E.128 load2A, [track2A];\n",
+                j9c31  => "--:-:5:-:1  \@P5 LDG.E.128 load3A, [track3A];\n",
+                j11c29 => "20:-:6:-:1  \@P6 LDG.E.128 loadB,  [trackB];\n",
+            ) :
+            (
+                j3c29  => "04:-:-:-:1  \@P2 LDG.E load0A0, [track0A + 4x<0>];\n",
+                j3c31  => "--:-:-:-:1  \@P2 LDG.E load0A1, [track0A + 4x<1>];\n",
+                j4c1   => "--:-:-:-:1  \@P2 LDG.E load0A2, [track0A + 4x<2>];\n",
+                j4c3   => "--:-:3:-:1  \@P2 LDG.E load0A3, [track0A + 4x<3>];\n",
+
+                j5c29  => "08:-:-:-:1  \@P3 LDG.E load1A0, [track1A + 4x<0>];\n",
+                j5c31  => "--:-:-:-:1  \@P3 LDG.E load1A1, [track1A + 4x<1>];\n",
+                j6c1   => "--:-:-:-:1  \@P3 LDG.E load1A2, [track1A + 4x<2>];\n",
+                j6c3   => "--:-:4:-:1  \@P3 LDG.E load1A3, [track1A + 4x<3>];\n",
+
+                j9c29  => "10:-:-:-:1  \@P5 LDG.E load2A0, [track2A + 4x<0>];\n",
+                j9c31  => "--:-:-:-:1  \@P5 LDG.E load2A1, [track2A + 4x<1>];\n",
+                j10c1  => "--:-:-:-:1  \@P5 LDG.E load2A2, [track2A + 4x<2>];\n",
+                j10c3  => "--:-:-:-:1  \@P5 LDG.E load2A3, [track2A + 4x<3>];\n",
+
+                j10c8  => "--:-:-:-:1  \@P5 LDG.E load3A0, [track3A + 4x<0>];\n",
+                j10c10 => "--:-:-:-:1  \@P5 LDG.E load3A1, [track3A + 4x<1>];\n",
+                j10c12 => "--:-:-:-:1  \@P5 LDG.E load3A2, [track3A + 4x<2>];\n",
+                j10c14 => "--:-:5:-:1  \@P5 LDG.E load3A3, [track3A + 4x<3>];\n",
+
+                j11c29 => "20:-:-:-:1  \@P6 LDG.E loadB0, [trackB + 4x<0>];\n",
+                j11c31 => "--:-:-:-:1  \@P6 LDG.E loadB1, [trackB + 4x<1>];\n",
+                j12c1  => "--:-:-:-:1  \@P6 LDG.E loadB2, [trackB + 4x<2>];\n",
+                j12c3  => "--:-:6:-:1  \@P6 LDG.E loadB3, [trackB + 4x<3>];\n",
+            )
+        ),
+
+        j15c31 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .  # loop again while P0 holds, otherwise re-enter REMAINDER when P1 holds
+                  "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n"
+    );
+    return '';  # emits no text itself; the block only populates the package variables above
+</CODE>
+
+<INCLUDE file="sgemm_rnn_bprop_common_128x32.sass"/>
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..34711e0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+# DeepPerf
+
+DeepPerf is developed to understand GPU microarchitectural features and improve performance for compute-intensive kernels. The methodology relies on a reverse engineering approach to crack the GPU ISA encodings in order to build a GPU assembler. An assembly microbenchmark suite correlates microarchitectural features with their performance factors to uncover instruction-level and memory hierarchy preferences.
+We use SGEMM and convolution as examples to show how to achieve bare-metal performance tuning. You can use this SASS code directly in your deep learning framework to speed it up.
+
+The toolchain is an attempt to automatically crack different GPU ISA encodings and build an assembler adaptively for the purpose of performance enhancements to applications on GPUs.
+This folder contains three directories, which correspond to the three major steps of optimizing CUDA code at the assembly level. All the tools cover three recent NVIDIA GPU architectures: Kepler, Maxwell, and Pascal.
+
+
diff --git a/Solver/.gitignore b/Solver/.gitignore
new file mode 100644
index 0000000..1a8323b
--- /dev/null
+++ b/Solver/.gitignore
@@ -0,0 +1,2 @@
+data/*
+output/*
diff --git a/Solver/README.md b/Solver/README.md
new file mode 100644
index 0000000..53a0f21
--- /dev/null
+++ b/Solver/README.md
@@ -0,0 +1,32 @@
+
+# Cracking GPU ISA Encodings
+
+## Output
+
+* Bit positions of opcodes
+* Bit positions of operands for different operand type
+* Bit positions of modifiers for each instruction
+
+## How to run the workflow?
+
+The workflow is composed of four stages:
+
+1. Generate PTX code->`./bin/generate_disassemble [arch]`
+    * Generate PTX code (.ptx) in ptxgen directory and compile PTX to cubin; 
+    * Disassemble cubins to sass files, which feed into the following three solvers;
+    * Each line of sass files looks like this:
+    
+    `/*0048*/ IADD R0, R2, R0; /*0x4800000000201c03*/`
+    
+2. Opcode solver->`./bin/opcode [arch]`
+    * Probe the 64-bit binary code of the instructions in the sass files by flipping one bit at a time and observing whether the opcode changes;
+    
+3. Modifier solver->`./bin/modifier [arch]`
+    * Probe the 64-bit binary code of the instructions in the sass files by flipping one bit at a time and observing whether the modifiers change;
+    * Enumerate all bit combinations on the modifier positions to generate all the modifiers;
+    
+4. Operand solver->`./bin/operand [arch]`
+    * Probe the 64-bit binary code of the instructions in the sass files by flipping one bit at a time and observing whether the operands change;
+    * Operand type: R: Register, S: Special Register, I: Immediate, C: constant[][], M: Memory, P: Predicate;
+
+5. Allowed values for `[arch]` options:  'sm_30','sm_32','sm_35','sm_37','sm_50','sm_52','sm_53','sm_60','sm_61','sm_62'.
diff --git a/Solver/bin/generate_disassemble b/Solver/bin/generate_disassemble
new file mode 100755
index 0000000..e37b189
--- /dev/null
+++ b/Solver/bin/generate_disassemble
@@ -0,0 +1,93 @@
+#!/bin/bash
+#
+# Stage 1 of the solver workflow: generate PTX for one architecture, compile
+# it to cubins, disassemble the cubins to sass and merge the unique
+# instructions into data/<arch>/<arch>.sass, the input of the solvers.
+# Also builds the test cubin data/<arch>/<arch>.cubin from src/test.cu.
+#
+# Usage: ./bin/generate_disassemble <arch>
+# Paths are relative, so run it from the directory holding src/ and data/.
+
+# Please input architecture parameter argv[1]
+if [ "$#" -lt 1 ]
+then
+    echo "Please input architecture parameter argv[1]"
+    # Exit statuses are 0-255; a negative value is not portable
+    exit 1
+fi
+
+arch=$1
+prefix="data/${arch}/"
+src_directory="src/"
+separator=".................................................................."
+
+echo "Arch: $arch"
+echo "Data directory: $prefix"
+
+# 1. Generate ptx
+ptx_directory="${prefix}ptx/"
+mkdir -p "$ptx_directory"
+echo "$separator"
+echo "1. Generate .ptx files to $ptx_directory directory"
+echo "It may take several minutes"
+echo "$separator"
+perl "${src_directory}ptxgen.pl" "$arch" "$ptx_directory"
+
+# 2. Compile to cubins
+cubin_directory="${prefix}cubin/"
+mkdir -p "$cubin_directory"
+echo "$separator"
+echo "2. Compile .ptx file to cubin files in $cubin_directory directory"
+echo "It may take several minutes"
+echo "$separator"
+for p in "$ptx_directory"*.ptx
+do
+    # An unmatched glob stays literal; skip it instead of compiling "*.ptx"
+    [ -e "$p" ] || continue
+    # basename keeps the whole file name: cutting on "/" and "." would depend
+    # on the directory depth and truncate names at their first dot
+    f=$(basename "$p" .ptx)
+    fout="${cubin_directory}${f}.cubin"
+    echo "$fout"
+    # ptxas output is discarded and a failed compile does not stop the run
+    ptxas -arch "$arch" -m 64 "$p" -o "$fout" > /dev/null 2>&1
+done
+
+# 3. Disassembly to sass
+asm_directory="${prefix}asm/"
+mkdir -p "$asm_directory"
+echo "$separator"
+echo "3. Disassemble .cubin file to sass files in $asm_directory directory"
+echo "It may take several minutes"
+echo "$separator"
+for p in "$cubin_directory"*.cubin
+do
+    [ -e "$p" ] || continue
+    f=$(basename "$p" .cubin)
+    fout="${asm_directory}${f}.sass"
+    echo "$fout"
+    cuobjdump --gpu-architecture "$arch" --dump-sass "$p" > "$fout"
+done
+
+# 4.Put all sass results in one file
+echo "$separator"
+echo "4. Gathering results from ptxgen"
+echo "$separator"
+# Private scratch files: fixed names in /tmp collide when two architectures
+# are processed at once and can be pre-created by another user
+all_sass=$(mktemp /tmp/all.sass.XXXXXX) || exit 1
+all_inst_sass=$(mktemp /tmp/all_inst.sass.XXXXXX) || exit 1
+trap 'rm -f "$all_sass" "$all_inst_sass"' EXIT
+
+for f in "$asm_directory"*.sass
+do
+    [ -e "$f" ] || continue
+    cat "$f" >> "$all_sass"
+done
+
+# Ignore non-instruction lines
+awk '{if (NF >= 5) {$1 = ""; print $0} }' "$all_sass" > "$all_inst_sass"
+# Make instruction uniq
+python "${src_directory}unique.py" "$all_inst_sass" > "${prefix}${arch}.sass"
+# Generate test cubin
+nvcc -cubin -arch "$arch" "${src_directory}test.cu" -o "${prefix}${arch}.cubin"
diff --git a/Solver/bin/modifier b/Solver/bin/modifier
new file mode 100755
index 0000000..68bdea3
--- /dev/null
+++ b/Solver/bin/modifier
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Run the modifier solver (src/modifier.py) on data/<arch>/<arch>.sass and
+# write its log to output/<arch>/<arch>.modifier.
+# Usage: ./bin/modifier <arch>
+
+# Please input architecture parameter argv[1]
+if [ "$#" -lt 1 ]
+then
+    echo "Please input architecture parameter argv[1]"
+    # Exit statuses are 0-255; a negative value is not portable
+    exit 1
+fi
+
+arch=$1
+prefix="data/${arch}/"
+src_directory="src/"
+asm_directory=$prefix
+output_directory="output/${arch}/"
+output_file="${output_directory}${arch}.modifier"
+mkdir -p "$output_directory"
+# Remove any previous result before the solver writes a new one
+rm -rf "$output_file" || true
+echo "Output file: $output_file"
+python "${src_directory}modifier.py" "${asm_directory}${arch}.sass" "$arch" "$output_file"
diff --git a/Solver/bin/opcode b/Solver/bin/opcode
new file mode 100755
index 0000000..7f25fb1
--- /dev/null
+++ b/Solver/bin/opcode
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Run the opcode solver (src/opcode.py) on data/<arch>/<arch>.sass and
+# write its log to output/<arch>/<arch>.opcode.
+# Usage: ./bin/opcode <arch>
+
+# Please input architecture parameter argv[1]
+if [ "$#" -lt 1 ]
+then
+    echo "Please input architecture parameter argv[1]"
+    # Exit statuses are 0-255; a negative value is not portable
+    exit 1
+fi
+
+arch=$1
+prefix="data/${arch}/"
+src_directory="src/"
+asm_directory=$prefix
+output_directory="output/${arch}/"
+output_file="${output_directory}${arch}.opcode"
+mkdir -p "$output_directory"
+# Remove any previous result before the solver writes a new one
+rm -rf "$output_file" || true
+echo "Output file: $output_file"
+python "${src_directory}opcode.py" "${asm_directory}${arch}.sass" "$arch" "$output_file"
diff --git a/Solver/bin/operand b/Solver/bin/operand
new file mode 100755
index 0000000..5c6d9e4
--- /dev/null
+++ b/Solver/bin/operand
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Run the operand solver (src/operand.py) on data/<arch>/<arch>.sass and
+# write its log to output/<arch>/<arch>.operand.
+# Usage: ./bin/operand <arch>
+
+# Please input architecture parameter argv[1]
+if [ "$#" -lt 1 ]
+then
+    echo "Please input architecture parameter argv[1]"
+    # Exit statuses are 0-255; a negative value is not portable
+    exit 1
+fi
+
+arch=$1
+prefix="data/${arch}/"
+src_directory="src/"
+asm_directory=$prefix
+output_directory="output/${arch}/"
+output_file="${output_directory}${arch}.operand"
+mkdir -p "$output_directory"
+# Remove any previous result before the solver writes a new one
+rm -rf "$output_file" || true
+echo "Output file: $output_file"
+python "${src_directory}operand.py" "${asm_directory}${arch}.sass" "$arch" "$output_file"
diff --git a/Solver/src/__init__.py b/Solver/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Solver/src/dumper.py b/Solver/src/dumper.py
new file mode 100644
index 0000000..934c012
--- /dev/null
+++ b/Solver/src/dumper.py
@@ -0,0 +1,42 @@
+"""Disassemble one 64-bit instruction encoding with the CUDA binary tools."""
+import os
+import struct
+
+def arch2mode(arch):
+    """Turn an architecture name such as "sm_35" into the form "SM35"."""
+    return arch.replace("_", "").upper()
+
+def dump(newcode, arch):
+    """Disassemble one encoding and return the tool output as a string.
+
+    newcode -- the 64-bit encoding as a hex string, e.g. "0x4800000000201c03"
+    arch    -- architecture name, e.g. "sm_35" or "sm_52"
+
+    Architectures below sm_40 write the encoding to a scratch file and run
+    nvdisasm on it.  All others overwrite 8 bytes of data/<arch>/<arch>.cubin
+    in place and run cuobjdump on that cubin.  stderr is folded into the
+    returned text in both cases.
+    """
+    # Compare the version as a number: a string is never "< 40" on Python 2
+    # and raises TypeError on Python 3, which would make the nvdisasm branch
+    # unreachable.
+    version = int(arch.split("_")[1])
+    if version < 40:
+        tmp_bin = "/tmp/tmp_dumper.bin"
+        with open(tmp_bin, "wb") as fout:
+            fout.write(struct.pack("<Q", int(newcode, 16)))
+        cmd = "nvdisasm -b {0} {1} 2>&1".format(arch2mode(arch), tmp_bin)
+        tmp_read = os.popen(cmd).read()
+        os.remove(tmp_bin)
+        return tmp_read
+    else:
+        tmp_cubin = "data/" + arch + "/" + arch + ".cubin"
+        with open(tmp_cubin, 'rb+') as f:
+            # NOTE(review): 904 is presumably the file offset of the
+            # instruction to replace in the test cubin -- confirm per arch.
+            f.seek(904)
+            # Explicit little-endian, the same byte order as the branch above
+            f.write(struct.pack('<Q', int(newcode, 16)))
+        cmd = "cuobjdump -arch {0} -sass {1} 2>&1".format(arch, tmp_cubin)
+        tmp_read = os.popen(cmd).read()
+        return tmp_read
diff --git a/Solver/src/enumerator.py b/Solver/src/enumerator.py
new file mode 100644
index 0000000..1d98969
--- /dev/null
+++ b/Solver/src/enumerator.py
@@ -0,0 +1,33 @@
+"""Try every value of a group of encoding bits and log what each one decodes to."""
+from dumper import dump
+import logging
+
+def enumerate(base, pos, arch):
+    """Log the disassembly of base for each setting of the bits listed in pos.
+
+    base -- the 64-bit instruction encoding as an integer
+    pos  -- list of bit positions to enumerate
+    arch -- architecture name, e.g. "sm_35"
+
+    Combinations whose disassembly is empty or contains "?", "error" or
+    "INVALID" are skipped.  Note that this name shadows the builtin
+    enumerate inside this module.
+    """
+    # Index of the disassembly line that is parsed: 1 below sm_40, otherwise 5
+    inst_line = 1 if int(arch.split("_")[1]) < 40 else 5
+    # base with every enumerated position cleared
+    cleared = base
+    for bit in pos:
+        cleared = cleared & (~(1 << bit))
+    for value in range(1 << len(pos)):
+        # Scatter the binary digits of value onto the enumerated positions
+        bits = 0x0
+        for j, bit in zip(range(len(pos)), pos):
+            bits |= ((value >> j) & 0x1) << bit
+        text = dump("0x{:016x}".format(cleared | bits), arch)
+        if not text or any(mark in text for mark in ("?", "error", "INVALID")):
+            continue
+        tokens = text.split("\n")[inst_line].split()
+        # Drop the first token of the line before logging the rest
+        tokens.pop(0)
+        logging.info("0b{:064b}".format(bits) + ": " + " ".join(tokens))
diff --git a/Solver/src/inst.py b/Solver/src/inst.py
new file mode 100644
index 0000000..3ab66c6
--- /dev/null
+++ b/Solver/src/inst.py
@@ -0,0 +1,62 @@
+"""Parse one disassembled sass instruction into its fields."""
+try:
+    from sets import Set  # Python 2 only; not used in this file
+except ImportError:  # the sets module does not exist on Python 3
+    Set = set
+
+class Inst:
+    """Predicate, opcode, modifiers, operands and encoding of one instruction."""
+
+    def __init__(self, inst, raw = True):
+        """Parse the whitespace-split tokens of one instruction.
+
+        inst -- list of tokens; the list itself is left untouched
+        raw  -- True when the tokens end with the binary encoding (the
+                second-to-last token) as in cuobjdump output, False when
+                there is no encoding (nvdisasm output)
+        """
+        # Work on a copy so the caller's list is not consumed
+        inst = list(inst)
+        # Fetch binary encoding
+        if raw == True: # From cuobjdump
+            self.__enc = inst[-2]
+            # Drop the three trailing tokens that carry the encoding
+            del inst[-3:]
+        else: # From nvdisasm
+            self.__enc = ""
+
+        # Unpredicated by default, so pred() works for every instruction
+        self.__pred = ""
+        if inst[0] == '{':  # Check dual issue
+            inst.pop(0)
+        if inst[0].find('@') != -1:  # Check predicate, such as @P0
+            self.__pred = inst.pop(0)
+
+        # Remove semicolon of zero operand field instruction such as "RRO;"
+        ops = inst.pop(0).replace(";", "")
+        # Fetch opcode
+        self.__op = ops.split(".")[0]
+        # Split opcode
+        self.__modifier = ops.split(".")[1:]
+        # Fetch operands and remove ; , - and |
+        self.__operands = ' '.join(inst).replace(";", "").replace(",", "").replace("-","").replace("|","")
+
+    def op(self):
+        """Return the opcode, e.g. "IADD"."""
+        return str(self.__op)
+
+    def modifier(self):
+        """Return the modifiers as the string form of a list, e.g. "['E', '64']"."""
+        return str(self.__modifier)
+
+    def enc(self):
+        """Return the encoding token, or "" when parsed with raw=False."""
+        return str(self.__enc)
+
+    def operands(self):
+        """Return the operands joined by spaces, without ; , - and |."""
+        return str(self.__operands)
+
+    def pred(self):
+        """Return the predicate token such as "@P0", or "" when there is none."""
+        return str(self.__pred)
diff --git a/Solver/src/modifier.py b/Solver/src/modifier.py
new file mode 100644
index 0000000..50bfbb7
--- /dev/null
+++ b/Solver/src/modifier.py
@@ -0,0 +1,64 @@
+"""Modifier solver: find which encoding bits select an instruction's modifiers.
+
+Usage: python modifier.py <sass file> <arch> <output file> [instruction limit]
+"""
+from inst import Inst
+from dumper import dump
+import enumerator
+import sys
+import logging
+
+if __name__ == "__main__":
+    logging.basicConfig(filename = sys.argv[3], level = logging.INFO)
+    logging.debug("argv[1]: Disassemble file")
+    logging.debug("argv[2]: Arch")
+    logging.debug("argv[3]: Output file")
+    logging.debug("argv[4]: Instruction limit (default 100)")
+    sass = sys.argv[1]
+    arch = sys.argv[2]
+    if len(sys.argv) >= 5:
+        # argv holds strings; as a string the limit never equals the integer
+        # counter below and would be ignored
+        limit = int(sys.argv[4])
+    else:
+        limit = 100
+    count = 0
+    version = int(arch.split("_")[1])
+    with open(sass) as f:
+        for line in f:
+            pos = []
+            count += 1
+            # The line on which the counter reaches the limit is not processed
+            if count == limit:
+                break
+            line_split = line.split()
+            # Construct instruction structure
+            origin = Inst(line_split)
+            # Find the 64-bit encodings
+            base = int(origin.enc(), 16)
+            # Flip one bit at a time and record the bits that change the
+            # modifiers while leaving the opcode unchanged
+            for i in range(0, 64):
+                mask = 2**i
+                newcode = base ^ mask
+                # Disassemble the new code
+                dump_file = dump("0x{:016x}".format(newcode), arch)
+                # Skip output that is empty or contains "?" or "error"
+                if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1:
+                    dump_lines = dump_file.split("\n")
+                    # Line that is parsed: index 1 below sm_40, otherwise 5
+                    if version < 40:
+                        line_inst = dump_lines[1].split()
+                    else:
+                        line_inst = dump_lines[5].split()
+                    # Drop the first token of the line
+                    line_inst.pop(0)
+                    # Parse the new generated disassembly
+                    inst = Inst(line_inst, raw = version > 40)
+                    if inst.modifier() != origin.modifier() and inst.op() == origin.op():
+                        if i not in pos:
+                            pos.append(i)
+            # Enumerate all modifiers
+            if len(pos) > 0:
+                logging.info("%s modifier bits %s: ", origin.op(), pos)
+                enumerator.enumerate(base, pos, arch)
diff --git a/Solver/src/opcode.py b/Solver/src/opcode.py
new file mode 100644
index 0000000..c7113df
--- /dev/null
+++ b/Solver/src/opcode.py
@@ -0,0 +1,70 @@
+"""Opcode solver: find which encoding bits belong to an instruction's opcode.
+
+Usage: python opcode.py <sass file> <arch> <output file> [instruction limit]
+"""
+from inst import Inst
+from dumper import dump
+import sys
+import logging
+
+if __name__ == "__main__":
+    logging.basicConfig(filename = sys.argv[3], level = logging.INFO)
+    logging.debug("argv[1]: Disassemble file")
+    logging.debug("argv[2]: Arch")
+    logging.debug("argv[3]: Output file")
+    logging.debug("argv[4]: Instruction limit (default 100)")
+    sass = sys.argv[1]
+    arch = sys.argv[2]
+    if len(sys.argv) >= 5:
+        # argv holds strings; as a string the limit never equals the integer
+        # counter below and would be ignored
+        limit = int(sys.argv[4])
+    else:
+        limit = 100
+    count = 0
+    version = int(arch.split("_")[1])
+    with open(sass) as f:
+        for line in f:
+            pos = []
+            bits = 0x0
+            count += 1
+            # The line on which the counter reaches the limit is not processed
+            if count == limit:
+                break
+            line_split = line.split()
+            # Construct instruction structure
+            origin = Inst(line_split)
+            # Find the 64-bit encodings
+            base = int(origin.enc(), 16)
+            # Bit by bit xor, observe whether opcode changes and guess what this bit represent
+            for i in range(0, 64):
+                mask = 2**i
+                newcode = base ^ mask
+                # Disassemble the new code
+                dump_file = dump("0x{:016x}".format(newcode), arch)
+                # Skip output that is empty or contains "?" or "error"
+                if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1:
+                    dump_lines = dump_file.split("\n")
+                    # Line that is parsed: index 1 below sm_40, otherwise 5
+                    if version < 40:
+                        line_inst = dump_lines[1].split()
+                    else:
+                        line_inst = dump_lines[5].split()
+                    # Drop the first token of the line
+                    line_inst.pop(0)
+                    # Parse the new generated disassembly
+                    inst = Inst(line_inst, raw = version > 40)
+                    # If opcode is changed, then this bit represent opcode, we find it!
+                    # LDG and TEX are the same instructions in fact
+                    # RED and ATOM are the same instruction
+                    if inst.op() != origin.op() and not i in pos and not \
+                    (inst.op() == "LDG" and origin.op() == "TEX") and not \
+                    (inst.op() == "TEX" and origin.op() =="LDG") and not \
+                    (inst.op() == "RED" and origin.op() == "ATOM") and not \
+                    (inst.op() == "ATOM" and origin.op() == "RED"):
+                        logging.info("Opcode changes: %s => %s when bit [%d] is flipped from [%d]", \
+                            origin.op(), inst.op(), i, (base >> i) & 0x1)
+                        bits = bits | (((base >> i) & 0x1) << i)
+                        pos.append(i)
+            if len(pos) > 0:
+                logging.info("0b{:064b}".format(bits) + ": %s opcode bits %s: ", origin.op(), pos)
diff --git a/Solver/src/operand.py b/Solver/src/operand.py
new file mode 100644
index 0000000..6b7d12a
--- /dev/null
+++ b/Solver/src/operand.py
@@ -0,0 +1,148 @@
+"""Operand solver: find which encoding bits belong to each operand.
+
+Usage: python operand.py <sass file> <arch> <output file> [instruction limit]
+"""
+try:
+    from sets import Set  # Python 2 only; not used in this file
+except ImportError:  # the sets module does not exist on Python 3
+    Set = set
+from inst import Inst
+from dumper import dump
+import sys
+import logging
+
+# Operand-type signatures already returned, keyed by opcode
+ops = dict()
+
+def _is_integer(text):
+    """Return True when text parses as a number with an integral value."""
+    try:
+        return float(text).is_integer()
+    except ValueError:
+        # Not a number at all (for example "PT"): simply not an integer
+        return False
+
+def check_operand_types(inst):
+    """Return one type letter per operand of inst, or 'X' to skip it.
+
+    Letters: R register, P predicate, C constant memory, M memory,
+    S special register, I immediate.  'X' is returned when an operand
+    cannot be classified, and also when this opcode was already seen with
+    the same signature, so each opcode/signature pair is reported once.
+    """
+    operand_types = ""
+    operands = inst.operands().split()
+    for operand in operands:
+        key = operand[0]
+        if key == 'R': # Register
+            value = operand[1:]
+            if value in ('Z', 'N', 'M', 'P') or _is_integer(value):
+                operand_types += 'R'
+            else:
+                return 'X'
+        elif key == 'P': # Predicate
+            value = operand[1:]
+            if _is_integer(value):
+                operand_types += 'P'
+            else:
+                return 'X'
+        elif key == 'c': # Constant memory
+            operand_types += 'C'
+        elif key == '[': # Memory
+            operand_types += 'M'
+        elif key == 'S': # Special register
+            operand_types += 'S'
+        else:
+            if len(operand) >= 2 and (operand[0:2] == "0x" or operand[0:3] == "-0x"): # Hex immediate
+                operand_types += 'I'
+            elif _is_integer(operand): # Immediate value
+                operand_types += 'I'
+            else:
+                return 'X'
+    seen = ops.setdefault(inst.op(), set())
+    if operand_types in seen:
+        return 'X'
+    seen.add(operand_types)
+    return operand_types
+
+def change(inst, origin):
+    """Tell which field of inst differs from origin.
+
+    Returns -1 when the opcode differs, -2 when the modifiers differ, the
+    index of the first differing operand (>= 0), -4 when inst has fewer
+    operands than origin and the shared ones match, and -3 when nothing
+    compared differs.
+    """
+    if inst.op() != origin.op():
+        return -1
+    elif inst.modifier() != origin.modifier():
+        return -2
+    else:
+        inst_operands = inst.operands().split()
+        origin_operands = origin.operands().split()
+        for i in range(len(origin_operands)):
+            if i >= len(inst_operands):
+                # inst ran out of operands: no operand index to blame
+                return -4
+            if (inst_operands[i] != origin_operands[i]):
+                return i
+        return -3
+
+if __name__ == "__main__":
+    logging.basicConfig(filename = sys.argv[3], level = logging.INFO)
+    logging.debug("argv[1]: Disassemble file")
+    logging.debug("argv[2]: Arch")
+    logging.debug("argv[3]: Output file")
+    logging.debug("argv[4]: Instruction limit (default 100)")
+    sass = sys.argv[1]
+    arch = sys.argv[2]
+    if len(sys.argv) >= 5:
+        # argv holds strings; as a string the limit never equals the integer
+        # counter below and would be ignored
+        limit = int(sys.argv[4])
+    else:
+        limit = 100
+    count = 0
+    version = int(arch.split("_")[1])
+    with open(sys.argv[1]) as f:
+        for line in f:
+            pos = []
+            count += 1
+            # The line on which the counter reaches the limit is not processed
+            if count == limit:
+                break
+            line_split = line.split()
+            # Construct instruction structure
+            origin = Inst(line_split)
+            # Find the 64-bit encodings
+            base = int(origin.enc(), 16)
+            origin_operand_types = check_operand_types(origin)
+            if len(origin.operands()) and origin_operand_types.find('X') == -1:
+                # pp[k] collects the bits whose flip makes operand k the first one that differs
+                pp = [[] for i in range(len(origin_operand_types))]
+                logging.info(origin.op() + " " + origin.modifier())
+                logging.info("0b{:064b}".format(base) + ": " + origin.operands())
+                for i in range(0, 64):
+                    mask = 2**i
+                    newcode = base ^ mask
+                    # Disassemble the new code
+                    dump_file = dump("0x{:016x}".format(newcode), arch)
+                    # Skip output that is empty or contains "?" or "error"
+                    if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1:
+                        dump_lines = dump_file.split("\n")
+                        # Line that is parsed: index 1 below sm_40, otherwise 5
+                        if version < 40:
+                            line_inst = dump_lines[1].split()
+                        else:
+                            line_inst = dump_lines[5].split()
+                        # Drop the first token of the line
+                        line_inst.pop(0)
+                        inst = Inst(line_inst, raw = version > 40)
+                        pos = change(inst, origin)
+                        if pos >= 0:
+                            pp[pos].append(i)
+                            logging.info("0b{:064b}".format(newcode) + ": " + inst.operands())
+                logging.info("Operand combination types: %s", origin_operand_types)
+                for i in range(0, len(pp)):
+                    logging.info("Operand type: %s", origin_operand_types[i])
+                    logging.info("Encoding: %s", pp[i])
diff --git a/Solver/src/ptxgen.pl b/Solver/src/ptxgen.pl
new file mode 100644
index 0000000..ce985b5
--- /dev/null
+++ b/Solver/src/ptxgen.pl
@@ -0,0 +1,339 @@
+#!/usr/bin/perl -sw
+# Mass RE tool, generates large amounts of .ptx
+use Data::Dumper;
+use warnings;
+
+sub cartesian{
+    # Cartesian product of the array refs in @_; returns a list of array refs, one per combination.
+    my @product = ([]);
+    foreach my $set (reverse @_){
+        # Prepend each element of the current set to every partial combination built so far
+        @product = map{ my $head = $_; map{ [$head, @$_] } @product } @$set;
+    }
+    return @product;
+}
+
+sub fprint{ # Write $content to $filename, truncating it; false or empty content writes nothing.
+    my($filename,$content)=@_;
+    return if not ($content);
+    open(my $fh, '>', $filename) or die "can't open file: $filename: $!\n"; # 3-arg open: the name is never parsed as a mode
+    print {$fh} $content;
+    close($fh) or die "can't close file: $filename: $!\n"; # buffered write errors surface at close
+}
+
+sub gen_ptx{
+    # Build the PTX source of one "bench" kernel from the hash ref $desc and write it to $desc->{outfile}.
+    my ($desc) = @_;
+    my @widths = (8,16,32,64,128);
+    my $ptx = ".version $desc->{ver}\n.target $desc->{arch}\n";
+    $ptx .= ".entry bench(.param .u64 I){\n    .reg .b64   ptr;\n";
+    # Declarations: predicates (pi/po), then input (r<width>i) and output (r<width>o) registers
+    foreach my $pred (qw(pi po)){
+        $ptx .= "    .reg .pred  ${pred}<$desc->{$pred}>;\n" if $desc->{$pred};
+    }
+    foreach my $dir (qw(i o)){
+        foreach my $w (@widths){
+            my $reg = "r${w}${dir}";
+            $ptx .= "    .reg .b${w}   ${reg}<$desc->{$reg}>;\n" if $desc->{$reg};
+        }
+    }
+    $ptx .= "    ld.param.u64 ptr, [I];\n    cvta.to.global.u64  ptr, ptr;\n";
+    # Set each input predicate and load each input register ahead of the instruction
+    for(my $n=0; $n < ($desc->{pi}||0); $n++){
+        $ptx .= "    setp.ne.u64     pi${n}, ptr, ${n};\n";
+    }
+    foreach my $w (@widths){
+        my $reg = "r${w}i";
+        for(my $n=0; $n < ($desc->{$reg}||0); $n++){
+            $ptx .= "    ldu.global.b${w} ${reg}${n}, [ptr+".($n*$w/8)."];\n";
+        }
+    }
+    $ptx .= "    $desc->{insn}\n";
+    # Store each output register, then one predicated byte store per output predicate
+    foreach my $w (@widths){
+        my $reg = "r${w}o";
+        for(my $n=0; $n < ($desc->{$reg}||0); $n++){
+            $ptx .= "    st.global.b${w} [ptr+".($n*$w/8)."], ${reg}${n};\n";
+        }
+    }
+    for(my $n=0; $n < ($desc->{po}||0); $n++){
+        $ptx .= "    \@po${n} st.global.b8 [ptr+".($n*8)."], ${n};\n";
+    }
+    $ptx .= "}\n";
+    fprint($desc->{outfile}, $ptx);
+}
+
+my $ver  = "5.0"; # NOTE(review): presumably becomes the {ver} field read by gen_ptx -- confirm
+my $arch = shift(@ARGV); # first command-line argument
+my $dir  = shift(@ARGV); # second command-line argument
+
+#shorthands: array refs of suffix strings that the @entries table below combines
+my $us8    = ["u8","s8"];
+my $bus8   = ["b8",@$us8]; # $us8 plus "b8"
+my $us16   = ["u16","s16"];
+my $bus16  = ["b16",@$us16]; # $us16 plus "b16"
+my $us32   = ["u32","s32"];
+my $bus32  = ["b32",@$us32]; # $us32 plus "b32"
+my $fus32  = ["f32",@$us32]; # $us32 plus "f32"
+my $busf32 = ["f32",@$bus32]; # $bus32 plus "f32"
+my $us64   = ["u64","s64"];
+my $bus64  = ["b64",@$us64]; # $us64 plus "b64"
+my $busf64 = ["f64",@$bus64]; # $bus64 plus "f64"
+my @types  = (@$bus8,@$bus16,@$busf32,@$busf64,"b128"); # flat list (not a ref) of the type suffixes above plus "b128"
+my $frnd   = ["rn","rz","rm","rp"]; # rounding suffixes used by the floating-point entries
+my $irnd   = ["rni","rzi","rmi","rpi"]; # the $frnd values with an "i" appended
+my $bcmp   = ["eq", "ne"]; # comparisons paired with the b16/b32/b64 set and setp entries
+my $scmp   = ["eq", "ne", "lt", "le", "gt", "ge"]; # comparisons paired with the s16/s32/s64 entries
+my $ucmp   = ["lo", "ls", "hi", "hs"]; # comparisons paired with the u16/u32/u64 entries
+my $fcmp   = ["equ", "neu", "ltu", "leu", "gtu", "geu", "num", "nan"]; # comparisons paired with the f32/f64 entries
+my $bool   = ["and","or","xor"]; # used as a set/setp suffix and as the opcode list of the logic entries
+my $ftz    = ["ftz",""]; # NOTE(review): "" presumably selects the variant without the suffix -- confirm
+my $sat    = ["sat",""]; # NOTE(review): "" presumably selects the variant without the suffix -- confirm
+my $shclamp= ["clamp","wrap"]; # suffixes of the shf entry
+my $lcop   = ["ca","cg","cs"]; # suffixes of the ld.global ... nc entries
+my $lcopv  = [@$lcop,"lu","cv"]; # $lcop plus "lu" and "cv", used by the plain ld entries
+my $scop   = ["wb","cg","cs","wt"];
+
+# TODO:
+# try to generate instructions outside PTX ISA (add shl, cmem load-exe)
+# conditional
+# immediates
+# cmem
+# lmem
+# smem
+# ?offsets
+my @entries=(
+    # Integer Arithmetic Instructions
+    [["mul"],["hi","lo"],$us16,["ARGS"],["r16o"],["r16i"],["r16i"]],
+    [["mul","mul24"],["hi","lo"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["mul"],["hi","lo"],$us64,["ARGS"],["r64o"],["r64i"],["r64i"]],
+    [["mul"],["wide"],$us16,["ARGS"],["r32o"],["r16i"],["r16i"]],
+    [["mul"],["wide"],$us32,["ARGS"],["r64o"],["r32i"],["r32i"]],
+    [["mad"],["hi","lo"],$us16,["ARGS"],["r16o"],["r16i"],["r16i"],["r16i"]],
+    [["mad","mad24"],["hi","lo"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]],
+    [["mad","mad24"],["hi"],["sat"],["s32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]],
+    [["mad"],["hi","lo"],$us64,["ARGS"],["r64o"],["r64i"],["r64i"],["r64i"]],
+    [["mad"],["wide"],$us16,["ARGS"],["r32o"],["r16i"],["r16i"],["r32i"]],
+    [["mad"],["wide"],$us32,["ARGS"],["r64o"],["r32i"],["r32i"],["r64i"]],
+    [["sad"],$us16,["ARGS"],["r16o"],["r16i"],["r16i"],["r16i"]],
+    [["sad"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]],
+    [["sad"],$us64,["ARGS"],["r64o"],["r64i"],["r64i"],["r64i"]],
+    [["add","sub","div","rem","min","max"],$us16,["ARGS"],["r16o"],["r16i"],["r16i"]],
+    [["add","sub","div","rem","min","max"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["add","sub","div","rem","min","max"],$us64,["ARGS"],["r64o"],["r64i"],["r64i"]],
+    [["neg","abs"],["s16"],["ARGS"],["r16o"],["r16i"]],
+    [["neg","abs"],["s32"],["ARGS"],["r32o"],["r32i"]],
+    [["neg","abs"],["s64"],["ARGS"],["r64o"],["r64i"]],
+    [["popc"],["b32"],["ARGS"],["r32o"],["r32i"]],
+    [["popc"],["b64"],["ARGS"],["r32o"],["r64i"]],
+    [["clz"],["b32"],["ARGS"],["r32o"],["r32i"]],
+    [["clz"],["b64"],["ARGS"],["r32o"],["r64i"]],
+    [["bfind"],["shiftamt",""],[@$us32],["ARGS"],["r32o"],["r32i"]],
+    [["bfind"],["shiftamt",""],[@$us64],["ARGS"],["r32o"],["r64i"]],
+    [["brev"],["b32"],["ARGS"],["r32o"],["r32i"]],
+    [["brev"],["b64"],["ARGS"],["r64o"],["r64i"]],
+    [["bfe"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]],
+    [["bfe"],$us64,["ARGS"],["r64o"],["r64i"],["r32i"],["r32i"]],
+    [["bfi"],["b32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"],["r32i"]],
+    [["bfi"],["b64"],["ARGS"],["r64o"],["r64i"],["r64i"],["r32i"],["r32i"]],
+    # Extended-Precision Arithmetic Instructions
+    [["add","addc","sub","subc"],["cc",""],$us32,["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["mad","madc"],["hi","lo"],["cc"],$us32,["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]],
+    # Floating-Point Instructions
+    [["testp"],["finite","infinite","number","notanumber","normal","subnormal"],["f32"],["ARGS"],["po"],["r32i"]],
+    [["testp"],["finite","infinite","number","notanumber","normal","subnormal"],["f64"],["ARGS"],["po"],["r64i"]],
+    [["copysign"],["f32"],["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["copysign"],["f64"],["ARGS"],["r64o"],["r64i"],["r64i"]],
+    [["add","sub","mul"],$frnd,$ftz,$sat,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["add","sub","mul"],$frnd,["f64"],["ARGS"],["r64o"],["r64i"],["r64i"]],
+    [["min","max"],$ftz,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["min","max"],["f64"],["ARGS"],["r64o"],["r64i"],["r64i"]],
+    [["fma","mad"],$frnd,$ftz,$sat,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]],
+    [["fma","mad"],$frnd,["f64"],["ARGS"],["r64o"],["r64i"],["r64i"],["r64i"]],
+    [["div"],["approx","full",@$frnd],$ftz,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["div"],$frnd,["f64"],["ARGS"],["r64o"],["r64i"],["r64i"]],
+    [["neg","abs"],$ftz,["f32"],["ARGS"],["r32o"],["r32i"]],
+    [["neg","abs"],["f64"],["ARGS"],["r64o"],["r64i"]],
+    [["rcp","sqrt"],["approx",@$frnd],$ftz,["f32"],["ARGS"],["r32o"],["r32i"]],
+    [["rcp"],$frnd,["ftz"],["f64"],["ARGS"],["r64o"],["r64i"]],
+    [["sqrt"],$frnd,["f64"],["ARGS"],["r64o"],["r64i"]],
+    [["rcp"],["approx"],["ftz"],["f64"],["ARGS"],["r64o"],["r64i"]],
+    [["sqrt"],$frnd,["f64"],["ARGS"],["r64o"],["r64i"]],
+    [["rsqrt","sin","cos","lg2","ex2"],["approx"],$ftz,["f32"],["ARGS"],["r32o"],["r32i"]],
+    [["rsqrt"],["approx"],["f64"],["ARGS"],["r64o"],["r64i"]],
+    # Comparison and Selection Instructions:
+    [["set"],$bcmp,$fus32,["b16"],["ARGS"],["r32o"],["r16i"],["r16i"]],
+    [["set"],$ucmp,$fus32,["u16"],["ARGS"],["r32o"],["r16i"],["r16i"]],
+    [["set"],$scmp,$fus32,["s16"],["ARGS"],["r32o"],["r16i"],["r16i"]],
+    [["set"],$bcmp,$fus32,["b32"],["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["set"],$ucmp,$fus32,["u32"],["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["set"],$scmp,$fus32,["s32"],["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["set"],$fcmp,$ftz,$fus32,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["set"],$bcmp,$fus32,["b64"],["ARGS"],["r32o"],["r64i"],["r64i"]],
+    [["set"],$ucmp,$fus32,["u64"],["ARGS"],["r32o"],["r64i"],["r64i"]],
+    [["set"],$scmp,$fus32,["s64"],["ARGS"],["r32o"],["r64i"],["r64i"]],
+    [["set"],$fcmp,$fus32,["f64"],["ARGS"],["r32o"],["r64i"],["r64i"]],
+    [["set"],$bcmp,$bool,$fus32,["b16"],["ARGS"],["r32o"],["r16i"],["r16i"],["pi"]],
+    [["set"],$ucmp,$bool,$fus32,["u16"],["ARGS"],["r32o"],["r16i"],["r16i"],["pi"]],
+    [["set"],$scmp,$bool,$fus32,["s16"],["ARGS"],["r32o"],["r16i"],["r16i"],["pi"]],
+    [["set"],$bcmp,$bool,$fus32,["b32"],["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]],
+    [["set"],$ucmp,$bool,$fus32,["u32"],["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]],
+    [["set"],$scmp,$bool,$fus32,["s32"],["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]],
+    [["set"],$fcmp,$bool,$ftz,$fus32,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]],
+    [["set"],$bcmp,$bool,$fus32,["b64"],["ARGS"],["r32o"],["r64i"],["r64i"],["pi"]],
+    [["set"],$ucmp,$bool,$fus32,["u64"],["ARGS"],["r32o"],["r64i"],["r64i"],["pi"]],
+    [["set"],$scmp,$bool,$fus32,["s64"],["ARGS"],["r32o"],["r64i"],["r64i"],["pi"]],
+    [["set"],$fcmp,$bool,$fus32,["f64"],["ARGS"],["r32o"],["r64i"],["r64i"],["pi"]],
+    [["setp"],$bcmp,["b16"],["ARGS"],["po"],["r16i"],["r16i"]],
+    [["setp"],$ucmp,["u16"],["ARGS"],["po"],["r16i"],["r16i"]],
+    [["setp"],$scmp,["s16"],["ARGS"],["po"],["r16i"],["r16i"]],
+    [["setp"],$bcmp,["b32"],["ARGS"],["po"],["r32i"],["r32i"]],
+    [["setp"],$ucmp,["u32"],["ARGS"],["po"],["r32i"],["r32i"]],
+    [["setp"],$scmp,["s32"],["ARGS"],["po"],["r32i"],["r32i"]],
+    [["setp"],$fcmp,$ftz,["f32"],["ARGS"],["po"],["r32i"],["r32i"]],
+    [["setp"],$bcmp,["b64"],["ARGS"],["po"],["r64i"],["r64i"]],
+    [["setp"],$ucmp,["u64"],["ARGS"],["po"],["r64i"],["r64i"]],
+    [["setp"],$scmp,["s64"],["ARGS"],["po"],["r64i"],["r64i"]],
+    [["setp"],$fcmp,["f64"],["ARGS"],["po"],["r64i"],["r64i"]],
+    [["setp"],$bcmp,$bool,["b16"],["ARGS"],["po"],["r16i"],["r16i"],["pi"]],
+    [["setp"],$ucmp,$bool,["u16"],["ARGS"],["po"],["r16i"],["r16i"],["pi"]],
+    [["setp"],$scmp,$bool,["s16"],["ARGS"],["po"],["r16i"],["r16i"],["pi"]],
+    [["setp"],$bcmp,$bool,["b32"],["ARGS"],["po"],["r32i"],["r32i"],["pi"]],
+    [["setp"],$ucmp,$bool,["u32"],["ARGS"],["po"],["r32i"],["r32i"],["pi"]],
+    [["setp"],$scmp,$bool,["s32"],["ARGS"],["po"],["r32i"],["r32i"],["pi"]],
+    [["setp"],$fcmp,$bool,$ftz,["f32"],["ARGS"],["po"],["r32i"],["r32i"],["pi"]],
+    [["setp"],$bcmp,$bool,["b64"],["ARGS"],["po"],["r64i"],["r64i"],["pi"]],
+    [["setp"],$ucmp,$bool,["u64"],["ARGS"],["po"],["r64i"],["r64i"],["pi"]],
+    [["setp"],$scmp,$bool,["s64"],["ARGS"],["po"],["r64i"],["r64i"],["pi"]],
+    [["setp"],$fcmp,$bool,["f64"],["ARGS"],["po"],["r64i"],["r64i"],["pi"]],
+    [["selp"],$bus16,["ARGS"],["r16o"],["r16i"],["r16i"],["pi"]],
+    [["selp"],$busf32,["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]],
+    [["selp"],$busf64,["ARGS"],["r64o"],["r64i"],["r64i"],["pi"]],
+    [["slct"],$bus16,["s32"],["ARGS"],["r16o"],["r16i"],["r16i"],["r32i"]],
+    [["slct"],$ftz,$bus16,["f32"],["ARGS"],["r16o"],["r16i"],["r16i"],["r32i"]],
+    [["slct"],$busf32,["s32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]],
+    [["slct"],$ftz,$busf32,["f32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]],
+    [["slct"],$busf64,["s32"],["ARGS"],["r64o"],["r64i"],["r64i"],["r32i"]],
+    [["slct"],$ftz,$busf64,["f32"],["ARGS"],["r64o"],["r64i"],["r64i"],["r32i"]],
+    # Logic and Shift Instructions
+    [$bool,["pred"],["ARGS"],["po"],["pi"],["pi"]],
+    [$bool,["b16"],["ARGS"],["r16o"],["r16i"],["r16i"]],
+    [$bool,["b32"],["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [$bool,["b64"],["ARGS"],["r64o"],["r64i"],["r64i"]],
+    [["not","cnot"],["b16"],["ARGS"],["r16o"],["r16i"]],
+    [["not","cnot"],["b32"],["ARGS"],["r32o"],["r32i"]],
+    [["not","cnot"],["b64"],["ARGS"],["r64o"],["r64i"]],
+    [["shf"],["l","r"],$shclamp,["b32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]],
+    [["shl"],["b16"],["ARGS"],["r16o"],["r16i"],["r32i"]],
+    [["shl"],["b32"],["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["shl"],["b64"],["ARGS"],["r64o"],["r64i"],["r32i"]],
+    [["shr"],$bus16,["ARGS"],["r16o"],["r16i"],["r32i"]],
+    [["shr"],$bus32,["ARGS"],["r32o"],["r32i"],["r32i"]],
+    [["shr"],$bus64,["ARGS"],["r64o"],["r64i"],["r32i"]],
+    # Data Movement and Conversion Instructions
+    [["mov"],["pred"],["ARGS"],["po"],["pi"]],
+    [["mov"],$bus16,["ARGS"],["r16o"],["r16i"]],
+    [["mov"],$bus32,["ARGS"],["r32o"],["r32i"]],
+    [["mov"],$busf32,["ARGS"],["r32o"],["r32i"]],
+    [["mov"],$busf64,["ARGS"],["r64o"],["r64i"]],
+    # TODO: vector, sreg
+    [["shfl"],["up","down","bfly","idx"],["b32"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]],
+    [["prmt"],["b32"],["f4e","b4e","rc8","ecl","ecr","rc16"],["ARGS"],["r32o"],["r32i"],["r32i"],["r32i"]],
+    [["ld"],["const","global","local","param","shared"],$lcopv,$bus8,["ARGS"],["r8o"],["ptr"]],
+    [["ld"],["const","global","local","param","shared"],$lcopv,$bus16,["ARGS"],["r16o"],["ptr"]],
+    [["ld"],["const","global","local","param","shared"],$lcopv,$busf32,["ARGS"],["r32o"],["ptr"]],
+    [["ld"],["const","global","local","param","shared"],$lcopv,$busf64,["ARGS"],["r64o"],["ptr"]],
+    [["ld"],["volatile"],["global","shared"],$bus8,["ARGS"],["r8o"],["ptr"]],
+    [["ld"],["volatile"],["global","shared"],$bus16,["ARGS"],["r16o"],["ptr"]],
+    [["ld"],["volatile"],["global","shared"],$busf32,["ARGS"],["r32o"],["ptr"]],
+    [["ld"],["volatile"],["global","shared"],$busf64,["ARGS"],["r64o"],["ptr"]],
+    # TODO: vector ld
+    [["ld"],["global"],$lcop,["nc"],$bus8,["ARGS"],["r8o"],["ptr"]],
+    [["ld"],["global"],$lcop,["nc"],$bus16,["ARGS"],["r16o"],["ptr"]],
+    [["ld"],["global"],$lcop,["nc"],$busf32,["ARGS"],["r32o"],["ptr"]],
+    [["ld"],["global"],$lcop,["nc"],$busf64,["ARGS"],["r64o"],["ptr"]],
+    [["ldu"],["global"],$bus8,["ARGS"],["r8o"],["ptr"]],
+    [["ldu"],["global"],$bus16,["ARGS"],["r16o"],["ptr"]],
+    [["ldu"],["global"],$busf32,["ARGS"],["r32o"],["ptr"]],
+    [["ldu"],["global"],$busf64,["ARGS"],["r64o"],["ptr"]],
+    [["st"],["global","local","param","shared"],$scop,$bus8,["ARGS"],["ptr"],["r8i"]],
+    [["st"],["global","local","param","shared"],$scop,$bus16,["ARGS"],["ptr"],["r16i"]],
+    [["st"],["global","local","param","shared"],$scop,$busf32,["ARGS"],["ptr"],["r32i"]],
+    [["st"],["global","local","param","shared"],$scop,$busf64,["ARGS"],["ptr"],["r64i"]],
+    [["st"],["global","local","param","shared"],$scop,$bus8,["ARGS"],["ptr"],["r8i"]],
+    [["st"],["volatile"],["global","shared"],$bus8,["ARGS"],["ptr"],["r8i"]],
+    [["st"],["volatile"],["global","shared"],$bus16,["ARGS"],["ptr"],["r16i"]],
+    [["st"],["volatile"],["global","shared"],$busf32,["ARGS"],["ptr"],["r32i"]],
+    [["st"],["volatile"],["global","shared"],$busf64,["ARGS"],["ptr"],["r64i"]],
+    [["prefetch"],["L1","L2"],["global","local"],["ARGS"],["ptr"]],
+    [["prefetchu"],["L1"],["ARGS"],["ptr"]],
+    [["isspacep"],["global","local","const","shared"],["ARGS"],["po"],["r32i"]],
+    [["cvta"],["global","local","const","shared"],["u32"],["ARGS"],["r32o"],["r32i"]],
+    [["cvta"],["global","local","const","shared"],["u64"],["ARGS"],["r64o"],["r64i"]],
+    # NOTE: skipped most of cvts
+    [["cvt"],$us8,$us32,["ARGS"],["r8o"],["r32i"]],
+    [["cvt"],$irnd,$ftz,$sat,$us32,["f32"],["ARGS"],["r32o"],["r32i"]],
+    [["cvt"],$frnd,$ftz,$sat,["f32"],$us32,["ARGS"],["r32o"],["r32i"]],
+    # TODO: Texture Instructions
+    # TODO: Surface Instructions
+    # TODO: Control Flow Instructions
+    # Parallel Synchronization and Communication Instructions
+    [["bar"],["sync","arrive"],["ARGS"],["r32o"],["r32i"]],
+    [["bar"],["red"],["popc"],["u32"],["ARGS"],["r32o"],["r32i"],["r32i"],["pi"]],
+    [["bar"],["red"],["and","or"],["pred"],["ARGS"],["po"],["r32i"],["r32i"],["pi"]],
+    [["membar"],["cta","gl","sys"]],
+    [["atom"],["global","shared"],["and","or","xor","exch"],["b32"],["ARGS"],["r32o"],["ptr"],["r32i"]],
+    [["atom"],["global","shared"],["cas"],["b32"],["ARGS"],["r32o"],["ptr"],["r32i"],["r32i"]],
+    [["atom"],["global","shared"],["add"],$fus32,["ARGS"],["r32o"],["ptr"],["r32i"]],
+    [["atom"],["global","shared"],["inc","dec"],["u32"],["ARGS"],["r32o"],["ptr"],["r32i"]],
+    [["atom"],["global","shared"],["min","max"],$us32,["ARGS"],["r32o"],["ptr"],["r32i"]],
+    [["atom"],["global","shared"],["and","or","xor","exch"],["b64"],["ARGS"],["r64o"],["ptr"],["r64i"]],
+    [["atom"],["global","shared"],["cas"],["b64"],["ARGS"],["r64o"],["ptr"],["r64i"],["r64i"]],
+    [["atom"],["global","shared"],["min","max"],$us64,["ARGS"],["r64o"],["ptr"],["r64i"]],
+    [["atom"],["global","shared"],["add"],["u64"],["ARGS"],["r64o"],["ptr"],["r64i"]],
+    [["vote"],["all","any","uni"],["pred"],["ARGS"],["po"],["pi"]],
+    [["vote"],["ballot"],["b32"],["ARGS"],["r32o"],["pi"]],
+);
+
+# Turn every instruction template in @entries into concrete variants and hand
+# one descriptor per variant to gen_ptx().  cartesian(@$entry) supplies the
+# variants as array refs of fields (presumably every combination of the
+# template's alternatives).  Fields before the "ARGS" marker form the opcode,
+# joined with "."; fields after it are operands, each suffixed with a running
+# per-kind index ("r32i0", "r32i1", ...), except "ptr", which is emitted as
+# "[ptr]".  Empty fields are skipped.  The output file name is $dir followed
+# by every used field joined with "_", plus ".ptx".
+foreach my $entry (@entries){
+    foreach my $op_desc (cartesian(@$entry)){
+        my %desc = (ver=>$ver,arch=>$arch,dir=>$dir);
+        my (@name_parts, @opcode_parts, @operands);
+        # Becomes true once the "ARGS" marker has been passed.
+        my $in_args = 0;
+        foreach my $field (@$op_desc){
+            next if $field eq "";
+            if($field eq "ARGS"){
+                $in_args = 1;
+                next;
+            }
+            # Record the data type; when several fields are types the last wins.
+            $desc{type} = $field if grep { $_ eq $field } @types;
+            push @name_parts, $field;
+            if(!$in_args){
+                push @opcode_parts, $field;
+            }
+            elsif($field eq "ptr"){
+                push @operands, " [ptr]";
+            }
+            else{
+                # The per-kind counters live in %desc itself, so gen_ptx() also
+                # receives them (NOTE(review): presumably it relies on that).
+                push @operands, " ".$field.($desc{$field}++);
+            }
+        }
+        # join() rather than chop(): a template whose "ARGS" marker is followed
+        # by no operands no longer loses the last character of its opcode.
+        $desc{insn} = join(".",@opcode_parts).join(",",@operands).";";
+        $desc{outfile} = $dir.join("_",@name_parts).".ptx";
+        gen_ptx(\%desc);
+    }
+}
diff --git a/Solver/src/test.cu b/Solver/src/test.cu
new file mode 100644
index 0000000..cf0a568
--- /dev/null
+++ b/Solver/src/test.cu
@@ -0,0 +1,4 @@
+// Kernel stub: the body is an empty loop that never exits; a and b are unused.
+__global__ void test(float& a, float& b) {
+  while (1) { }
+}
diff --git a/Solver/src/unique.py b/Solver/src/unique.py
new file mode 100644
index 0000000..36df1a7
--- /dev/null
+++ b/Solver/src/unique.py
@@ -0,0 +1,14 @@
+from sets import Set
+from inst import Inst
+import subprocess 
+import sys
+
+if __name__ == "__main__":
+    opset = Set([])
+    with open(sys.argv[1]) as f:
+        for line in f:
+            field = line.split()
+            inst = Inst(field, False)
+            if not inst.op() in opset:
+                opset.add(inst.op())
+                sys.stdout.write(line)